import * as cheerio from 'cheerio';
import fs from 'fs';
import { createObjectCsvWriter } from 'csv-writer';

// ---------------------------------------------------------------------------
// Type definitions
// ---------------------------------------------------------------------------

interface Arc {
  id: string;
  name: string;
  frName: string | null;
  startChapter: number;
  /** `null` while the arc is still ongoing ("Current" on the wiki). */
  endChapter: number | null;
  url: string;
}

interface Character {
  id: string;
  name: string;
  frName: string | null;
  gender: string | null;
  age: number | null;
  /** Height in meters (centimeter values are converted by `extractHeight`). */
  height: number | null;
  origin: string | null;
  frOrigin: string | null;
  devilFruitId: string | null;
  devilFruitUrl: string | null;
  affiliations: string[];
  frAffiliations: string[] | null;
  bounty: number | null;
  hakiObservation: boolean;
  hakiArmament: boolean;
  hakiConqueror: boolean;
  epithets: string[];
  frEpithets: string[] | null;
  firstAppearance: number;
  status: string | null;
  pictureUrl: string | null;
  url: string;
  frUrl: string | null;
  arcId: string;
}

interface CharacterListItem {
  name: string;
  url: string;
  chapter: number;
}

interface DevilFruitData {
  devilFruitId: string;
  devilFruitUrl: string;
}

interface DevilFruit {
  id: string;
  name: string;
  type: string | null;
  url: string;
}

/** One entry of `parse.langlinks` in the API response. */
interface LangLink {
  lang: string;
  ['*']: string;
  url: string;
}

/** The subset of the `action=parse` JSON response that this script reads. */
interface ParseApiResponse {
  parse?: {
    title?: string;
    text?: { ['*']?: string };
    langlinks?: LangLink[];
    categories?: { ['*']?: string }[];
  };
}

type OutputFormat = 'json' | 'csv' | 'all';

// ---------------------------------------------------------------------------
// Configuration
// ---------------------------------------------------------------------------

const FANDOM_API_BASE =
  'https://onepiece.fandom.com/api.php?action=parse&redirects=true&format=json&page=';
const FR_FANDOM_API_BASE =
  'https://onepiece.fandom.com/fr/api.php?action=parse&redirects=true&format=json&page=';
const FR_WIKI_URL_PREFIX = 'https://onepiece.fandom.com/fr/wiki/';
const OUTPUT_DIR = './scraped-data';
const MAX_RETRIES = 0; // Set to 0 to disable retries, can be increased if needed
const INITIAL_RETRY_DELAY = 1000;
const FETCH_CONCURRENCY = 50;
// Upper bound on how many times the list of failed characters is re-attempted.
// Without a bound, a character that fails deterministically loops forever.
const MAX_CHARACTER_PASSES = 3;

const DEFAULT_HEADERS: Record<string, string> = {
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:150.0) Firefox/150.0',
  'Accept-Language': 'en-US,en;q=0.9',
  'Accept-Encoding': 'gzip, deflate, br',
  Connection: 'keep-alive'
};

// Create output directory (`recursive: true` makes this a no-op when it already exists).
fs.mkdirSync(OUTPUT_DIR, { recursive: true });

// ---------------------------------------------------------------------------
// Generic helpers
// ---------------------------------------------------------------------------

/** Returns a printable message for any thrown value. */
function errorMessage(error: unknown): string {
  return error instanceof Error ? error.message : String(error);
}

/** Resolves after `ms` milliseconds. */
function sleep(ms: number): Promise<void> {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

/**
 * Runs `worker` over `items` in consecutive batches of `batchSize`, awaiting each
 * batch before starting the next, and yields the results one batch at a time.
 */
async function* inBatches<T, R>(
  items: readonly T[],
  batchSize: number,
  worker: (item: T) => Promise<R>
): AsyncGenerator<R[]> {
  for (let start = 0; start < items.length; start += batchSize) {
    const batch = items.slice(start, start + batchSize);
    yield await Promise.all(batch.map((item) => worker(item)));
  }
}

/**
 * Fetch a URL, retrying with exponential backoff on non-2xx responses and on
 * network errors.
 *
 * @param url - Absolute URL to request.
 * @param options - Extra `fetch` options; its headers are merged over the defaults.
 * @param retries - Number of attempts already used (callers normally omit it).
 * @returns The first response whose status is in the 200-299 range.
 * @throws The last HTTP or network error once `MAX_RETRIES` is exhausted.
 */
async function fetchWithRetry(
  url: string,
  options: RequestInit = {},
  retries: number = 0
): Promise<Response> {
  // Merge through `Headers` so every `HeadersInit` form (object, array, Headers)
  // is honoured and caller-supplied headers override defaults case-insensitively.
  const headers = new Headers(DEFAULT_HEADERS);
  new Headers(options.headers).forEach((value, key) => headers.set(key, value));

  for (let attempt = retries; ; attempt++) {
    let failure: Error;
    let description: string;

    try {
      // `headers` goes last: spreading `options` after it would replace the merged set.
      const response = await fetch(url, { ...options, headers });
      if (response.ok) {
        return response;
      }
      failure = new Error(`HTTP ${response.status}: ${response.statusText}`);
      description = `HTTP ${response.status} for ${url}`;
    } catch (error) {
      failure = error instanceof Error ? error : new Error(String(error));
      description = `Network error: ${failure.message}`;
    }

    if (attempt >= MAX_RETRIES) {
      throw failure;
    }

    const delay = INITIAL_RETRY_DELAY * Math.pow(2, attempt);
    console.log(`⚠️ ${description}, retrying in ${delay}ms...`);
    await sleep(delay);
  }
}

/** Fetches `page` through the given parse-API base URL and returns the decoded JSON. */
async function fetchParsedPage(apiBase: string, page: string): Promise<ParseApiResponse> {
  const response = await fetchWithRetry(`${apiBase}${page}`);
  return (await response.json()) as ParseApiResponse;
}

/**
 * Returns the rendered HTML of a parse-API response.
 *
 * @throws Error when the response carries no HTML.
 */
function requireHtml(data: ParseApiResponse): string {
  const html = data.parse?.text?.['*'];
  if (!html) {
    throw new Error('Unable to extract HTML content from API response');
  }
  return html;
}

/**
 * Get the French link from the API response links array.
 *
 * @param links - The `parse.langlinks` array.
 * @returns The URL of the entry whose `lang` is `"fr"`, or `null` if there is none.
 */
function getFrLink(links: { lang: string; ['*']: string; url: string }[]): { url: string } | null {
  const french = links.find((link) => link.lang === 'fr');
  return french ? { url: french.url } : null;
}

/**
 * Build an identifier from a page name: percent-decodes it, applies Unicode NFD
 * normalization, removes `, : . ( )`, replaces whitespace runs with `_`, and lower-cases.
 *
 * A malformed percent escape is left undecoded rather than throwing.
 */
function normalizeId(str: string): string {
  let decoded = str;
  try {
    decoded = decodeURIComponent(str);
  } catch {
    // Malformed escape sequence: fall back to the raw text.
  }
  return decoded
    .normalize('NFD')
    .replace(/[,:.()]/g, '')
    .replace(/\s+/g, '_')
    .toLowerCase();
}

/** Finds the arc whose chapter range contains `chapter` (open-ended arcs have no upper bound). */
function findArcForChapter(arcs: readonly Arc[], chapter: number): Arc | undefined {
  return arcs.find(
    (arc) => arc.startChapter <= chapter && (arc.endChapter === null || arc.endChapter >= chapter)
  );
}

// ---------------------------------------------------------------------------
// Arcs
// ---------------------------------------------------------------------------

/**
 * Looks up the French title of an arc page through its language links.
 * Best effort: any failure is logged and yields `null`, so one bad page does not
 * discard the whole arc list.
 */
async function fetchFrenchArcName(arcUrl: string): Promise<string | null> {
  try {
    // Query the page via API (follows redirects) and read the French langlink title.
    const arcPage = await fetchParsedPage(FANDOM_API_BASE, arcUrl);
    const frTitle = arcPage.parse?.langlinks?.find((link) => link.lang === 'fr')?.['*'] || null;
    if (frTitle === null) {
      return null;
    }
    // Drop the word "Arc" so the French name matches the English naming.
    return /\bArc\b/i.test(frTitle) ? frTitle.replace(/\bArc\b/i, '').trim() : frTitle;
  } catch (error) {
    console.warn(`⚠️ Could not resolve French name for ${arcUrl}: ${errorMessage(error)}`);
    return null;
  }
}

/**
 * Fetch all arcs from One Piece fandom using API.
 *
 * @returns The arcs found on the `Chapters_and_Volumes` page, or `[]` if that page
 *          cannot be fetched or parsed.
 */
async function fetchAllArcs(): Promise<Arc[]> {
  try {
    console.log('Fetching arcs list via API...');
    const listing = await fetchParsedPage(FANDOM_API_BASE, 'Chapters_and_Volumes');
    const $ = cheerio.load(requireHtml(listing));

    const arcs: Arc[] = [];
    const seenArcUrls = new Set<string>();

    // Arc rows are table cells whose first link points to a *_Arc page and whose
    // text includes a chapter range.
    for (const element of $('table.wikitable td').toArray()) {
      const cell = $(element);
      const firstLink = cell.find('a').first();
      const href = firstLink.attr('href') ?? '';
      const linkText = firstLink.text().trim();

      if (!href.startsWith('/wiki/') || !/_Arc$/i.test(href)) continue;
      if (!linkText || !/\bArc\b/i.test(linkText)) continue;

      const cleanUrl = href.replace('/wiki/', '');
      if (seenArcUrls.has(cleanUrl)) continue;

      const cellText = cell.text().replace(/\s+/g, ' ').trim();
      const chapterMatch = cellText.match(/Chapters\s+(\d+)\s+to\s+(\d+|Current)/i);
      if (!chapterMatch) continue;

      const [, startText = '', endText = ''] = chapterMatch;

      arcs.push({
        id: normalizeId(cleanUrl).replace(/_arc$/i, ''),
        name: linkText.replace(/\bArc\b/i, '').trim(),
        frName: await fetchFrenchArcName(cleanUrl),
        startChapter: parseInt(startText, 10),
        endChapter: /current/i.test(endText) ? null : parseInt(endText, 10),
        url: cleanUrl
      });
      seenArcUrls.add(cleanUrl);
    }

    console.log(`Found ${arcs.length} arcs.`);
    return arcs;
  } catch (error) {
    console.error('Error fetching arcs list:', errorMessage(error));
    return [];
  }
}

/**
 * Save arcs to JSON
 */
async function saveArcsToJSON(arcs: Arc[]): Promise<void> {
  const filepath = `${OUTPUT_DIR}/arcs.json`;
  fs.writeFileSync(filepath, JSON.stringify(arcs, null, 2));
  console.log(`✓ Saved to ${filepath}`);
}

/**
 * Save arcs to CSV
 */
async function saveArcsToCSV(arcs: Arc[]): Promise<void> {
  const filepath = `${OUTPUT_DIR}/arcs.csv`;
  const csvWriter = createObjectCsvWriter({
    path: filepath,
    header: [
      { id: 'id', title: 'ID' },
      { id: 'name', title: 'Name' },
      { id: 'frName', title: 'French Name' },
      { id: 'startChapter', title: 'Start Chapter' },
      { id: 'endChapter', title: 'End Chapter' },
      { id: 'url', title: 'URL' }
    ]
  });

  // Nullable fields become empty cells.
  const records = arcs.map((arc) => ({
    id: arc.id,
    name: arc.name,
    frName: arc.frName ?? '',
    startChapter: arc.startChapter,
    endChapter: arc.endChapter ?? '',
    url: arc.url
  }));

  await csvWriter.writeRecords(records);
  console.log(`✓ Saved to ${filepath}`);
}

// ---------------------------------------------------------------------------
// Characters
// ---------------------------------------------------------------------------

/**
 * Fetch all canon characters from One Piece fandom using API.
 *
 * @returns One entry per table row that has a character link and a non-zero
 *          chapter number, or `[]` if the list page cannot be fetched or parsed.
 */
async function fetchAllCharactersUrl(): Promise<CharacterListItem[]> {
  try {
    console.log('Fetching character list via API...');
    const listing = await fetchParsedPage(FANDOM_API_BASE, 'List_of_Canon_Characters');
    const $ = cheerio.load(requireHtml(listing));

    const characters: CharacterListItem[] = [];
    // The first matched row is the header row.
    const rows = $('table.fandom-table tbody tr').toArray().slice(1);

    for (const element of rows) {
      const row = $(element);
      // Name and href both come from the first anchor so they always describe the same page.
      const link = row.find('td:nth-child(2) a').first();
      const href = link.attr('href');

      // Drop parenthesised notes (e.g. "1 (flashback)" becomes "1"), then keep digits only.
      const chapterDigits = row
        .find('td:nth-child(3)')
        .text()
        .trim()
        .replace(/\([^)]*\)/g, '')
        .replace(/\D/g, '');

      // Rows without a chapter number are skipped.
      if (!chapterDigits) continue;
      const chapter = parseInt(chapterDigits, 10);
      if (chapter === 0) continue;
      if (!href) continue;

      characters.push({
        name: link.text().trim(),
        url: href.replace('/wiki/', ''),
        chapter
      });
    }

    console.log(`Found ${characters.length} characters.`);
    return characters;
  } catch (error) {
    console.error('Error fetching character list:', errorMessage(error));
    return [];
  }
}

/**
 * Fetch character data from fandom using provided URL.
 *
 * @param characterUrl - Page name of the character (the part after `/wiki/`).
 * @param characterName - Display name taken from the character list.
 * @param characterChapter - Chapter of first appearance taken from the character list.
 * @param arcsList - Arcs used to map the first appearance to an arc.
 * @returns The scraped character, or `null` when no arc covers `characterChapter`
 *          or when fetching/parsing fails (the error is logged).
 */
async function fetchCharacter(
  characterUrl: string,
  characterName: string,
  characterChapter: number,
  arcsList: Arc[]
): Promise<Character | null> {
  // The arc depends only on the inputs, so resolve it before spending any requests.
  const arc = findArcForChapter(arcsList, characterChapter);
  if (!arc) {
    return null;
  }

  try {
    console.log(`Fetching: ${characterName}...`);

    const page = await fetchParsedPage(FANDOM_API_BASE, characterUrl);
    const categories = page.parse?.categories ?? [];
    const $ = cheerio.load(requireHtml(page));

    // Gender: the first gender category listed wins.
    const genderCategory = categories
      .map((cat) => cat['*'] ?? '')
      .find((catName) => catName === 'Male_Characters' || catName === 'Female_Characters');
    const gender =
      genderCategory === undefined ? null : genderCategory === 'Male_Characters' ? 'Male' : 'Female';

    // Haki flags come from the page categories.
    const categoryNames = new Set(categories.map((cat) => cat['*'] ?? ''));

    const affiliations = await extractAffiliations($, 'en');
    const devilFruitData = await extractDevilFruit($);

    // French counterpart, when the page has a French language link.
    const frLink = getFrLink(page.parse?.langlinks ?? []);
    const frUrl = frLink ? frLink.url.replace(FR_WIKI_URL_PREFIX, '') : null;
    const frPage = frUrl ? await fetchParsedPage(FR_FANDOM_API_BASE, frUrl) : null;
    // Parse the French HTML once and reuse it for every French field.
    const fr$ = frPage ? cheerio.load(frPage.parse?.text?.['*'] ?? '') : null;

    let frName: string | null = frPage?.parse?.title || null;
    // When the list name differs from the page title, the list name is used as
    // the French name as well.
    if (characterName !== page.parse?.title) {
      frName = characterName;
    }

    return {
      // ID is derived from URL + name combination.
      id: normalizeId(characterUrl + '_' + characterName),
      name: characterName,
      frName,
      gender,
      age: extractAge($),
      height: extractHeight($),
      origin: extractOrigin($),
      frOrigin: fr$ ? extractOrigin(fr$) : null,
      devilFruitId: devilFruitData?.devilFruitId ?? null,
      devilFruitUrl: devilFruitData?.devilFruitUrl ?? null,
      affiliations,
      frAffiliations: fr$ ? await extractAffiliations(fr$, 'fr') : null,
      bounty: extractBounty($),
      hakiObservation: categoryNames.has('Observation_Haki_Users'),
      hakiArmament: categoryNames.has('Armament_Haki_Users'),
      hakiConqueror: categoryNames.has('Supreme_King_Haki_Users'),
      epithets: extractEpithets($),
      frEpithets: fr$ ? extractEpithets(fr$) : null,
      firstAppearance: characterChapter,
      arcId: arc.id,
      status: extractStatus($),
      pictureUrl: 'Image_Non_Disponible',
      url: characterUrl,
      frUrl
    };
  } catch (error) {
    console.error(`Error fetching ${characterName}:`, errorMessage(error));
    return null;
  }
}

// ---------------------------------------------------------------------------
// Infobox extraction
// ---------------------------------------------------------------------------

/**
 * Reads an infobox field as a list of text lines.
 *
 * Citation markers (`<sup>`) are removed, every `<br>` becomes a line break, and
 * the remaining markup is reduced to text. Blank lines are dropped.
 *
 * @returns The non-empty lines in document order; `[]` when the field is missing.
 */
function readInfoboxLines($: cheerio.CheerioAPI, selector: string): string[] {
  const field = $(selector).first();
  if (field.length === 0) return [];

  const copy = field.clone();
  copy.find('sup').remove();
  copy.find('br').replaceWith('\n');

  return copy
    .text()
    .split('\n')
    .map((line) => line.replace(/\s+/g, ' ').trim())
    .filter(Boolean);
}

/** Removes every parenthesised note, e.g. "19 (after timeskip)" becomes "19". */
function stripParenthesised(text: string): string {
  return text.replace(/\([^)]*\)/g, '').trim();
}

/**
 * Extract age from infobox: the digits of the last listed value, ignoring
 * parenthesised notes.
 *
 * @returns The age, or `null` when the field is missing or has no non-zero number.
 */
function extractAge($: cheerio.CheerioAPI): number | null {
  const lines = readInfoboxLines($, '[data-source="age"] .pi-data-value');
  const lastLine = lines[lines.length - 1];
  if (!lastLine) return null;

  const digitsOnly = stripParenthesised(lastLine).replace(/\D/g, '');
  return parseInt(digitsOnly, 10) || null;
}

// Page titles already resolved, keyed by API URL, so an affiliation shared by many
// characters is fetched once instead of once per character.
const pageTitleCache = new Map<string, Promise<string | null>>();

/** Resolves a page name to its title (following redirects); `null` on any failure. */
function resolvePageTitle(apiBase: string, page: string): Promise<string | null> {
  const key = `${apiBase}${page}`;
  const cached = pageTitleCache.get(key);
  if (cached) return cached;

  const pending = fetchParsedPage(apiBase, page)
    .then((data) => data.parse?.title || null)
    .catch(() => null);
  pageTitleCache.set(key, pending);
  // Do not remember failures, so a later character can try the lookup again.
  void pending.then((title) => {
    if (title === null) pageTitleCache.delete(key);
  });
  return pending;
}

/**
 * Extract affiliations from infobox.
 *
 * Linked affiliations are resolved to the title of the linked page (falling back
 * to the link text when the lookup fails). If the field has no links, its text
 * is split on newlines, semicolons and commas.
 *
 * @param lang - `'fr'` selects the French API; any other value selects the English one.
 */
async function extractAffiliations($: cheerio.CheerioAPI, lang: string): Promise<string[]> {
  const field = $('[data-source="affiliation"] .pi-data-value');
  if (field.length === 0) return [];

  const cleaned = field.clone();
  cleaned.find('sup').remove();
  if (!cleaned.html()) return [];

  const apiBase = lang === 'fr' ? FR_FANDOM_API_BASE : FANDOM_API_BASE;
  const links = cleaned.find('a').toArray();

  if (links.length > 0) {
    const names = await Promise.all(
      links.map(async (el) => {
        const anchor = $(el);
        const page = (anchor.attr('href') ?? '').replace('/fr/wiki/', '').replace('/wiki/', '');
        const title = page ? await resolvePageTitle(apiBase, page) : null;
        return title || anchor.text().trim();
      })
    );
    const unique = Array.from(new Set(names.filter(Boolean)));
    if (unique.length > 0) {
      return unique;
    }
  }

  // Fallback to parsing text
  return cleaned
    .text()
    .trim()
    .split(/\s*\n\s*|\s*;\s*|\s*,\s*/)
    .filter(Boolean);
}

/**
 * Extract epithets from infobox.
 * Handles both quoted and unquoted epithets: a quoted part is preferred, otherwise
 * the text before the first `;` or `(` is kept. Duplicates are removed.
 */
function extractEpithets($: cheerio.CheerioAPI): string[] {
  const lines = readInfoboxLines($, '[data-source="epithet"] .pi-data-value');

  const epithets = lines
    .map((line) => {
      // Prefer explicit quoted epithet if present.
      const quoted = line.match(/["«“](.*?)["»”]/);
      if (quoted?.[1]) {
        return quoted[1].trim();
      }
      // Otherwise keep only the base epithet text before extra notes/translations.
      const base = line.split(/[;(]/)[0] ?? '';
      return base.replace(/["'«»“”]/g, '').trim();
    })
    .filter(Boolean);

  return Array.from(new Set(epithets));
}

/**
 * Extract devil fruit from infobox.
 *
 * @returns Both the normalized ID and the page name of the first linked fruit, or
 *          `null` when the field has no `/wiki/` link.
 */
async function extractDevilFruit($: cheerio.CheerioAPI): Promise<DevilFruitData | null> {
  const href = $('[data-source="dfname"] .pi-data-value a').first().attr('href');
  if (!href || !href.startsWith('/wiki/')) return null;

  const cleanUrl = href.replace('/wiki/', '');
  return { devilFruitId: normalizeId(cleanUrl), devilFruitUrl: cleanUrl };
}

/**
 * Extract bounty from infobox.
 *
 * Uses the first listed value; if it contains no digits, the second one is tried.
 *
 * @returns The bounty as an integer, or `0` when the field is missing or has no digits.
 */
function extractBounty($: cheerio.CheerioAPI): number | null {
  const lines = readInfoboxLines($, '[data-source="bounty"] .pi-data-value');
  const [first = '', second = ''] = lines;

  const candidate = /\d/.test(first) ? first : second || first;
  const digitsOnly = candidate.replace(/\D/g, '');
  return digitsOnly ? parseInt(digitsOnly, 10) : 0;
}

/**
 * Extract height from infobox.
 *
 * Picks the last line that looks like a height (falls back to the last line) and
 * returns it in meters, converting centimeter values.
 *
 * @returns Height in meters, or `null` when no `cm`/`m` value can be read.
 */
function extractHeight($: cheerio.CheerioAPI): number | null {
  const lines = readInfoboxLines($, '[data-source="height"] .pi-data-value');

  // Keep only lines that look like a height value, then pick the latest one.
  const heightLines = lines.filter((line) => /\d/.test(line) && /(cm|m)/i.test(line));
  const latestLine = heightLines[heightLines.length - 1] ?? lines[lines.length - 1];
  if (!latestLine) return null;

  // Remove descriptive suffixes like "(post-timeskip)".
  const normalized = stripParenthesised(latestLine).toLowerCase().replace(/\s/g, '');

  // Values are stored in meters in this dataset; "cm" must be tested before "m".
  const cmMatch = normalized.match(/(\d+(?:[.,]\d+)?)cm/);
  if (cmMatch?.[1]) {
    const cm = parseFloat(cmMatch[1].replace(',', '.'));
    return Number.isFinite(cm) ? cm / 100 : null;
  }

  const mMatch = normalized.match(/(\d+(?:[.,]\d+)?)m/);
  if (mMatch?.[1]) {
    const meters = parseFloat(mMatch[1].replace(',', '.'));
    return Number.isFinite(meters) ? meters : null;
  }

  return null;
}

/**
 * Extract origin from infobox (English `origin` or French `origine` field).
 *
 * @returns The first listed value without parenthesised notes, or `null`.
 */
function extractOrigin($: cheerio.CheerioAPI): string | null {
  const lines = readInfoboxLines(
    $,
    '[data-source="origin"] .pi-data-value, [data-source="origine"] .pi-data-value'
  );
  const firstLine = lines[0];
  if (!firstLine) return null;

  return stripParenthesised(firstLine) || null;
}

/**
 * Extract status from infobox.
 *
 * @returns `'Alive'`, `'Dead'` or `'Unknown'`; `null` when the field is missing.
 *          Unrecognised text defaults to `'Alive'`.
 */
function extractStatus($: cheerio.CheerioAPI): string | null {
  const field = $('[data-source="status"] .pi-data-value');
  if (field.length === 0) return null;

  // The text is lower-cased, so every keyword below must be lower-case too.
  const statusText = field.text().trim().toLowerCase();

  if (statusText.includes('alive')) return 'Alive';
  if (statusText.includes('dead') || statusText.includes('deceased')) return 'Dead';
  if (statusText.includes('unknown')) return 'Unknown';
  return 'Alive';
}

/**
 * Save data to JSON
 */
async function saveToJSON(characters: Character[]): Promise<void> {
  const filepath = `${OUTPUT_DIR}/characters.json`;
  fs.writeFileSync(filepath, JSON.stringify(characters, null, 2));
  console.log(`✓ Saved to ${filepath}`);
}

/**
 * Save data to CSV
 */
async function saveToCSV(characters: Character[]): Promise<void> {
  const filepath = `${OUTPUT_DIR}/characters.csv`;
  const csvWriter = createObjectCsvWriter({
    path: filepath,
    header: [
      { id: 'id', title: 'ID' },
      { id: 'name', title: 'Name' },
      { id: 'gender', title: 'Gender' },
      { id: 'age', title: 'Age' },
      { id: 'height', title: 'Height' },
      { id: 'origin', title: 'Origin' },
      { id: 'status', title: 'Status' },
      { id: 'epithets', title: 'Epithets' },
      { id: 'devilFruitId', title: 'Devil Fruit ID' },
      { id: 'affiliations', title: 'Affiliations' },
      { id: 'bounty', title: 'Bounty' },
      { id: 'hakiObservation', title: 'Haki Observation' },
      { id: 'hakiArmament', title: 'Haki Armament' },
      { id: 'hakiConqueror', title: 'Haki Conqueror' },
      { id: 'firstAppearance', title: 'First Appearance' },
      { id: 'arcId', title: 'Arc ID' },
      { id: 'pictureUrl', title: 'Image URL' },
      { id: 'url', title: 'Fandom URL' }
    ]
  });

  // Nullable fields become empty cells, lists are comma-joined, booleans become 1/0.
  const records = characters.map((c) => ({
    id: c.id,
    name: c.name,
    gender: c.gender ?? '',
    age: c.age ?? '',
    height: c.height ?? '',
    origin: c.origin ?? '',
    status: c.status ?? '',
    epithets: c.epithets.join(', '),
    devilFruitId: c.devilFruitId ?? '',
    affiliations: c.affiliations.join(', '),
    bounty: c.bounty ?? 0,
    hakiObservation: c.hakiObservation ? 1 : 0,
    hakiArmament: c.hakiArmament ? 1 : 0,
    hakiConqueror: c.hakiConqueror ? 1 : 0,
    firstAppearance: c.firstAppearance,
    arcId: c.arcId,
    pictureUrl: c.pictureUrl ?? '',
    url: c.url
  }));

  await csvWriter.writeRecords(records);
  console.log(`✓ Saved to ${filepath}`);
}

// ---------------------------------------------------------------------------
// Devil fruits
// ---------------------------------------------------------------------------

// Checked in this order; the first keyword found in any category decides the type.
const DEVIL_FRUIT_TYPES: readonly { keyword: string; type: string }[] = [
  { keyword: 'paramecia', type: 'Paramecia' },
  { keyword: 'zoan', type: 'Zoan' },
  { keyword: 'logia', type: 'Logia' },
  { keyword: 'smile', type: 'Smile' }
];

/**
 * Fetch devil fruit data from fandom using provided URL.
 *
 * @param devilFruitUrl - Page name of the fruit (the part after `/wiki/`).
 * @param devilFruitId - Normalized ID to store on the result.
 * @returns The fruit with its type derived from the page categories, or `null`
 *          when fetching/parsing fails (the error is logged).
 */
async function fetchDevilFruit(
  devilFruitUrl: string,
  devilFruitId: string
): Promise<DevilFruit | null> {
  try {
    console.log(`Fetching devil fruit: ${devilFruitUrl}...`);

    const page = await fetchParsedPage(FANDOM_API_BASE, devilFruitUrl);
    requireHtml(page); // A response without rendered HTML is treated as a failed fetch.

    const categories = (page.parse?.categories ?? []).map((cat) =>
      String(cat['*'] || '').toLowerCase()
    );
    const matched = DEVIL_FRUIT_TYPES.find(({ keyword }) =>
      categories.some((category) => category.includes(keyword))
    );

    return {
      id: devilFruitId,
      name: page.parse?.title || devilFruitId.replace(/_/g, ' '),
      type: matched?.type ?? null,
      url: devilFruitUrl
    };
  } catch (error) {
    console.error(`Error fetching devil fruit ${devilFruitUrl}:`, errorMessage(error));
    return null;
  }
}

/**
 * Save devil fruits to JSON
 */
async function saveDevilFruitsToJSON(devilFruits: DevilFruit[]): Promise<void> {
  const filepath = `${OUTPUT_DIR}/devil-fruits.json`;
  fs.writeFileSync(filepath, JSON.stringify(devilFruits, null, 2));
  console.log(`✓ Saved to ${filepath}`);
}

/**
 * Save devil fruits to CSV
 */
async function saveDevilFruitsToCSV(devilFruits: DevilFruit[]): Promise<void> {
  const filepath = `${OUTPUT_DIR}/devil-fruits.csv`;
  const csvWriter = createObjectCsvWriter({
    path: filepath,
    header: [
      { id: 'id', title: 'ID' },
      { id: 'name', title: 'Name' },
      { id: 'type', title: 'Type' },
      { id: 'url', title: 'URL' }
    ]
  });

  const records = devilFruits.map((df) => ({
    id: df.id,
    name: df.name,
    type: df.type ?? '',
    url: df.url
  }));

  await csvWriter.writeRecords(records);
  console.log(`✓ Saved to ${filepath}`);
}

// ---------------------------------------------------------------------------
// Main
// ---------------------------------------------------------------------------

/** Narrows a command-line argument to one of the supported output formats. */
function isOutputFormat(value: string): value is OutputFormat {
  return value === 'json' || value === 'csv' || value === 'all';
}

/** Prints one scraped character as a table. */
function printCharacter(data: Character): void {
  console.table({
    ID: data.id,
    Name: data.name,
    Gender: data.gender,
    Age: data.age,
    Status: data.status,
    Epithets: data.epithets.join(', '),
    Affiliations: data.affiliations.join(', '),
    DevilFruitId: data.devilFruitId,
    DevilFruitUrl: data.devilFruitUrl,
    HakiObservation: data.hakiObservation ? 'Yes' : 'No',
    HakiArmament: data.hakiArmament ? 'Yes' : 'No',
    HakiConqueror: data.hakiConqueror ? 'Yes' : 'No',
    Height: data.height,
    Bounty: data.bounty,
    Origin: data.origin,
    FirstAppearance: data.firstAppearance,
    pictureUrl: data.pictureUrl,
    FandomURL: data.url
  });
}

/**
 * Main execution: scrapes arcs, then characters, then the devil fruits referenced
 * by those characters, and writes each data set in the requested format.
 *
 * The format is read from the first command-line argument (`json`, `csv` or `all`;
 * defaults to `all`).
 */
async function main(): Promise<void> {
  const format = process.argv[2] ?? 'all'; // json, csv, or all
  if (!isOutputFormat(format)) {
    // Otherwise the whole scrape would run and then save nothing.
    console.error(`Unknown format "${format}". Expected "json", "csv" or "all".`);
    process.exitCode = 1;
    return;
  }
  const wantsJson = format === 'json' || format === 'all';
  const wantsCsv = format === 'csv' || format === 'all';

  console.log(`\nOne Piece Scraper - Mode: ${format}\n`);

  // Step 1: Scraping Arcs
  console.log('=== Step 1: Scraping Arcs ===\n');
  const arcsList = await fetchAllArcs();

  if (arcsList.length === 0) {
    // Every character needs an arc, so nothing useful can be produced without them.
    console.error('No arcs found; characters cannot be assigned to an arc. Exiting.');
    process.exitCode = 1;
    return;
  }

  for (const arc of arcsList) {
    console.table({
      ID: arc.id,
      Name: arc.name,
      FrenchName: arc.frName || '',
      StartChapter: arc.startChapter,
      EndChapter: arc.endChapter || 'Ongoing',
      URL: arc.url
    });
  }
  console.log(`\n✓ Found ${arcsList.length} arcs\n`);

  if (wantsJson) await saveArcsToJSON(arcsList);
  if (wantsCsv) await saveArcsToCSV(arcsList);

  // Step 2: Scraping Characters
  console.log('=== Step 2: Scraping Characters ===\n');
  const characterList = await fetchAllCharactersUrl();

  if (characterList.length === 0) {
    console.error('No characters found. Exiting.');
    return;
  }

  // Characters whose first chapter is outside every arc can never succeed, so
  // they are excluded up front instead of being retried.
  const eligible = characterList.filter(
    (char) => findArcForChapter(arcsList, char.chapter) !== undefined
  );
  const skippedCount = characterList.length - eligible.length;
  if (skippedCount > 0) {
    console.warn(`⚠️ Skipping ${skippedCount} characters whose first chapter matches no arc.`);
  }

  const characters: Character[] = [];
  const devilFruitUrls = new Set<string>();
  let pending: CharacterListItem[] = eligible;

  for (let pass = 1; pending.length > 0 && pass <= MAX_CHARACTER_PASSES; pass++) {
    const failed: CharacterListItem[] = [];
    console.log(`\nFetching ${pending.length} characters...`);

    const batches = inBatches(pending, FETCH_CONCURRENCY, async (char) => ({
      char,
      data: await fetchCharacter(char.url, char.name, char.chapter, arcsList)
    }));

    for await (const batchResults of batches) {
      for (const { char, data } of batchResults) {
        if (!data) {
          failed.push(char);
          continue;
        }
        printCharacter(data);
        if (data.devilFruitUrl) {
          devilFruitUrls.add(data.devilFruitUrl);
        }
        characters.push(data);
      }
    }

    pending = failed;
    if (pending.length > 0 && pass < MAX_CHARACTER_PASSES) {
      console.log(`⚠️ ${pending.length} characters failed. Retrying...`);
    }
  }

  if (pending.length > 0) {
    console.warn(
      `⚠️ Giving up on ${pending.length} characters after ${MAX_CHARACTER_PASSES} passes: ` +
        pending.map((char) => char.name).join(', ')
    );
  }

  console.log(`\n✓ Scraped ${characters.length} characters\n`);
  console.log(`✓ Found ${devilFruitUrls.size} unique devil fruits\n`);

  // Step 3: Scraping Devil Fruits
  console.log('=== Step 3: Scraping Devil Fruits ===\n');

  if (devilFruitUrls.size === 0) {
    console.warn('No devil fruits found from characters, skipping...\n');
  } else {
    const devilFruits: DevilFruit[] = [];
    const batches = inBatches(Array.from(devilFruitUrls), FETCH_CONCURRENCY, (url) =>
      fetchDevilFruit(url, normalizeId(url))
    );

    for await (const batchResults of batches) {
      for (const data of batchResults) {
        if (!data) continue;
        console.table({ ID: data.id, Name: data.name, Type: data.type, URL: data.url });
        devilFruits.push(data);
      }
    }

    console.log(`\n✓ Scraped ${devilFruits.length} devil fruits\n`);

    if (wantsJson) await saveDevilFruitsToJSON(devilFruits);
    if (wantsCsv) await saveDevilFruitsToCSV(devilFruits);
  }

  // Each character's devilFruitId is already normalizeId(devilFruitUrl), which is
  // also the ID given to the scraped fruit, so no re-mapping is needed here.
  if (wantsJson) await saveToJSON(characters);
  if (wantsCsv) await saveToCSV(characters);

  console.log('\n✓ Done!\n');
}

main().catch((error: unknown) => {
  console.error(error);
  process.exitCode = 1;
});