From 66afda5101ac67919e6a1543dd3f25eb90cfc7dc Mon Sep 17 00:00:00 2001 From: whidix Date: Sat, 14 Mar 2026 15:34:30 +0100 Subject: [PATCH] Refactor code structure for improved readability and maintainability --- scripts/import-json.ts | 53 +- scripts/scrape-onepiece.ts | 1533 +++++++++++++++++++----------------- 2 files changed, 802 insertions(+), 784 deletions(-) diff --git a/scripts/import-json.ts b/scripts/import-json.ts index 0383a39..e1947d8 100644 --- a/scripts/import-json.ts +++ b/scripts/import-json.ts @@ -4,6 +4,8 @@ import { sql, eq } from 'drizzle-orm'; import fs from 'fs'; import { arc, character, devilFruit, characterScrapeValidation, type DevilFruitType } from '../src/lib/server/db/schema'; +type Status = 'Alive' | 'Dead' | 'Unknown'; + type ArcRecord = { id: string; name: string; @@ -22,9 +24,11 @@ type DevilFruitRecord = { type CharacterRecord = { id: string; name: string; + frName?: string | null; gender?: string | null; age?: number | null; affiliations?: string[] | string | null; + frAffiliations?: string[] | string | null; devilFruitId?: string | null; hakiObservation?: boolean; hakiArmament?: boolean; @@ -32,12 +36,15 @@ type CharacterRecord = { bounty?: number | null; height?: number | null; origin?: string | null; + frOrigin?: string | null; firstAppearance?: number; pictureUrl?: string | null; epithets?: string[] | string | null; - status?: string | null; + frEpithets?: string[] | string | null; + status?: Status | null; arcId?: string | null; url?: string | null; + frUrl?: string | null; }; const DATABASE_URL = process.env.DATABASE_URL || 'file:local.db'; @@ -86,7 +93,7 @@ function toJsonArray(value: string[] | string | null | undefined): string[] | nu function toDevilFruitType(value: DevilFruitType | string | null | undefined): DevilFruitType | null { if (!value) return null; - if (value === 'Paramecia' || value === 'Zoan' || value === 'Logia' || value === 'Unknown') { + if (value === 'Paramecia' || value === 'Zoan' || value === 'Logia' || value === 'Smile' || value === 'Unknown') { return value; } return 'Unknown'; @@ -115,59 +122,25 @@ function transformCharacterData(item: CharacterRecord) { gender: toNullable(item.gender), age: toNullable(item.age), affiliations: toJsonArray(item.affiliations), + frAffiliations: toJsonArray(item.frAffiliations), devilFruitId: toNullable(item.devilFruitId), hakiObservation: !!item.hakiObservation, hakiArmament: !!item.hakiArmament, hakiConqueror: !!item.hakiConqueror, bounty: item.bounty ?? 0, - height: toNumber(item.height as any), + height: toNumber(item.height as string | number | null), origin: toNullable(item.origin), + frOrigin: toNullable(item.frOrigin), firstAppearance: item.firstAppearance ?? 0, pictureUrl: toNullable(item.pictureUrl), epithets: toJsonArray(item.epithets), + frEpithets: toJsonArray(item.frEpithets), status: toNullable(item.status), arcId: toNullable(item.arcId), url: toNullable(item.url) }; } -function hasChanged(jsonData: any, dbData: any): boolean { - if (!dbData) return true; - - // Print any differences for debugging - for (const key in jsonData) { - const jsonValue = jsonData[key]; - const dbValue = dbData[key]; - const jsonString = typeof jsonValue === 'object' ? JSON.stringify(jsonValue) : String(jsonValue); - const dbString = typeof dbValue === 'object' ? JSON.stringify(dbValue) : String(dbValue); - if (jsonString !== dbString) { - console.log(`\nField "${key}" changed for character ID ${jsonData.id}:`); - console.log(` JSON: ${jsonString}`); - console.log(` DB: ${dbString}`); - } } - - // Compare each field - return ( - jsonData.name != dbData.name || - jsonData.gender != dbData.gender || - jsonData.age != dbData.age || - JSON.stringify(jsonData.affiliations) != JSON.stringify(dbData.affiliations) || - jsonData.devilFruitId != dbData.devilFruitId || - jsonData.hakiObservation != dbData.hakiObservation || - jsonData.hakiArmament != dbData.hakiArmament || - jsonData.hakiConqueror != dbData.hakiConqueror || - jsonData.bounty != dbData.bounty || - jsonData.height != dbData.height || - jsonData.origin != dbData.origin || - jsonData.firstAppearance != dbData.firstAppearance || - jsonData.pictureUrl != dbData.pictureUrl || - JSON.stringify(jsonData.epithets) != JSON.stringify(dbData.epithets) || - jsonData.status != dbData.status || - jsonData.arcId != dbData.arcId || - jsonData.url != dbData.url - ); -} - async function isCharacterTableEmpty(): Promise { const result = await db.select({ count: sql`COUNT(*)` }).from(character); return result[0]?.count === 0; diff --git a/scripts/scrape-onepiece.ts b/scripts/scrape-onepiece.ts index 83bd963..e5ed8e5 100644 --- a/scripts/scrape-onepiece.ts +++ b/scripts/scrape-onepiece.ts @@ -4,117 +4,122 @@ import { createObjectCsvWriter } from 'csv-writer'; // Type definitions interface Arc { - id: string; - name: string; - frName: string | null; - startChapter: number; - endChapter: number | null; - url: string; + id: string; + name: string; + frName: string | null; + startChapter: number; + endChapter: number | null; + url: string; } interface Character { - id: string; - name: string; - frName: string | null; - gender: string | null; - age: number | null; - height: number | null; - origin: string | null; - frOrigin: string | null; - devilFruitId: string | null; - devilFruitUrl: string | null; - affiliations: string[]; - frAffiliations: string[] | null; - bounty: number | null; - hakiObservation: boolean; - hakiArmament: boolean; - hakiConqueror: boolean; - epithets: string[]; - frEpithets: string[] | null; - firstAppearance: number; - status: string | null; - pictureUrl: string | null; - url: string; - frUrl: string | null; - arcId: string; + id: string; + name: string; + frName: string | null; + gender: string | null; + age: number | null; + height: number | null; + origin: string | null; + frOrigin: string | null; + devilFruitId: string | null; + devilFruitUrl: string | null; + affiliations: string[]; + frAffiliations: string[] | null; + bounty: number | null; + hakiObservation: boolean; + hakiArmament: boolean; + hakiConqueror: boolean; + epithets: string[]; + frEpithets: string[] | null; + firstAppearance: number; + status: string | null; + pictureUrl: string | null; + url: string; + frUrl: string | null; + arcId: string; } interface CharacterListItem { - name: string; - url: string; - chapter: number; + name: string; + url: string; + chapter: number; } interface DevilFruitData { - devilFruitId: string; - devilFruitUrl: string; + devilFruitId: string; + devilFruitUrl: string; } interface DevilFruit { - id: string; - name: string; - type: string | null; - url: string; + id: string; + name: string; + type: string | null; + url: string; } -const FANDOM_API_BASE = 'https://onepiece.fandom.com/api.php?action=parse&redirects=true&format=json&page='; -const FR_FANDOM_API_BASE = 'https://onepiece.fandom.com/fr/api.php?action=parse&redirects=true&format=json&page='; +const FANDOM_API_BASE = + 'https://onepiece.fandom.com/api.php?action=parse&redirects=true&format=json&page='; +const FR_FANDOM_API_BASE = + 'https://onepiece.fandom.com/fr/api.php?action=parse&redirects=true&format=json&page='; const OUTPUT_DIR = './scraped-data'; const MAX_RETRIES = 0; // Set to 0 to disable retries, can be increased if needed const INITIAL_RETRY_DELAY = 1000; const FETCH_CONCURRENCY = 50; - // Create output directory if (!fs.existsSync(OUTPUT_DIR)) { - fs.mkdirSync(OUTPUT_DIR, { recursive: true }); + fs.mkdirSync(OUTPUT_DIR, { recursive: true }); } /** * Retry a fetch request with exponential backoff */ -async function fetchWithRetry(url: string, options: RequestInit = {}, retries: number = 0): Promise { - try { - const headers: Record = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:150.0) Firefox/150.0', - 'Accept-Language': 'en-US,en;q=0.9', - 'Accept-Encoding': 'gzip, deflate, br', - 'Connection': 'keep-alive', - ...((options.headers as Record) || {}) - }; +async function fetchWithRetry( + url: string, + options: RequestInit = {}, + retries: number = 0 +): Promise { + try { + const headers: Record = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:150.0) Firefox/150.0', + 'Accept-Language': 'en-US,en;q=0.9', + 'Accept-Encoding': 'gzip, deflate, br', + Connection: 'keep-alive', + ...((options.headers as Record) || {}) + }; - const response = await fetch(url, { - headers, - ...options - }); + const response = await fetch(url, { + headers, + ...options + }); - // Check if response is OK (status 200-299) - if (response.ok) { - return response; - } + // Check if response is OK (status 200-299) + if (response.ok) { + return response; + } - // If not OK and we have retries left, retry - if (retries < MAX_RETRIES) { - const delay = INITIAL_RETRY_DELAY * Math.pow(2, retries); - console.log(`⚠️ HTTP ${response.status} for ${url}, retrying in ${delay}ms...`); - await new Promise(resolve => setTimeout(resolve, delay)); - return fetchWithRetry(url, options, retries + 1); - } + // If not OK and we have retries left, retry + if (retries < MAX_RETRIES) { + const delay = INITIAL_RETRY_DELAY * Math.pow(2, retries); + console.log(`⚠️ HTTP ${response.status} for ${url}, retrying in ${delay}ms...`); + await new Promise((resolve) => setTimeout(resolve, delay)); + return fetchWithRetry(url, options, retries + 1); + } - // If we've exhausted retries, throw error - throw new Error(`HTTP ${response.status}: ${response.statusText}`); - } catch (error) { - // If it's a network error and we have retries left, retry - if (retries < MAX_RETRIES) { - const delay = INITIAL_RETRY_DELAY * Math.pow(2, retries); - console.log(`⚠️ Network error: ${(error as Error).message}, retrying in ${delay}ms...`); - await new Promise(resolve => setTimeout(resolve, delay)); - return fetchWithRetry(url, options, retries + 1); - } + // If we've exhausted retries, throw error + throw new Error(`HTTP ${response.status}: ${response.statusText}`); + } catch (error) { + // If it's a network error and we have retries left, retry + if (retries < MAX_RETRIES) { + const delay = INITIAL_RETRY_DELAY * Math.pow(2, retries); + console.log(`⚠️ Network error: ${(error as Error).message}, retrying in ${delay}ms...`); + await new Promise((resolve) => setTimeout(resolve, delay)); + return fetchWithRetry(url, options, retries + 1); + } - // If we've exhausted retries, throw error - throw error; - } + // If we've exhausted retries, throw error + throw error; + } } /** @@ -122,396 +127,433 @@ async function fetchWithRetry(url: string, options: RequestInit = {}, retries: n */ function getFrLink(links: { lang: string; ['*']: string; url: string }[]): { url: string } | null { - // Get french url by getting parse.langlinks where lang is "fr" and extract the name from there - const frLink = links.find((link: { lang: string; ['*']: string; url: string }) => link.lang === 'fr'); - return frLink ? { url: frLink['url'] } : null; + // Get french url by getting parse.langlinks where lang is "fr" and extract the name from there + const frLink = links.find( + (link: { lang: string; ['*']: string; url: string }) => link.lang === 'fr' + ); + return frLink ? { url: frLink['url'] } : null; } - /** * Normalize string by decoding URI components, punctuation, and replacing spaces with underscores */ function normalizeId(str: string): string { - return decodeURIComponent(str) - .normalize('NFD') - .replace(/[,:.()]/g, '') - .replace(/\s+/g, '_') - .toLowerCase(); + return decodeURIComponent(str) + .normalize('NFD') + .replace(/[,:.()]/g, '') + .replace(/\s+/g, '_') + .toLowerCase(); } /** * Fetch all arcs from One Piece fandom using API */ async function fetchAllArcs(): Promise { - try { - const apiUrl = `${FANDOM_API_BASE}Chapters_and_Volumes`; - console.log('Fetching arcs list via API...'); - const response = await fetchWithRetry(apiUrl); - const jsonData = await response.json(); - - // Extract HTML from API response - const htmlContent = jsonData.parse?.text?.['*']; - if (!htmlContent) { - throw new Error('Unable to extract HTML content from API response'); - } - - const $ = cheerio.load(htmlContent); - const arcs: Arc[] = []; + try { + const apiUrl = `${FANDOM_API_BASE}Chapters_and_Volumes`; + console.log('Fetching arcs list via API...'); + const response = await fetchWithRetry(apiUrl); + const jsonData = await response.json(); - const seenArcUrls = new Set(); + // Extract HTML from API response + const htmlContent = jsonData.parse?.text?.['*']; + if (!htmlContent) { + throw new Error('Unable to extract HTML content from API response'); + } - // Arc rows are in table cells where first link points to a *_Arc page and text includes chapter range. - const arcCells = $('table.wikitable td').toArray(); - for (const element of arcCells) { - const cell = $(element); - const firstLink = cell.find('a').first(); - const href = firstLink.attr('href') || ''; - let arcName = firstLink.text().trim(); + const $ = cheerio.load(htmlContent); + const arcs: Arc[] = []; - if (!href.startsWith('/wiki/') || !/_Arc$/i.test(href)) { - continue; - } + const seenArcUrls = new Set(); - if (!arcName || !/\bArc\b/i.test(arcName)) { - continue; - } + // Arc rows are in table cells where first link points to a *_Arc page and text includes chapter range. + const arcCells = $('table.wikitable td').toArray(); + for (const element of arcCells) { + const cell = $(element); + const firstLink = cell.find('a').first(); + const href = firstLink.attr('href') || ''; + let arcName = firstLink.text().trim(); - arcName = arcName.replace(/\bArc\b/i, '').trim(); + if (!href.startsWith('/wiki/') || !/_Arc$/i.test(href)) { + continue; + } - const cleanUrl = href.replace('/wiki/', ''); - if (seenArcUrls.has(cleanUrl)) { - continue; - } + if (!arcName || !/\bArc\b/i.test(arcName)) { + continue; + } - const cellText = cell.text().replace(/\s+/g, ' ').trim(); - const chapterMatch = cellText.match(/Chapters\s+(\d+)\s+to\s+(\d+|Current)/i); - if (!chapterMatch) { - continue; - } + arcName = arcName.replace(/\bArc\b/i, '').trim(); - const startChapter = parseInt(chapterMatch[1], 10); - const endChapter = /current/i.test(chapterMatch[2]) - ? null - : parseInt(chapterMatch[2], 10); + const cleanUrl = href.replace('/wiki/', ''); + if (seenArcUrls.has(cleanUrl)) { + continue; + } - let arcId = normalizeId(cleanUrl); - arcId = arcId.replace(/_arc$/i, ''); + const cellText = cell.text().replace(/\s+/g, ' ').trim(); + const chapterMatch = cellText.match(/Chapters\s+(\d+)\s+to\s+(\d+|Current)/i); + if (!chapterMatch) { + continue; + } - // Query the href page via API to get the correct HTML content (in case of redirect) and extract the French name from there - const arcResponse = await fetchWithRetry(`${FANDOM_API_BASE}${cleanUrl}`); - const arcJsonData = await arcResponse.json(); - let frArcName: string | null = arcJsonData.parse?.langlinks.find((link: { lang: string; ['*']: string }) => link.lang === 'fr')?.['*'] || null; + const startChapter = parseInt(chapterMatch[1], 10); + const endChapter = /current/i.test(chapterMatch[2]) ? null : parseInt(chapterMatch[2], 10); - // Remove "Arc" suffix from French name if present to keep it consistent with English names (e.g. "Arc de Luffy" becomes "Luffy") - if (frArcName && /\bArc\b/i.test(frArcName)) { - frArcName = frArcName.replace(/\bArc\b/i, '').trim(); - } + let arcId = normalizeId(cleanUrl); + arcId = arcId.replace(/_arc$/i, ''); - arcs.push({ - id: arcId, - name: arcName, - frName: frArcName, - startChapter, - endChapter, - url: cleanUrl - }); + // Query the href page via API to get the correct HTML content (in case of redirect) and extract the French name from there + const arcResponse = await fetchWithRetry(`${FANDOM_API_BASE}${cleanUrl}`); + const arcJsonData = await arcResponse.json(); + let frArcName: string | null = + arcJsonData.parse?.langlinks.find( + (link: { lang: string; ['*']: string }) => link.lang === 'fr' + )?.['*'] || null; - seenArcUrls.add(cleanUrl); - } + // Remove "Arc" suffix from French name if present to keep it consistent with English names (e.g. "Arc de Luffy" becomes "Luffy") + if (frArcName && /\bArc\b/i.test(frArcName)) { + frArcName = frArcName.replace(/\bArc\b/i, '').trim(); + } - console.log(`Found ${arcs.length} arcs.`); - return arcs; - } catch (error) { - console.error('Error fetching arcs list:', (error as Error).message); - return []; - } + arcs.push({ + id: arcId, + name: arcName, + frName: frArcName, + startChapter, + endChapter, + url: cleanUrl + }); + + seenArcUrls.add(cleanUrl); + } + + console.log(`Found ${arcs.length} arcs.`); + return arcs; + } catch (error) { + console.error('Error fetching arcs list:', (error as Error).message); + return []; + } } /** * Save arcs to JSON */ async function saveArcsToJSON(arcs: Arc[]): Promise { - const filepath = `${OUTPUT_DIR}/arcs.json`; - fs.writeFileSync(filepath, JSON.stringify(arcs, null, 2)); - console.log(`✓ Saved to ${filepath}`); + const filepath = `${OUTPUT_DIR}/arcs.json`; + fs.writeFileSync(filepath, JSON.stringify(arcs, null, 2)); + console.log(`✓ Saved to ${filepath}`); } /** * Save arcs to CSV */ async function saveArcsToCSV(arcs: Arc[]): Promise { - const filepath = `${OUTPUT_DIR}/arcs.csv`; - const csvWriter = createObjectCsvWriter({ - path: filepath, - header: [ - { id: 'id', title: 'ID' }, - { id: 'name', title: 'Name' }, - { id: 'frName', title: 'French Name' }, - { id: 'startChapter', title: 'Start Chapter' }, - { id: 'endChapter', title: 'End Chapter' }, - { id: 'url', title: 'URL' } - ], - }); + const filepath = `${OUTPUT_DIR}/arcs.csv`; + const csvWriter = createObjectCsvWriter({ + path: filepath, + header: [ + { id: 'id', title: 'ID' }, + { id: 'name', title: 'Name' }, + { id: 'frName', title: 'French Name' }, + { id: 'startChapter', title: 'Start Chapter' }, + { id: 'endChapter', title: 'End Chapter' }, + { id: 'url', title: 'URL' } + ] + }); - const records = arcs - .filter((arc) => arc !== null) - .map((arc) => ({ - id: arc.id || '', - name: arc.name || '', - frName: arc.frName || '', - startChapter: arc.startChapter || '', - endChapter: arc.endChapter || '', - url: arc.url || '' - })); + const records = arcs + .filter((arc) => arc !== null) + .map((arc) => ({ + id: arc.id || '', + name: arc.name || '', + frName: arc.frName || '', + startChapter: arc.startChapter || '', + endChapter: arc.endChapter || '', + url: arc.url || '' + })); - await csvWriter.writeRecords(records); - console.log(`✓ Saved to ${filepath}`); + await csvWriter.writeRecords(records); + console.log(`✓ Saved to ${filepath}`); } /** * Fetch all cannon characters from One Piece fandom using API */ async function fetchAllCharactersUrl(): Promise { - try { - const apiUrl = `${FANDOM_API_BASE}List_of_Canon_Characters`; - console.log('Fetching character list via API...'); - const response = await fetchWithRetry(apiUrl); - const jsonData = await response.json(); - - // Extract HTML from API response - const htmlContent = jsonData.parse?.text?.['*']; - if (!htmlContent) { - throw new Error('Unable to extract HTML content from API response'); - } - - const $ = cheerio.load(htmlContent); - const characters: CharacterListItem[] = []; - $('table.fandom-table tbody tr').each((index, element) => { - if (index === 0) return; // Skip header row - let charUrl = $(element).find('td:nth-child(2) a').attr('href'); - const charName = $(element).find('td:nth-child(2) a').text().trim(); - let charChapter = $(element).find('td:nth-child(3)').text().trim(); + try { + const apiUrl = `${FANDOM_API_BASE}List_of_Canon_Characters`; + console.log('Fetching character list via API...'); + const response = await fetchWithRetry(apiUrl); + const jsonData = await response.json(); - // Remove parentheses and their content from chapter info (e.g. "1 (flashback)" becomes "1") - charChapter = charChapter.replace(/\([^)]*\)/g, ''); - charChapter = charChapter.replace(/\D/g, ''); + // Extract HTML from API response + const htmlContent = jsonData.parse?.text?.['*']; + if (!htmlContent) { + throw new Error('Unable to extract HTML content from API response'); + } - // If charChapter is empty, skip the character as it means they don't have a proper page and are just mentioned in the list - if (!charChapter) { - return; - } + const $ = cheerio.load(htmlContent); + const characters: CharacterListItem[] = []; + $('table.fandom-table tbody tr').each((index, element) => { + if (index === 0) return; // Skip header row + let charUrl = $(element).find('td:nth-child(2) a').attr('href'); + const charName = $(element).find('td:nth-child(2) a').text().trim(); + let charChapter = $(element).find('td:nth-child(3)').text().trim(); - if (parseInt(charChapter, 10) === 0) { - return; - } + // Remove parentheses and their content from chapter info (e.g. "1 (flashback)" becomes "1") + charChapter = charChapter.replace(/\([^)]*\)/g, ''); + charChapter = charChapter.replace(/\D/g, ''); - if (charUrl) { - charUrl = charUrl.replace('/wiki/', ''); - characters.push({ - name: charName, - url: charUrl, - chapter: parseInt(charChapter, 10) - }); - } - }); - console.log(`Found ${characters.length} characters.`); - return characters; - } catch (error) { - console.error('Error fetching character list:', (error as Error).message); - return []; - } + // If charChapter is empty, skip the character as it means they don't have a proper page and are just mentioned in the list + if (!charChapter) { + return; + } + + if (parseInt(charChapter, 10) === 0) { + return; + } + + if (charUrl) { + charUrl = charUrl.replace('/wiki/', ''); + characters.push({ + name: charName, + url: charUrl, + chapter: parseInt(charChapter, 10) + }); + } + }); + console.log(`Found ${characters.length} characters.`); + return characters; + } catch (error) { + console.error('Error fetching character list:', (error as Error).message); + return []; + } } /** * Fetch character data from fandom using provided URL */ async function fetchCharacter( - characterUrl: string, - characterName: string, - characterChapter: number, - arcsList: Arc[], + characterUrl: string, + characterName: string, + characterChapter: number, + arcsList: Arc[] ): Promise { - try { - console.log(`Fetching: ${characterName}...`); + try { + console.log(`Fetching: ${characterName}...`); - // Use API to fetch character page - const apiUrl = `${FANDOM_API_BASE}${characterUrl}`; - const response = await fetchWithRetry(apiUrl); - const jsonData = await response.json(); + // Use API to fetch character page + const apiUrl = `${FANDOM_API_BASE}${characterUrl}`; + const response = await fetchWithRetry(apiUrl); + const jsonData = await response.json(); - const categories = jsonData.parse?.categories || []; + const categories = jsonData.parse?.categories || []; - // Extract HTML from API response - const htmlContent = jsonData.parse?.text?.['*']; - if (!htmlContent) { - throw new Error('Unable to extract HTML content from API response'); - } + // Extract HTML from API response + const htmlContent = jsonData.parse?.text?.['*']; + if (!htmlContent) { + throw new Error('Unable to extract HTML content from API response'); + } - const $ = cheerio.load(htmlContent); + const $ = cheerio.load(htmlContent); - const name = characterName; + const name = characterName; - // Generate character ID from URL + name combination - const finalCharacterId = normalizeId(characterUrl + '_' + name); + // Generate character ID from URL + name combination + const finalCharacterId = normalizeId(characterUrl + '_' + name); - // Extract gender from JSON categories - let gender: string | null = null; - for (const cat of categories) { - const catName = cat['*'] || ''; - if (catName === 'Male_Characters') { - gender = 'Male'; - break; - } else if (catName === 'Female_Characters') { - gender = 'Female'; - break; - } - } + // Extract gender from JSON categories + let gender: string | null = null; + for (const cat of categories) { + const catName = cat['*'] || ''; + if (catName === 'Male_Characters') { + gender = 'Male'; + break; + } else if (catName === 'Female_Characters') { + gender = 'Female'; + break; + } + } - // Extract age - const age = extractAge($); + // Extract age + const age = extractAge($); - // Extract affiliations - const affiliations = extractAffiliations($); + // Extract affiliations + const affiliations = await extractAffiliations($, 'en'); - // Extract epithets - const epithets = extractEpithets($); + // Extract epithets + const epithets = extractEpithets($); - // Extract devil fruit - const devilFruitData = await extractDevilFruit($); - const devilFruitId = devilFruitData?.devilFruitId || null; - const devilFruitUrl = devilFruitData?.devilFruitUrl || null; + // Extract devil fruit + const devilFruitData = await extractDevilFruit($); + const devilFruitId = devilFruitData?.devilFruitId || null; + const devilFruitUrl = devilFruitData?.devilFruitUrl || null; - // Extract haki from JSON categories - let hakiObservation = false; - let hakiArmament = false; - let hakiConqueror = false; - for (const cat of categories) { - const catName = cat['*'] || ''; - if (catName === 'Observation_Haki_Users') { - hakiObservation = true; - } else if (catName === 'Armament_Haki_Users') { - hakiArmament = true; - } else if (catName === 'Supreme_King_Haki_Users') { - hakiConqueror = true; - } - } + // Extract haki from JSON categories + let hakiObservation = false; + let hakiArmament = false; + let hakiConqueror = false; + for (const cat of categories) { + const catName = cat['*'] || ''; + if (catName === 'Observation_Haki_Users') { + hakiObservation = true; + } else if (catName === 'Armament_Haki_Users') { + hakiArmament = true; + } else if (catName === 'Supreme_King_Haki_Users') { + hakiConqueror = true; + } + } - // Extract bounty - const bounty = extractBounty($); + // Extract bounty + const bounty = extractBounty($); - // Extract height - const height = extractHeight($); + // Extract height + const height = extractHeight($); - // Use chapter from character list, cast to int - const firstAppearance = characterChapter; + // Use chapter from character list, cast to int + const firstAppearance = characterChapter; - // Extract origin - const origin = extractOrigin($); + // Extract origin + const origin = extractOrigin($); - // Extract status - const status = extractStatus($); + // Extract status + const status = extractStatus($); - let arcId = ''; - const arc = arcsList.find(a => a.startChapter <= firstAppearance && (a.endChapter === null || a.endChapter >= firstAppearance)); - if (!arc) { - return null; - } - arcId = arc.id; + let arcId = ''; + const arc = arcsList.find( + (a) => + a.startChapter <= firstAppearance && + (a.endChapter === null || a.endChapter >= firstAppearance) + ); + if (!arc) { + return null; + } + arcId = arc.id; - const frLink = getFrLink(jsonData.parse?.langlinks || []); - const frUrl = frLink ? frLink.url.replace('https://onepiece.fandom.com/fr/wiki/', '') : null; - const frjsonData = frUrl ? await fetchWithRetry(`${FR_FANDOM_API_BASE}${frUrl}`).then(res => res.json()) : null; + const frLink = getFrLink(jsonData.parse?.langlinks || []); + const frUrl = frLink ? frLink.url.replace('https://onepiece.fandom.com/fr/wiki/', '') : null; + const frjsonData = frUrl + ? await fetchWithRetry(`${FR_FANDOM_API_BASE}${frUrl}`).then((res) => res.json()) + : null; - const frName = frjsonData?.parse?.title || null; + let frName = frjsonData?.parse?.title || null; - const frAffiliations = frjsonData ? extractAffiliations(cheerio.load(frjsonData.parse?.text?.['*'] || '')) : null; + const frAffiliations = frjsonData + ? await extractAffiliations(cheerio.load(frjsonData.parse?.text?.['*'] || ''), 'fr') + : null; - const frEpithets = frjsonData ? extractEpithets(cheerio.load(frjsonData.parse?.text?.['*'] || '')) : null; + const frEpithets = frjsonData + ? extractEpithets(cheerio.load(frjsonData.parse?.text?.['*'] || '')) + : null; - const frOrigin = frjsonData ? extractOrigin(cheerio.load(frjsonData.parse?.text?.['*'] || '')) : null; + const frOrigin = frjsonData + ? extractOrigin(cheerio.load(frjsonData.parse?.text?.['*'] || '')) + : null; + if (name !== jsonData.parse?.title) { + frName = name; + } - return { - id: finalCharacterId, - name, - frName, - gender, - age, - height, - origin, - frOrigin, - devilFruitId, - devilFruitUrl, - affiliations, - frAffiliations, - bounty, - hakiObservation, - hakiArmament, - hakiConqueror, - epithets, - frEpithets, - firstAppearance, - arcId, - status, - pictureUrl: "Image_Non_Disponible", - url: characterUrl, - frUrl - }; - } catch (error) { - console.error(`Error fetching ${characterName}:`, (error as Error).message); - return null; - } + return { + id: finalCharacterId, + name, + frName, + gender, + age, + height, + origin, + frOrigin, + devilFruitId, + devilFruitUrl, + affiliations, + frAffiliations, + bounty, + hakiObservation, + hakiArmament, + hakiConqueror, + epithets, + frEpithets, + firstAppearance, + arcId, + status, + pictureUrl: 'Image_Non_Disponible', + url: characterUrl, + frUrl + }; + } catch (error) { + console.error(`Error fetching ${characterName}:`, (error as Error).message); + return null; + } } - /** * Extract age from infobox */ function extractAge($: cheerio.CheerioAPI): number | null { - const div = $('[data-source="age"] .pi-data-value'); - if (div.length === 0) return null; + const div = $('[data-source="age"] .pi-data-value'); + if (div.length === 0) return null; - let text = div.html(); - if (!text) return null; + let text = div.html(); + if (!text) return null; - // Remove all sup blocks (citations) - text = text.replace(/]*>.*?<\/sup>/gi, ''); + // Remove all sup blocks (citations) + text = text.replace(/]*>.*?<\/sup>/gi, ''); - // Get the last element and extract only digits - const parts = text.split(']*>/g, '').trim(); - - // Remove content with parentheses - cleanText = cleanText.replace(/\([^)]*\)/g, ''); - - const digitsOnly = cleanText.replace(/\D/g, ''); - return parseInt(digitsOnly) || null; + // Get the last element and extract only digits + const parts = text.split(']*>/g, '').trim(); + + // Remove content with parentheses + cleanText = cleanText.replace(/\([^)]*\)/g, ''); + + const digitsOnly = cleanText.replace(/\D/g, ''); + return parseInt(digitsOnly) || null; } /** * Extract affiliations from infobox */ -function extractAffiliations($: cheerio.CheerioAPI): string[] { - const div = $('[data-source="affiliation"] .pi-data-value'); - if (div.length === 0) return []; +async function extractAffiliations($: cheerio.CheerioAPI, lang: string): Promise { + const div = $('[data-source="affiliation"] .pi-data-value'); + if (div.length === 0) return []; - const cleanedDiv = div.clone(); - cleanedDiv.find('sup').remove(); + const cleanedDiv = div.clone(); + cleanedDiv.find('sup').remove(); - const text = cleanedDiv.html(); - if (!text) return []; + const text = cleanedDiv.html(); + if (!text) return []; - // Extract all link values - const linkValues = cleanedDiv.find('a').map((_i, el) => $(el).text().trim()).get(); - if (linkValues.length > 0) { - return linkValues; - } + // Resolve affiliations from linked page titles. + const links = cleanedDiv.find('a').toArray(); + if (links.length > 0) { + const linkValues = await Promise.all( + links.map(async (el) => { + const href = $(el).attr('href') || ''; + const resolvedTitle = await fetchWithRetry( + `${lang === 'fr' ? FR_FANDOM_API_BASE : FANDOM_API_BASE}${href.replace('/fr/wiki/', '').replace('/wiki/', '')}` + ) + .then((res) => res.json()) + .then((json) => json.parse?.title) + .catch(() => null); - // Fallback to parsing text - const cleanText = text.replace(/<[^>]*>/g, '').trim(); - const parts = cleanText.split(/\s*\n\s*|\s*;\s*|\s*,\s*/).filter(Boolean); - return parts.length > 0 ? parts : []; + if (resolvedTitle) { + return resolvedTitle; + } + + return $(el).text().trim(); + }) + ); + + const uniqueLinks = Array.from(new Set(linkValues.filter(Boolean))); + if (uniqueLinks.length > 0) { + return uniqueLinks; + } + } + + // Fallback to parsing text + const cleanText = text.replace(/<[^>]*>/g, '').trim(); + const parts = cleanText.split(/\s*\n\s*|\s*;\s*|\s*,\s*/).filter(Boolean); + return parts.length > 0 ? parts : []; } /** @@ -519,43 +561,41 @@ function extractAffiliations($: cheerio.CheerioAPI): string[] { * Handles both quoted and unquoted epithets, keeping only the main/latest readable values. */ function extractEpithets($: cheerio.CheerioAPI): string[] { - const div = $('[data-source="epithet"] .pi-data-value'); - if (div.length === 0) return []; + const div = $('[data-source="epithet"] .pi-data-value'); + if (div.length === 0) return []; - const cleanedDiv = div.clone(); - cleanedDiv.find('sup').remove(); + const cleanedDiv = div.clone(); + cleanedDiv.find('sup').remove(); - const html = cleanedDiv.html(); - if (!html) return []; + const html = cleanedDiv.html(); + if (!html) return []; - const plainText = html - .replace(//gi, '\n') - .replace(/<[^>]*>/g, ''); + const plainText = html.replace(//gi, '\n').replace(/<[^>]*>/g, ''); - const lines = plainText - .split('\n') - .map((line) => line.trim()) - .filter(Boolean); + const lines = plainText + .split('\n') + .map((line) => line.trim()) + .filter(Boolean); - const epithets = lines - .map((line) => { - const normalized = line.replace(/\s+/g, ' ').trim(); + const epithets = lines + .map((line) => { + const normalized = line.replace(/\s+/g, ' ').trim(); - // Prefer explicit quoted epithet if present. - const quotedMatch = normalized.match(/["«“](.*?)["»”]/); - if (quotedMatch?.[1]) { - return quotedMatch[1].trim(); - } + // Prefer explicit quoted epithet if present. + const quotedMatch = normalized.match(/["«“](.*?)["»”]/); + if (quotedMatch?.[1]) { + return quotedMatch[1].trim(); + } - // Otherwise keep only the base epithet text before extra notes/translations. - return normalized - .split(/[;(]/)[0] - .replace(/["'«»“”]/g, '') - .trim(); - }) - .filter(Boolean); + // Otherwise keep only the base epithet text before extra notes/translations. + return normalized + .split(/[;(]/)[0] + .replace(/["'«»“”]/g, '') + .trim(); + }) + .filter(Boolean); - return Array.from(new Set(epithets)); + return Array.from(new Set(epithets)); } /** @@ -563,462 +603,467 @@ function extractEpithets($: cheerio.CheerioAPI): string[] { * Returns both normalized ID and URL */ async function extractDevilFruit($: cheerio.CheerioAPI): Promise { - const link = $('[data-source="dfname"] .pi-data-value a').first(); - if (link.length === 0) return null; + const link = $('[data-source="dfname"] .pi-data-value a').first(); + if (link.length === 0) return null; - const href = link.attr('href'); - if (!href || !href.startsWith('/wiki/')) return null; + const href = link.attr('href'); + if (!href || !href.startsWith('/wiki/')) return null; - const cleanUrl = href.replace('/wiki/', ''); + const cleanUrl = href.replace('/wiki/', ''); - - return { - devilFruitId: normalizeId(cleanUrl), - devilFruitUrl: cleanUrl - }; + return { + devilFruitId: normalizeId(cleanUrl), + devilFruitUrl: cleanUrl + }; } - /** * Extract bounty from infobox */ function extractBounty($: cheerio.CheerioAPI): number | null { - const div = $('[data-source="bounty"] .pi-data-value'); - if (div.length === 0) return 0; + const div = $('[data-source="bounty"] .pi-data-value'); + if (div.length === 0) return 0; - let text = div.html(); - if (!text) return 0; + let text = div.html(); + if (!text) return 0; - // Remove all sup blocks (citations) - text = text.replace(/]*>.*?<\/sup>/gi, ''); + // Remove all sup blocks (citations) + text = text.replace(/]*>.*?<\/sup>/gi, ''); - // Extract the first value before any
tag - const firstValue = text.split(']*>/g, '').trim(); - - // Check if cleanText contains digits - if (!/\d/.test(cleanText)) { - // If no digits, try second value after
- const secondValue = text.split('
')[1]; - if (secondValue) { - cleanText = secondValue.replace(/<[^>]*>/g, '').trim(); - } - } + // Extract the first value before any
tag + const firstValue = text.split(']*>/g, '').trim(); - // Remove all non-digits - cleanText = cleanText.replace(/\D/g, ''); + // Check if cleanText contains digits + if (!/\d/.test(cleanText)) { + // If no digits, try second value after
+ const secondValue = text.split('
')[1]; + if (secondValue) { + cleanText = secondValue.replace(/<[^>]*>/g, '').trim(); + } + } - return cleanText ? parseInt(cleanText) : 0; + // Remove all non-digits + cleanText = cleanText.replace(/\D/g, ''); + + return cleanText ? parseInt(cleanText) : 0; } /** * Extract height from infobox */ function extractHeight($: cheerio.CheerioAPI): number | null { - const div = $('[data-source="height"] .pi-data-value'); - if (div.length === 0) return null; + const div = $('[data-source="height"] .pi-data-value'); + if (div.length === 0) return null; - let text = div.html(); - if (!text) return null; + let text = div.html(); + if (!text) return null; - // Remove all sup blocks (citations) - text = text.replace(/]*>.*?<\/sup>/gi, ''); + // Remove all sup blocks (citations) + text = text.replace(/]*>.*?<\/sup>/gi, ''); - // Convert line breaks to new lines so we can reliably pick the latest value. - const textWithNewLines = text.replace(//gi, '\n'); - const lines = textWithNewLines - .replace(/<[^>]*>/g, '') - .split('\n') - .map((line) => line.trim()) - .filter(Boolean); + // Convert line breaks to new lines so we can reliably pick the latest value. + const textWithNewLines = text.replace(//gi, '\n'); + const lines = textWithNewLines + .replace(/<[^>]*>/g, '') + .split('\n') + .map((line) => line.trim()) + .filter(Boolean); - // Keep only lines that look like a height value, then pick the latest one. - const heightLines = lines.filter((line) => /\d/.test(line) && /(cm|m)/i.test(line)); - const latestLine = heightLines.length > 0 ? heightLines[heightLines.length - 1] : lines[lines.length - 1]; - if (!latestLine) return null; + // Keep only lines that look like a height value, then pick the latest one. + const heightLines = lines.filter((line) => /\d/.test(line) && /(cm|m)/i.test(line)); + const latestLine = + heightLines.length > 0 ? heightLines[heightLines.length - 1] : lines[lines.length - 1]; + if (!latestLine) return null; - // Remove descriptive suffixes like "(post-timeskip)". - const cleanText = latestLine.replace(/\([^)]*\)/g, '').trim(); - const normalized = cleanText.toLowerCase().replace(/\s/g, ''); + // Remove descriptive suffixes like "(post-timeskip)". + const cleanText = latestLine.replace(/\([^)]*\)/g, '').trim(); + const normalized = cleanText.toLowerCase().replace(/\s/g, ''); - // Values are stored in meters in this dataset. - const cmMatch = normalized.match(/(\d+(?:[.,]\d+)?)cm/); - if (cmMatch) { - const cm = parseFloat(cmMatch[1].replace(',', '.')); - return Number.isFinite(cm) ? cm / 100 : null; - } + // Values are stored in meters in this dataset. + const cmMatch = normalized.match(/(\d+(?:[.,]\d+)?)cm/); + if (cmMatch) { + const cm = parseFloat(cmMatch[1].replace(',', '.')); + return Number.isFinite(cm) ? cm / 100 : null; + } - const mMatch = normalized.match(/(\d+(?:[.,]\d+)?)m/); - if (mMatch) { - const meters = parseFloat(mMatch[1].replace(',', '.')); - return Number.isFinite(meters) ? meters : null; - } + const mMatch = normalized.match(/(\d+(?:[.,]\d+)?)m/); + if (mMatch) { + const meters = parseFloat(mMatch[1].replace(',', '.')); + return Number.isFinite(meters) ? meters : null; + } - return null; + return null; } /** * Extract origin from infobox */ function extractOrigin($: cheerio.CheerioAPI): string | null { - const div = $('[data-source="origin"] .pi-data-value, [data-source="origine"] .pi-data-value').first(); - if (div.length === 0) return null; + const div = $( + '[data-source="origin"] .pi-data-value, [data-source="origine"] .pi-data-value' + ).first(); + if (div.length === 0) return null; - let text = div.html(); - if (!text) return null; + let text = div.html(); + if (!text) return null; - // Remove all sup blocks (citations) - text = text.replace(/]*>.*?<\/sup>/gi, ''); + // Remove all sup blocks (citations) + text = text.replace(/]*>.*?<\/sup>/gi, ''); - // Extract the first value before any
tag - const firstValue = text.split(']*>/g, '').trim(); - - // Remove content with parentheses - cleanText = cleanText.replace(/\([^)]*\)/g, '').trim(); - - return cleanText || null; + // Extract the first value before any
tag + const firstValue = text.split(']*>/g, '').trim(); + + // Remove content with parentheses + cleanText = cleanText.replace(/\([^)]*\)/g, '').trim(); + + return cleanText || null; } /** * Extract status from infobox */ function extractStatus($: cheerio.CheerioAPI): string | null { - const div = $('[data-source="status"] .pi-data-value'); - if (div.length === 0) return null; + const div = $('[data-source="status"] .pi-data-value'); + if (div.length === 0) return null; - const statusText = div.text().trim().toLowerCase(); - - if (statusText.includes('Alive')) { - return 'Alive'; - } else if (statusText.includes('Dead')) { - return 'Dead'; - } else if (statusText.includes('Unknown')) { - return 'Unknown'; - } - - return 'Alive'; + const statusText = div.text().trim().toLowerCase(); + + if (statusText.includes('Alive')) { + return 'Alive'; + } else if (statusText.includes('Dead')) { + return 'Dead'; + } else if (statusText.includes('Unknown')) { + return 'Unknown'; + } + + return 'Alive'; } - /** * Save data to JSON */ async function saveToJSON(characters: Character[]): Promise { - const filepath = `${OUTPUT_DIR}/characters.json`; - fs.writeFileSync(filepath, JSON.stringify(characters, null, 2)); - console.log(`✓ Saved to ${filepath}`); + const filepath = `${OUTPUT_DIR}/characters.json`; + fs.writeFileSync(filepath, JSON.stringify(characters, null, 2)); + console.log(`✓ Saved to ${filepath}`); } /** * Save data to CSV */ async function saveToCSV(characters: Character[]): Promise { - const filepath = `${OUTPUT_DIR}/characters.csv`; - const csvWriter = createObjectCsvWriter({ - path: filepath, - header: [ - { id: 'id', title: 'ID' }, - { id: 'name', title: 'Name' }, - { id: 'gender', title: 'Gender' }, - { id: 'age', title: 'Age' }, - { id: 'height', title: 'Height' }, - { id: 'origin', title: 'Origin' }, - { id: 'status', title: 'Status' }, - { id: 'epithets', title: 'Epithets' }, - { id: 'devilFruitId', title: 'Devil Fruit ID' }, - { id: 'affiliations', title: 'Affiliations' }, - { id: 'bounty', title: 'Bounty' }, - { id: 'hakiObservation', title: 'Haki Observation' }, - { id: 'hakiArmament', title: 'Haki Armament' }, - { id: 'hakiConqueror', title: 'Haki Conqueror' }, - { id: 'firstAppearance', title: 'First Appearance' }, - { id: 'arcId', title: 'Arc ID' }, - { id: 'pictureUrl', title: 'Image URL' }, - { id: 'url', title: 'Fandom URL' } - ], - }); + const filepath = `${OUTPUT_DIR}/characters.csv`; + const csvWriter = createObjectCsvWriter({ + path: filepath, + header: [ + { id: 'id', title: 'ID' }, + { id: 'name', title: 'Name' }, + { id: 'gender', title: 'Gender' }, + { id: 'age', title: 'Age' }, + { id: 'height', title: 'Height' }, + { id: 'origin', title: 'Origin' }, + { id: 'status', title: 'Status' }, + { id: 'epithets', title: 'Epithets' }, + { id: 'devilFruitId', title: 'Devil Fruit ID' }, + { id: 'affiliations', title: 'Affiliations' }, + { id: 'bounty', title: 'Bounty' }, + { id: 'hakiObservation', title: 'Haki Observation' }, + { id: 'hakiArmament', title: 'Haki Armament' }, + { id: 'hakiConqueror', title: 'Haki Conqueror' }, + { id: 'firstAppearance', title: 'First Appearance' }, + { id: 'arcId', title: 'Arc ID' }, + { id: 'pictureUrl', title: 'Image URL' }, + { id: 'url', title: 'Fandom URL' } + ] + }); - const records = characters - .filter((c) => c !== null) - .map((c) => ({ - id: c.id || '', - name: c.name || '', - gender: c.gender || '', - age: c.age || '', - height: c.height || '', - origin: c.origin || '', - status: c.status || '', - epithets: Array.isArray(c.epithets) ? c.epithets.join(', ') : (c.epithets || ''), - devilFruitId: c.devilFruitId || '', - affiliations: Array.isArray(c.affiliations) ? c.affiliations.join(', ') : (c.affiliations || ''), - bounty: c.bounty ?? 0, - hakiObservation: c.hakiObservation ? 1 : 0, - hakiArmament: c.hakiArmament ? 1 : 0, - hakiConqueror: c.hakiConqueror ? 1 : 0, - firstAppearance: c.firstAppearance || '', - arcId: c.arcId || '', - pictureUrl: c.pictureUrl || '', - url: c.url || '' - })); + const records = characters + .filter((c) => c !== null) + .map((c) => ({ + id: c.id || '', + name: c.name || '', + gender: c.gender || '', + age: c.age || '', + height: c.height || '', + origin: c.origin || '', + status: c.status || '', + epithets: Array.isArray(c.epithets) ? c.epithets.join(', ') : c.epithets || '', + devilFruitId: c.devilFruitId || '', + affiliations: Array.isArray(c.affiliations) + ? c.affiliations.join(', ') + : c.affiliations || '', + bounty: c.bounty ?? 0, + hakiObservation: c.hakiObservation ? 1 : 0, + hakiArmament: c.hakiArmament ? 1 : 0, + hakiConqueror: c.hakiConqueror ? 1 : 0, + firstAppearance: c.firstAppearance || '', + arcId: c.arcId || '', + pictureUrl: c.pictureUrl || '', + url: c.url || '' + })); - await csvWriter.writeRecords(records); - console.log(`✓ Saved to ${filepath}`); + await csvWriter.writeRecords(records); + console.log(`✓ Saved to ${filepath}`); } /** * Fetch devil fruit data from fandom using provided URL */ -async function fetchDevilFruit(devilFruitUrl: string, devilFruitId: string): Promise { - try { - console.log(`Fetching devil fruit: ${devilFruitUrl}...`); +async function fetchDevilFruit( + devilFruitUrl: string, + devilFruitId: string +): Promise { + try { + console.log(`Fetching devil fruit: ${devilFruitUrl}...`); - // Use API to fetch devil fruit page - const apiUrl = `${FANDOM_API_BASE}${devilFruitUrl}`; - const response = await fetchWithRetry(apiUrl); - const jsonData = await response.json(); - - // Extract HTML from API response - const htmlContent = jsonData.parse?.text?.['*']; - if (!htmlContent) { - throw new Error('Unable to extract HTML content from API response'); - } + // Use API to fetch devil fruit page + const apiUrl = `${FANDOM_API_BASE}${devilFruitUrl}`; + const response = await fetchWithRetry(apiUrl); + const jsonData = await response.json(); - const name = jsonData.parse?.title || devilFruitId.replace(/_/g, ' '); + // Extract HTML from API response + const htmlContent = jsonData.parse?.text?.['*']; + if (!htmlContent) { + throw new Error('Unable to extract HTML content from API response'); + } - let type: string | null = null; - // Determine type based on categories (if categories contain "Paramecia", "Zoan", "Logia" or "Smile") - if (jsonData.parse?.categories) { - const categories = jsonData.parse.categories - .map((cat: { ['*']: string }) => String(cat['*'] || '').toLowerCase()); + const name = jsonData.parse?.title || devilFruitId.replace(/_/g, ' '); - if (categories.some((category: string) => category.includes('paramecia'))) { - type = 'Paramecia'; - } else if (categories.some((category: string) => category.includes('zoan'))) { - type = 'Zoan'; - } else if (categories.some((category: string) => category.includes('logia'))) { - type = 'Logia'; - } else if (categories.some((category: string) => category.includes('smile'))) { - type = 'Smile'; - } - } + let type: string | null = null; + // Determine type based on categories (if categories contain "Paramecia", "Zoan", "Logia" or "Smile") + if (jsonData.parse?.categories) { + const categories = jsonData.parse.categories.map((cat: { ['*']: string }) => + String(cat['*'] || '').toLowerCase() + ); - return { - id: devilFruitId, - name, - type, - url: devilFruitUrl - }; - } catch (error) { - console.error(`Error fetching devil fruit ${devilFruitUrl}:`, (error as Error).message); - return null; - } + if (categories.some((category: string) => category.includes('paramecia'))) { + type = 'Paramecia'; + } else if (categories.some((category: string) => category.includes('zoan'))) { + type = 'Zoan'; + } else if (categories.some((category: string) => category.includes('logia'))) { + type = 'Logia'; + } else if (categories.some((category: string) => category.includes('smile'))) { + type = 'Smile'; + } + } + + return { + id: devilFruitId, + name, + type, + url: devilFruitUrl + }; + } catch (error) { + console.error(`Error fetching devil fruit ${devilFruitUrl}:`, (error as Error).message); + return null; + } } /** * Save devil fruits to JSON */ async function saveDevilFruitsToJSON(devilFruits: DevilFruit[]): Promise { - const filepath = `${OUTPUT_DIR}/devil-fruits.json`; - fs.writeFileSync(filepath, JSON.stringify(devilFruits, null, 2)); - console.log(`✓ Saved to ${filepath}`); + const filepath = `${OUTPUT_DIR}/devil-fruits.json`; + fs.writeFileSync(filepath, JSON.stringify(devilFruits, null, 2)); + console.log(`✓ Saved to ${filepath}`); } /** * Save devil fruits to CSV */ async function saveDevilFruitsToCSV(devilFruits: DevilFruit[]): Promise { - const filepath = `${OUTPUT_DIR}/devil-fruits.csv`; - const csvWriter = createObjectCsvWriter({ - path: filepath, - header: [ - { id: 'id', title: 'ID' }, - { id: 'name', title: 'Name' }, - { id: 'type', title: 'Type' }, - { id: 'url', title: 'URL' } - ], - }); + const filepath = `${OUTPUT_DIR}/devil-fruits.csv`; + const csvWriter = createObjectCsvWriter({ + path: filepath, + header: [ + { id: 'id', title: 'ID' }, + { id: 'name', title: 'Name' }, + { id: 'type', title: 'Type' }, + { id: 'url', title: 'URL' } + ] + }); - const records = devilFruits - .filter((df) => df !== null) - .map((df) => ({ - id: df.id || '', - name: df.name || '', - type: df.type || '', - url: df.url || '' - })); + const records = devilFruits + .filter((df) => df !== null) + .map((df) => ({ + id: df.id || '', + name: df.name || '', + type: df.type || '', + url: df.url || '' + })); - await csvWriter.writeRecords(records); - console.log(`✓ Saved to ${filepath}`); + await csvWriter.writeRecords(records); + console.log(`✓ Saved to ${filepath}`); } /** * Main execution */ async function main(): Promise { - const format = process.argv[2] || 'all'; // json, csv, or all + const format = process.argv[2] || 'all'; // json, csv, or all - console.log(`\nOne Piece Scraper - Mode: ${format}\n`); + console.log(`\nOne Piece Scraper - Mode: ${format}\n`); - // Step 1: Scraping Arcs - console.log('=== Step 1: Scraping Arcs ===\n'); - const arcsList = await fetchAllArcs(); - - if (arcsList.length > 0) { - // Display arcs in table format - arcsList.forEach((arc) => { - console.table({ - ID: arc.id, - Name: arc.name, - FrenchName: arc.frName || '', - StartChapter: arc.startChapter, - EndChapter: arc.endChapter || 'Ongoing', - URL: arc.url - }); - }); + // Step 1: Scraping Arcs + console.log('=== Step 1: Scraping Arcs ===\n'); + const arcsList = await fetchAllArcs(); - console.log(`\n✓ Found ${arcsList.length} arcs\n`); + if (arcsList.length > 0) { + // Display arcs in table format + arcsList.forEach((arc) => { + console.table({ + ID: arc.id, + Name: arc.name, + FrenchName: arc.frName || '', + StartChapter: arc.startChapter, + EndChapter: arc.endChapter || 'Ongoing', + URL: arc.url + }); + }); - if (format === 'json' || format === 'all') { - await saveArcsToJSON(arcsList); - } - if (format === 'csv' || format === 'all') { - await saveArcsToCSV(arcsList); - } - } else { - console.warn('No arcs found, continuing...\n'); - } + console.log(`\n✓ Found ${arcsList.length} arcs\n`); - // Step 2: Scraping Characters - console.log('=== Step 1: Scraping Characters ===\n'); - const characterList = await fetchAllCharactersUrl(); - - if (characterList.length === 0) { - console.error('No characters found. Exiting.'); - return; - } + if (format === 'json' || format === 'all') { + await saveArcsToJSON(arcsList); + } + if (format === 'csv' || format === 'all') { + await saveArcsToCSV(arcsList); + } + } else { + console.warn('No arcs found, continuing...\n'); + } - const characters: Character[] = []; - const devilFruitUrls = new Set(); - let failedCharacters: CharacterListItem[] = [...characterList]; + // Step 2: Scraping Characters + console.log('=== Step 1: Scraping Characters ===\n'); + const characterList = await fetchAllCharactersUrl(); - while (failedCharacters.length > 0) { - const nextFailedCharacters: CharacterListItem[] = []; - console.log(`\nFetching ${failedCharacters.length} characters...`); + if (characterList.length === 0) { + console.error('No characters found. Exiting.'); + return; + } - for (let i = 0; i < failedCharacters.length; i += FETCH_CONCURRENCY) { - const batch = failedCharacters.slice(i, i + FETCH_CONCURRENCY); - const batchResults = await Promise.all( - batch.map(async (char) => { - const data = await fetchCharacter(char.url, char.name, char.chapter, arcsList); - return { char, data }; - }) - ); + const characters: Character[] = []; + const devilFruitUrls = new Set(); + let failedCharacters: CharacterListItem[] = [...characterList]; - for (const { char, data } of batchResults) { - if (data) { - console.table({ - ID: data.id, - Name: data.name, - Gender: data.gender, - Age: data.age, - Status: data.status, - Epithets: data.epithets.join(', '), - Affiliations: data.affiliations.join(', '), - DevilFruitId: data.devilFruitId, - DevilFruitUrl: data.devilFruitUrl, - HakiObservation: data.hakiObservation ? 'Yes' : 'No', - HakiArmament: data.hakiArmament ? 'Yes' : 'No', - HakiConqueror: data.hakiConqueror ? 'Yes' : 'No', - Height: data.height, - Bounty: data.bounty, - Origin: data.origin, - FirstAppearance: data.firstAppearance, - pictureUrl: data.pictureUrl, - FandomURL: data.url - }); + while (failedCharacters.length > 0) { + const nextFailedCharacters: CharacterListItem[] = []; + console.log(`\nFetching ${failedCharacters.length} characters...`); - if (data.devilFruitUrl) { - devilFruitUrls.add(data.devilFruitUrl); - } + for (let i = 0; i < failedCharacters.length; i += FETCH_CONCURRENCY) { + const batch = failedCharacters.slice(i, i + FETCH_CONCURRENCY); + const batchResults = await Promise.all( + batch.map(async (char) => { + const data = await fetchCharacter(char.url, char.name, char.chapter, arcsList); + return { char, data }; + }) + ); + for (const { char, data } of batchResults) { + if (data) { + console.table({ + ID: data.id, + Name: data.name, + Gender: data.gender, + Age: data.age, + Status: data.status, + Epithets: data.epithets.join(', '), + Affiliations: data.affiliations.join(', '), + DevilFruitId: data.devilFruitId, + DevilFruitUrl: data.devilFruitUrl, + HakiObservation: data.hakiObservation ? 'Yes' : 'No', + HakiArmament: data.hakiArmament ? 'Yes' : 'No', + HakiConqueror: data.hakiConqueror ? 'Yes' : 'No', + Height: data.height, + Bounty: data.bounty, + Origin: data.origin, + FirstAppearance: data.firstAppearance, + pictureUrl: data.pictureUrl, + FandomURL: data.url + }); - characters.push(data); - } else { - nextFailedCharacters.push(char); - } - } - } + if (data.devilFruitUrl) { + devilFruitUrls.add(data.devilFruitUrl); + } - failedCharacters = nextFailedCharacters; - if (failedCharacters.length > 0) { - console.log(`⚠️ ${failedCharacters.length} characters failed. Retrying...`); - } - } + characters.push(data); + } else { + nextFailedCharacters.push(char); + } + } + } - console.log(`\n✓ Scraped ${characters.length} characters\n`); - console.log(`✓ Found ${devilFruitUrls.size} unique devil fruits\n`); + failedCharacters = nextFailedCharacters; + if (failedCharacters.length > 0) { + console.log(`⚠️ ${failedCharacters.length} characters failed. Retrying...`); + } + } - // Step 3: Scraping Devil Fruits - console.log('=== Step 2: Scraping Devil Fruits ===\n'); - - if (devilFruitUrls.size === 0) { - console.warn('No devil fruits found from characters, skipping...\n'); - } else { - const devilFruits: DevilFruit[] = []; - const devilFruitUrlArray = Array.from(devilFruitUrls); + console.log(`\n✓ Scraped ${characters.length} characters\n`); + console.log(`✓ Found ${devilFruitUrls.size} unique devil fruits\n`); - for (let i = 0; i < devilFruitUrlArray.length; i += FETCH_CONCURRENCY) { - const batch = devilFruitUrlArray.slice(i, i + FETCH_CONCURRENCY); - const batchResults = await Promise.all( - batch.map(async (url) => { - const data = await fetchDevilFruit(url, normalizeId(url)); - return { url, data }; - }) - ); + // Step 3: Scraping Devil Fruits + console.log('=== Step 2: Scraping Devil Fruits ===\n'); - for (const { data } of batchResults) { - if (data) { - console.table({ - ID: data.id, - Name: data.name, - Type: data.type, - URL: data.url - }); + if (devilFruitUrls.size === 0) { + console.warn('No devil fruits found from characters, skipping...\n'); + } else { + const devilFruits: DevilFruit[] = []; + const devilFruitUrlArray = Array.from(devilFruitUrls); - devilFruits.push(data); - } - } - } + for (let i = 0; i < devilFruitUrlArray.length; i += FETCH_CONCURRENCY) { + const batch = devilFruitUrlArray.slice(i, i + FETCH_CONCURRENCY); + const batchResults = await Promise.all( + batch.map(async (url) => { + const data = await fetchDevilFruit(url, normalizeId(url)); + return { url, data }; + }) + ); - console.log(`\n✓ Scraped ${devilFruits.length} devil fruits\n`); + for (const { data } of batchResults) { + if (data) { + console.table({ + ID: data.id, + Name: data.name, + Type: data.type, + URL: data.url + }); - if (format === 'json' || format === 'all') { - await saveDevilFruitsToJSON(devilFruits); - } - if (format === 'csv' || format === 'all') { - await saveDevilFruitsToCSV(devilFruits); - } + devilFruits.push(data); + } + } + } - // Update characters with normalized devil fruit IDs - const devilFruitMap = new Map(devilFruits.map(df => [df.id, df.id])); - characters.forEach(char => { - if (char.devilFruitUrl) { - const normalizedId = normalizeId(char.devilFruitUrl); - char.devilFruitId = devilFruitMap.get(normalizedId) || normalizedId; - } - }); - } + console.log(`\n✓ Scraped ${devilFruits.length} devil fruits\n`); - // Save characters after devil fruit IDs are updated - if (format === 'json' || format === 'all') { - await saveToJSON(characters); - } - if (format === 'csv' || format === 'all') { - await saveToCSV(characters); - } + if (format === 'json' || format === 'all') { + await saveDevilFruitsToJSON(devilFruits); + } + if (format === 'csv' || format === 'all') { + await saveDevilFruitsToCSV(devilFruits); + } - console.log('\n✓ Done!\n'); + // Update characters with normalized devil fruit IDs + const devilFruitMap = new Map(devilFruits.map((df) => [df.id, df.id])); + characters.forEach((char) => { + if (char.devilFruitUrl) { + const normalizedId = normalizeId(char.devilFruitUrl); + char.devilFruitId = devilFruitMap.get(normalizedId) || normalizedId; + } + }); + } + + // Save characters after devil fruit IDs are updated + if (format === 'json' || format === 'all') { + await saveToJSON(characters); + } + if (format === 'csv' || format === 'all') { + await saveToCSV(characters); + } + + console.log('\n✓ Done!\n'); } main().catch(console.error);