From 70de84f3ab19cf4810d5093892ec8a3d5c6d7893 Mon Sep 17 00:00:00 2001 From: whidix Date: Tue, 3 Mar 2026 23:12:17 +0100 Subject: [PATCH] feat: update DevilFruitType to include 'Smile' for enhanced categorization --- scripts/scrape-onepiece.ts | 266 +++++++++++++++++++++--------------- src/lib/server/db/schema.ts | 2 +- 2 files changed, 156 insertions(+), 112 deletions(-) diff --git a/scripts/scrape-onepiece.ts b/scripts/scrape-onepiece.ts index 5b0a0d2..60d90a6 100644 --- a/scripts/scrape-onepiece.ts +++ b/scripts/scrape-onepiece.ts @@ -52,10 +52,11 @@ interface DevilFruit { url: string; } -const FANDOM_BASE_URL = 'https://onepiece.fandom.com/fr/wiki'; +const FANDOM_API_BASE = 'https://onepiece.fandom.com/fr/api.php?action=parse&format=json&page='; const OUTPUT_DIR = './scraped-data'; const MAX_RETRIES = 0; // Set to 0 to disable retries, can be increased if needed const INITIAL_RETRY_DELAY = 1000; +const CHARACTER_FETCH_CONCURRENCY = 50; // Store cookies across requests (simulate browser behavior) const cookies = new Map(); @@ -99,7 +100,7 @@ async function fetchWithRetry(url: string, options: RequestInit = {}, retries: n if (cookieHeader) { headers['Cookie'] = cookieHeader; } - + const response = await fetch(url, { headers, ...options @@ -153,15 +154,22 @@ function normalizeId(str: string): string { } /** - * Fetch all arcs from One Piece fandom + * Fetch all arcs from One Piece fandom using API */ async function fetchAllArcs(): Promise { try { - const url = `${FANDOM_BASE_URL}/Chapitres_et_Tomes`; - console.log('Fetching arcs list...'); - const response = await fetchWithRetry(url); - const data = await response.text(); - const $ = cheerio.load(data); + const apiUrl = `${FANDOM_API_BASE}Chapitres_et_Tomes`; + console.log('Fetching arcs list via API...'); + const response = await fetchWithRetry(apiUrl); + const jsonData = await response.json() as any; + + // Extract HTML from API response + const htmlContent = jsonData.parse?.text?.['*']; + if (!htmlContent) { + 
throw new Error('Unable to extract HTML content from API response'); + } + + const $ = cheerio.load(htmlContent); const arcs: Arc[] = []; // Find all arc links in the table @@ -247,15 +255,22 @@ async function saveArcsToCSV(arcs: Arc[]): Promise { } /** - * Fetch all cannon characters from One Piece fandom + * Fetch all canon characters from One Piece fandom using API */ async function fetchAllCharactersUrl(): Promise { try { - const url = `${FANDOM_BASE_URL}/Liste_des_Personnages_Canon`; - console.log('Fetching character list...'); - const response = await fetchWithRetry(url); - const data = await response.text(); - const $ = cheerio.load(data); + const apiUrl = `${FANDOM_API_BASE}Liste_des_Personnages_Canon`; + console.log('Fetching character list via API...'); + const response = await fetchWithRetry(apiUrl); + const jsonData = await response.json() as any; + + // Extract HTML from API response + const htmlContent = jsonData.parse?.text?.['*']; + if (!htmlContent) { + throw new Error('Unable to extract HTML content from API response'); + } + + const $ = cheerio.load(htmlContent); const characters: CharacterListItem[] = []; $('table.wikitable tbody tr').each((index, element) => { if (index === 0) return; // Skip header row @@ -303,43 +318,47 @@ async function fetchCharacter( try { console.log(`Fetching: ${characterName}...`); - const response = await fetchWithRetry(`${FANDOM_BASE_URL}/${characterUrl}`, { - redirect: 'follow' - }); + // Use API to fetch character page + const apiUrl = `${FANDOM_API_BASE}${characterUrl}`; + let response = await fetchWithRetry(apiUrl); - // Use final URL after redirects (canonical character page) + let jsonData = await response.json() as any; + + // Use final page name from API (if parse.links contains one element, it means the original page was a redirect, so we use element 0 as the final URL, otherwise we use the original URL) let finalCharacterUrl = characterUrl; - let finalCharacterId = normalizeId(characterUrl); - try { 
- const finalUrl = new URL(response.url); - const characterUrlPath = finalUrl.pathname.replace('/fr/wiki/', ''); - if (characterUrlPath) { - finalCharacterUrl = characterUrlPath; - finalCharacterId = normalizeId(characterUrlPath); - } - } catch { - // If HTTP is not ok or redirected URL, throw an error to be caught in the outer block - if (!response.ok) { - throw new Error(`HTTP ${response.status}: ${response.statusText}`); - } + if (jsonData.parse?.links?.length === 1) { + finalCharacterUrl = jsonData.parse.links[0]['*']; + // Query the API again with the final URL to get the correct HTML content (in case of redirect) + response = await fetchWithRetry(`${FANDOM_API_BASE}${finalCharacterUrl}`); + jsonData = await response.json() as any; + } + + const categories = jsonData.parse?.categories || []; + + // Extract HTML from API response + const htmlContent = jsonData.parse?.text?.['*']; + if (!htmlContent) { + throw new Error('Unable to extract HTML content from API response'); } - const data = await response.text(); + const $ = cheerio.load(htmlContent); - const $ = cheerio.load(data); - - // Extract character name - const name = $('h1.mw-page-title-main').text().trim() || characterName.replace(/_/g, ' '); + const name = characterName; // Generate character ID from URL + name combination - finalCharacterId = normalizeId(finalCharacterUrl + '_' + name); + const finalCharacterId = normalizeId(finalCharacterUrl + '_' + name); - // Extract gender from the specific categories link + // Extract gender from JSON categories let gender: string | null = null; - if ($('.page-header__categories a[title="Catégorie:Personnages Masculins"]').length > 0) { - gender = 'Male'; - } else if ($('.page-header__categories a[title="Catégorie:Personnages Féminins"]').length > 0) { - gender = 'Female'; + for (const cat of categories) { + const catName = cat['*'] || ''; + if (catName === 'Personnages_Masculins') { + gender = 'Male'; + break; + } else if (catName === 'Personnages_Féminins') { + 
gender = 'Female'; + break; + } } // Extract age @@ -356,10 +375,20 @@ async function fetchCharacter( const devilFruitId = devilFruitData?.devilFruitId || null; const devilFruitUrl = devilFruitData?.devilFruitUrl || null; - // Extract haki - const hakiObservation = $('.page-header__categories a[title="Catégorie:Utilisateurs du Haki de l\'observation"]').length > 0; - const hakiArmament = $('.page-header__categories a[title="Catégorie:Utilisateurs du Haki de l\'armement"]').length > 0; - const hakiConqueror = $('.page-header__categories a[title="Catégorie:Utilisateurs du Haki des rois"]').length > 0; + // Extract haki from JSON categories + let hakiObservation = false; + let hakiArmament = false; + let hakiConqueror = false; + for (const cat of categories) { + const catName = cat['*'] || ''; + if (catName === 'Utilisateurs_du_Haki_de_l\'observation') { + hakiObservation = true; + } else if (catName === 'Utilisateurs_du_Haki_de_l\'armement') { + hakiArmament = true; + } else if (catName === 'Utilisateurs_du_Haki_des_rois') { + hakiConqueror = true; + } + } // Extract bounty const bounty = extractBounty($); @@ -499,16 +528,18 @@ async function extractDevilFruit($: cheerio.CheerioAPI): Promise { */ async function fetchDevilFruit(devilFruitUrl: string, devilFruitId: string): Promise { try { - console.log(`Fetching devil fruit: ${devilFruitId}...`); + console.log(`Fetching devil fruit: ${devilFruitUrl}...`); - const response = await fetchWithRetry(`${FANDOM_BASE_URL}/${devilFruitUrl}`); - const data = await response.text(); - const $ = cheerio.load(data); + // Use API to fetch devil fruit page + const apiUrl = `${FANDOM_API_BASE}${devilFruitUrl}`; + const response = await fetchWithRetry(apiUrl); + const jsonData = await response.json() as any; + + // Extract HTML from API response + const htmlContent = jsonData.parse?.text?.['*']; + if (!htmlContent) { + throw new Error('Unable to extract HTML content from API response'); + } - const name = 
$('span.mw-page-title-main').text().trim(); + const name = jsonData.parse?.title || devilFruitId.replace(/_/g, ' '); - // Extract type from label in infobox let type: string | null = null; - const typeDiv = $('[data-source="type"] .pi-data-value'); - if (typeDiv.length > 0) { - const typeText = typeDiv.text().trim().toLowerCase(); - if (typeText.includes('zoan')) { - type = 'Zoan'; - } else if (typeText.includes('paramecia')) { + // Determine type based on categories (if categories contain "Paramecia", "Zoan", "Logia" or "Smile") + if (jsonData.parse?.categories) { + const categories = jsonData.parse.categories + .map((cat: any) => String(cat['*'] || '').toLowerCase()); + + if (categories.some((category: string) => category.includes('paramecia'))) { type = 'Paramecia'; - } else if (typeText.includes('logia')) { + } else if (categories.some((category: string) => category.includes('zoan'))) { + type = 'Zoan'; + } else if (categories.some((category: string) => category.includes('logia'))) { type = 'Logia'; + } else if (categories.some((category: string) => category.includes('smile'))) { + type = 'Smile'; } } @@ -838,50 +879,53 @@ async function main(): Promise { const nextFailedCharacters: CharacterListItem[] = []; console.log(`\nFetching ${failedCharacters.length} characters...`); - for (let i = 0; i < failedCharacters.length; i++) { - const char = failedCharacters[i]; - const data = await fetchCharacter(char.url, char.name, char.pictureUrl, char.chapter); - - if (data) { - console.table({ - ID: data.id, - Name: data.name, - Gender: data.gender, - Age: data.age, - Status: data.status, - Epithets: data.epithets.join(', '), - Affiliations: data.affiliations.join(', '), - DevilFruitId: data.devilFruitId, - DevilFruitUrl: data.devilFruitUrl, - HakiObservation: data.hakiObservation ? 'Yes' : 'No', - HakiArmament: data.hakiArmament ? 'Yes' : 'No', - HakiConqueror: data.hakiConqueror ? 
'Yes' : 'No', - Height: data.height, - Bounty: data.bounty, - Origin: data.origin, - FirstAppearance: data.firstAppearance, - pictureUrl: data.pictureUrl, - FandomURL: data.url - }); + for (let i = 0; i < failedCharacters.length; i += CHARACTER_FETCH_CONCURRENCY) { + const batch = failedCharacters.slice(i, i + CHARACTER_FETCH_CONCURRENCY); + const batchResults = await Promise.all( + batch.map(async (char) => { + const data = await fetchCharacter(char.url, char.name, char.pictureUrl, char.chapter); + return { char, data }; + }) + ); - // Collect devil fruit URLs - if (data.devilFruitUrl) { - devilFruitUrls.add(data.devilFruitUrl); - } + for (const { char, data } of batchResults) { + if (data) { + console.table({ + ID: data.id, + Name: data.name, + Gender: data.gender, + Age: data.age, + Status: data.status, + Epithets: data.epithets.join(', '), + Affiliations: data.affiliations.join(', '), + DevilFruitId: data.devilFruitId, + DevilFruitUrl: data.devilFruitUrl, + HakiObservation: data.hakiObservation ? 'Yes' : 'No', + HakiArmament: data.hakiArmament ? 'Yes' : 'No', + HakiConqueror: data.hakiConqueror ? 
'Yes' : 'No', + Height: data.height, + Bounty: data.bounty, + Origin: data.origin, + FirstAppearance: data.firstAppearance, + pictureUrl: data.pictureUrl, + FandomURL: data.url + }); - // Add arc IDs to character data - if (data.firstAppearance) { - const arc = arcsList.find(a => a.startChapter <= data.firstAppearance && (a.endChapter === null || a.endChapter >= data.firstAppearance)); - if (arc) { - data.arcId = arc.id; + if (data.devilFruitUrl) { + devilFruitUrls.add(data.devilFruitUrl); } - } - characters.push(data); - } else { - // Add to retry list and wait before next character - nextFailedCharacters.push(char); - await new Promise(resolve => setTimeout(resolve, 1000)); + if (data.firstAppearance) { + const arc = arcsList.find(a => a.startChapter <= data.firstAppearance && (a.endChapter === null || a.endChapter >= data.firstAppearance)); + if (arc) { + data.arcId = arc.id; + } + } + + characters.push(data); + } else { + nextFailedCharacters.push(char); + } } } diff --git a/src/lib/server/db/schema.ts b/src/lib/server/db/schema.ts index 9637fc2..ffb34f8 100644 --- a/src/lib/server/db/schema.ts +++ b/src/lib/server/db/schema.ts @@ -2,7 +2,7 @@ import { integer, sqliteTable, text, real, unique } from 'drizzle-orm/sqlite-cor import { user } from './auth.schema'; // Define devil fruit types -export type DevilFruitType = 'Paramecia' | 'Zoan' | 'Logia' | 'Unknown'; +export type DevilFruitType = 'Paramecia' | 'Zoan' | 'Logia' | 'Smile' | 'Unknown'; // Define the site config table schema export const config = sqliteTable('config', {