feat: update DevilFruitType to include 'Smile' for enhanced categorization
All checks were successful
Build Docker Image / build (push) Successful in 1m28s

This commit is contained in:
2026-03-03 23:12:17 +01:00
parent 6402c378dd
commit 70de84f3ab
2 changed files with 156 additions and 112 deletions

View File

@@ -52,10 +52,11 @@ interface DevilFruit {
url: string; url: string;
} }
const FANDOM_BASE_URL = 'https://onepiece.fandom.com/fr/wiki'; const FANDOM_API_BASE = 'https://onepiece.fandom.com/fr/api.php?action=parse&format=json&page=';
const OUTPUT_DIR = './scraped-data'; const OUTPUT_DIR = './scraped-data';
const MAX_RETRIES = 0; // Set to 0 to disable retries, can be increased if needed const MAX_RETRIES = 0; // Set to 0 to disable retries, can be increased if needed
const INITIAL_RETRY_DELAY = 1000; const INITIAL_RETRY_DELAY = 1000;
const CHARACTER_FETCH_CONCURRENCY = 50;
// Store cookies across requests (simulate browser behavior) // Store cookies across requests (simulate browser behavior)
const cookies = new Map<string, string>(); const cookies = new Map<string, string>();
@@ -153,15 +154,22 @@ function normalizeId(str: string): string {
} }
/** /**
* Fetch all arcs from One Piece fandom * Fetch all arcs from One Piece fandom using API
*/ */
async function fetchAllArcs(): Promise<Arc[]> { async function fetchAllArcs(): Promise<Arc[]> {
try { try {
const url = `${FANDOM_BASE_URL}/Chapitres_et_Tomes`; const apiUrl = `${FANDOM_API_BASE}Chapitres_et_Tomes`;
console.log('Fetching arcs list...'); console.log('Fetching arcs list via API...');
const response = await fetchWithRetry(url); const response = await fetchWithRetry(apiUrl);
const data = await response.text(); const jsonData = await response.json() as any;
const $ = cheerio.load(data);
// Extract HTML from API response
const htmlContent = jsonData.parse?.text?.['*'];
if (!htmlContent) {
throw new Error('Unable to extract HTML content from API response');
}
const $ = cheerio.load(htmlContent);
const arcs: Arc[] = []; const arcs: Arc[] = [];
// Find all arc links in the table // Find all arc links in the table
@@ -247,15 +255,22 @@ async function saveArcsToCSV(arcs: Arc[]): Promise<void> {
} }
/** /**
* Fetch all canon characters from One Piece fandom * Fetch all canon characters from One Piece fandom using API
*/ */
async function fetchAllCharactersUrl(): Promise<CharacterListItem[]> { async function fetchAllCharactersUrl(): Promise<CharacterListItem[]> {
try { try {
const url = `${FANDOM_BASE_URL}/Liste_des_Personnages_Canon`; const apiUrl = `${FANDOM_API_BASE}Liste_des_Personnages_Canon`;
console.log('Fetching character list...'); console.log('Fetching character list via API...');
const response = await fetchWithRetry(url); const response = await fetchWithRetry(apiUrl);
const data = await response.text(); const jsonData = await response.json() as any;
const $ = cheerio.load(data);
// Extract HTML from API response
const htmlContent = jsonData.parse?.text?.['*'];
if (!htmlContent) {
throw new Error('Unable to extract HTML content from API response');
}
const $ = cheerio.load(htmlContent);
const characters: CharacterListItem[] = []; const characters: CharacterListItem[] = [];
$('table.wikitable tbody tr').each((index, element) => { $('table.wikitable tbody tr').each((index, element) => {
if (index === 0) return; // Skip header row if (index === 0) return; // Skip header row
@@ -303,43 +318,47 @@ async function fetchCharacter(
try { try {
console.log(`Fetching: ${characterName}...`); console.log(`Fetching: ${characterName}...`);
const response = await fetchWithRetry(`${FANDOM_BASE_URL}/${characterUrl}`, { // Use API to fetch character page
redirect: 'follow' const apiUrl = `${FANDOM_API_BASE}${characterUrl}`;
}); let response = await fetchWithRetry(apiUrl);
// Use final URL after redirects (canonical character page) let jsonData = await response.json() as any;
// Use final page name from API (if parse.links contains one element, it means the original page was a redirect, so we use element 0 as the final URL, otherwise we use the original URL)
let finalCharacterUrl = characterUrl; let finalCharacterUrl = characterUrl;
let finalCharacterId = normalizeId(characterUrl); if (jsonData.parse?.links?.length === 1) {
try { finalCharacterUrl = jsonData.parse.links[0]['*'];
const finalUrl = new URL(response.url); // Query the API again with the final URL to get the correct HTML content (in case of redirect)
const characterUrlPath = finalUrl.pathname.replace('/fr/wiki/', ''); response = await fetchWithRetry(`${FANDOM_API_BASE}${finalCharacterUrl}`);
if (characterUrlPath) { jsonData = await response.json() as any;
finalCharacterUrl = characterUrlPath;
finalCharacterId = normalizeId(characterUrlPath);
}
} catch {
// If HTTP is not ok or redirected URL, throw an error to be caught in the outer block
if (!response.ok) {
throw new Error(`HTTP ${response.status}: ${response.statusText}`);
}
} }
const data = await response.text(); const categories = jsonData.parse?.categories || [];
const $ = cheerio.load(data); // Extract HTML from API response
const htmlContent = jsonData.parse?.text?.['*'];
if (!htmlContent) {
throw new Error('Unable to extract HTML content from API response');
}
// Extract character name const $ = cheerio.load(htmlContent);
const name = $('h1.mw-page-title-main').text().trim() || characterName.replace(/_/g, ' ');
const name = characterName;
// Generate character ID from URL + name combination // Generate character ID from URL + name combination
finalCharacterId = normalizeId(finalCharacterUrl + '_' + name); const finalCharacterId = normalizeId(finalCharacterUrl + '_' + name);
// Extract gender from the specific categories link // Extract gender from JSON categories
let gender: string | null = null; let gender: string | null = null;
if ($('.page-header__categories a[title="Catégorie:Personnages Masculins"]').length > 0) { for (const cat of categories) {
gender = 'Male'; const catName = cat['*'] || '';
} else if ($('.page-header__categories a[title="Catégorie:Personnages Féminins"]').length > 0) { if (catName === 'Personnages_Masculins') {
gender = 'Female'; gender = 'Male';
break;
} else if (catName === 'Personnages_Féminins') {
gender = 'Female';
break;
}
} }
// Extract age // Extract age
@@ -356,10 +375,20 @@ async function fetchCharacter(
const devilFruitId = devilFruitData?.devilFruitId || null; const devilFruitId = devilFruitData?.devilFruitId || null;
const devilFruitUrl = devilFruitData?.devilFruitUrl || null; const devilFruitUrl = devilFruitData?.devilFruitUrl || null;
// Extract haki // Extract haki from JSON categories
const hakiObservation = $('.page-header__categories a[title="Catégorie:Utilisateurs du Haki de l\'observation"]').length > 0; let hakiObservation = false;
const hakiArmament = $('.page-header__categories a[title="Catégorie:Utilisateurs du Haki de l\'armement"]').length > 0; let hakiArmament = false;
const hakiConqueror = $('.page-header__categories a[title="Catégorie:Utilisateurs du Haki des rois"]').length > 0; let hakiConqueror = false;
for (const cat of categories) {
const catName = cat['*'] || '';
if (catName === 'Utilisateurs_du_Haki_de_l\'observation') {
hakiObservation = true;
} else if (catName === 'Utilisateurs_du_Haki_de_l\'armement') {
hakiArmament = true;
} else if (catName === 'Utilisateurs_du_Haki_des_rois') {
hakiConqueror = true;
}
}
// Extract bounty // Extract bounty
const bounty = extractBounty($); const bounty = extractBounty($);
@@ -499,15 +528,17 @@ async function extractDevilFruit($: cheerio.CheerioAPI): Promise<DevilFruitData
const cleanUrl = href.replace('/fr/wiki/', ''); const cleanUrl = href.replace('/fr/wiki/', '');
try { try {
// Fetch the page to follow redirects // Fetch the page via API to follow redirects
const response = await fetchWithRetry(`${FANDOM_BASE_URL}/${cleanUrl}`, { const apiUrl = `${FANDOM_API_BASE}${decodeURIComponent(cleanUrl)}`;
redirect: 'follow' // Explicitly follow redirects const response = await fetchWithRetry(apiUrl);
}); const jsonData = await response.json() as any;
// Use final page name from API (if parse.links contains one element, it means the original page was a redirect, so we use element 0 as the final URL, otherwise we use the original URL)
let finalPath = cleanUrl;
if (jsonData.parse?.links?.length === 1) {
finalPath = jsonData.parse.links[0]['*'];
}
// Use the final URL after redirects
const finalUrl = new URL(response.url);
const pathname = finalUrl.pathname;
const finalPath = pathname.replace('/fr/wiki/', '');
if (finalPath) { if (finalPath) {
return { return {
@@ -628,9 +659,9 @@ function extractOrigin($: cheerio.CheerioAPI): string | null {
/** /**
* Extract status from infobox * Extract status from infobox
*/ */
function extractStatus($: cheerio.CheerioAPI): string | null { function extractStatus($: cheerio.CheerioAPI): string {
const div = $('[data-source="statut"] .pi-data-value'); const div = $('[data-source="statut"] .pi-data-value');
if (div.length === 0) return null; if (div.length === 0) return 'Alive';
const statusText = div.text().trim().toLowerCase(); const statusText = div.text().trim().toLowerCase();
@@ -640,7 +671,7 @@ function extractStatus($: cheerio.CheerioAPI): string | null {
return 'Dead'; return 'Dead';
} }
return null; return 'Alive';
} }
@@ -714,25 +745,35 @@ async function saveToCSV(characters: Character[]): Promise<void> {
*/ */
async function fetchDevilFruit(devilFruitUrl: string, devilFruitId: string): Promise<DevilFruit | null> { async function fetchDevilFruit(devilFruitUrl: string, devilFruitId: string): Promise<DevilFruit | null> {
try { try {
console.log(`Fetching devil fruit: ${devilFruitId}...`); console.log(`Fetching devil fruit: ${devilFruitUrl}...`);
const response = await fetchWithRetry(`${FANDOM_BASE_URL}/${devilFruitUrl}`); // Use API to fetch devil fruit page
const data = await response.text(); const apiUrl = `${FANDOM_API_BASE}${devilFruitUrl}`;
const $ = cheerio.load(data); const response = await fetchWithRetry(apiUrl);
const jsonData = await response.json() as any;
const name = $('span.mw-page-title-main').text().trim(); // Extract HTML from API response
const htmlContent = jsonData.parse?.text?.['*'];
if (!htmlContent) {
throw new Error('Unable to extract HTML content from API response');
}
const name = jsonData.parse?.title || devilFruitId.replace(/_/g, ' ');
// Extract type from label in infobox
let type: string | null = null; let type: string | null = null;
const typeDiv = $('[data-source="type"] .pi-data-value'); // Determine type based on categories (if categories contain "Paramecia", "Zoan", "Logia" or "Smile")
if (typeDiv.length > 0) { if (jsonData.parse?.categories) {
const typeText = typeDiv.text().trim().toLowerCase(); const categories = jsonData.parse.categories
if (typeText.includes('zoan')) { .map((cat: any) => String(cat['*'] || '').toLowerCase());
type = 'Zoan';
} else if (typeText.includes('paramecia')) { if (categories.some((category: string) => category.includes('paramecia'))) {
type = 'Paramecia'; type = 'Paramecia';
} else if (typeText.includes('logia')) { } else if (categories.some((category: string) => category.includes('zoan'))) {
type = 'Zoan';
} else if (categories.some((category: string) => category.includes('logia'))) {
type = 'Logia'; type = 'Logia';
} else if (categories.some((category: string) => category.includes('smile'))) {
type = 'Smile';
} }
} }
@@ -838,50 +879,53 @@ async function main(): Promise<void> {
const nextFailedCharacters: CharacterListItem[] = []; const nextFailedCharacters: CharacterListItem[] = [];
console.log(`\nFetching ${failedCharacters.length} characters...`); console.log(`\nFetching ${failedCharacters.length} characters...`);
for (let i = 0; i < failedCharacters.length; i++) { for (let i = 0; i < failedCharacters.length; i += CHARACTER_FETCH_CONCURRENCY) {
const char = failedCharacters[i]; const batch = failedCharacters.slice(i, i + CHARACTER_FETCH_CONCURRENCY);
const data = await fetchCharacter(char.url, char.name, char.pictureUrl, char.chapter); const batchResults = await Promise.all(
batch.map(async (char) => {
const data = await fetchCharacter(char.url, char.name, char.pictureUrl, char.chapter);
return { char, data };
})
);
if (data) { for (const { char, data } of batchResults) {
console.table({ if (data) {
ID: data.id, console.table({
Name: data.name, ID: data.id,
Gender: data.gender, Name: data.name,
Age: data.age, Gender: data.gender,
Status: data.status, Age: data.age,
Epithets: data.epithets.join(', '), Status: data.status,
Affiliations: data.affiliations.join(', '), Epithets: data.epithets.join(', '),
DevilFruitId: data.devilFruitId, Affiliations: data.affiliations.join(', '),
DevilFruitUrl: data.devilFruitUrl, DevilFruitId: data.devilFruitId,
HakiObservation: data.hakiObservation ? 'Yes' : 'No', DevilFruitUrl: data.devilFruitUrl,
HakiArmament: data.hakiArmament ? 'Yes' : 'No', HakiObservation: data.hakiObservation ? 'Yes' : 'No',
HakiConqueror: data.hakiConqueror ? 'Yes' : 'No', HakiArmament: data.hakiArmament ? 'Yes' : 'No',
Height: data.height, HakiConqueror: data.hakiConqueror ? 'Yes' : 'No',
Bounty: data.bounty, Height: data.height,
Origin: data.origin, Bounty: data.bounty,
FirstAppearance: data.firstAppearance, Origin: data.origin,
pictureUrl: data.pictureUrl, FirstAppearance: data.firstAppearance,
FandomURL: data.url pictureUrl: data.pictureUrl,
}); FandomURL: data.url
});
// Collect devil fruit URLs if (data.devilFruitUrl) {
if (data.devilFruitUrl) { devilFruitUrls.add(data.devilFruitUrl);
devilFruitUrls.add(data.devilFruitUrl);
}
// Add arc IDs to character data
if (data.firstAppearance) {
const arc = arcsList.find(a => a.startChapter <= data.firstAppearance && (a.endChapter === null || a.endChapter >= data.firstAppearance));
if (arc) {
data.arcId = arc.id;
} }
}
characters.push(data); if (data.firstAppearance) {
} else { const arc = arcsList.find(a => a.startChapter <= data.firstAppearance && (a.endChapter === null || a.endChapter >= data.firstAppearance));
// Add to retry list and wait before next character if (arc) {
nextFailedCharacters.push(char); data.arcId = arc.id;
await new Promise(resolve => setTimeout(resolve, 1000)); }
}
characters.push(data);
} else {
nextFailedCharacters.push(char);
}
} }
} }

View File

@@ -2,7 +2,7 @@ import { integer, sqliteTable, text, real, unique } from 'drizzle-orm/sqlite-cor
import { user } from './auth.schema'; import { user } from './auth.schema';
// Define devil fruit types // Define devil fruit types
export type DevilFruitType = 'Paramecia' | 'Zoan' | 'Logia' | 'Unknown'; export type DevilFruitType = 'Paramecia' | 'Zoan' | 'Logia' | 'Smile' | 'Unknown';
// Define the site config table schema // Define the site config table schema
export const config = sqliteTable('config', { export const config = sqliteTable('config', {