feat: update DevilFruitType to include 'Smile' for enhanced categorization
All checks were successful
Build Docker Image / build (push) Successful in 1m28s

This commit is contained in:
2026-03-03 23:12:17 +01:00
parent 6402c378dd
commit 70de84f3ab
2 changed files with 156 additions and 112 deletions

View File

@@ -52,10 +52,11 @@ interface DevilFruit {
url: string;
}
// Base URL of the French One Piece Fandom wiki; page paths are appended after a '/'.
const FANDOM_BASE_URL = 'https://onepiece.fandom.com/fr/wiki';
// MediaWiki `action=parse` JSON endpoint; callers append the page title directly after `page=`.
const FANDOM_API_BASE = 'https://onepiece.fandom.com/fr/api.php?action=parse&format=json&page=';
// Output directory for scraped data (presumably where the CSV files are written — confirm against the save functions).
const OUTPUT_DIR = './scraped-data';
const MAX_RETRIES = 0; // Set to 0 to disable retries, can be increased if needed
// Delay before the first retry (presumably in milliseconds — confirm in fetchWithRetry).
const INITIAL_RETRY_DELAY = 1000;
// Number of characters fetched in parallel per batch in main().
const CHARACTER_FETCH_CONCURRENCY = 50;
// Store cookies across requests (simulate browser behavior)
const cookies = new Map<string, string>();
@@ -99,7 +100,7 @@ async function fetchWithRetry(url: string, options: RequestInit = {}, retries: n
if (cookieHeader) {
headers['Cookie'] = cookieHeader;
}
const response = await fetch(url, {
headers,
...options
@@ -153,15 +154,22 @@ function normalizeId(str: string): string {
}
/**
* Fetch all arcs from One Piece fandom
* Fetch all arcs from One Piece fandom using API
*/
async function fetchAllArcs(): Promise<Arc[]> {
try {
const url = `${FANDOM_BASE_URL}/Chapitres_et_Tomes`;
console.log('Fetching arcs list...');
const response = await fetchWithRetry(url);
const data = await response.text();
const $ = cheerio.load(data);
const apiUrl = `${FANDOM_API_BASE}Chapitres_et_Tomes`;
console.log('Fetching arcs list via API...');
const response = await fetchWithRetry(apiUrl);
const jsonData = await response.json() as any;
// Extract HTML from API response
const htmlContent = jsonData.parse?.text?.['*'];
if (!htmlContent) {
throw new Error('Unable to extract HTML content from API response');
}
const $ = cheerio.load(htmlContent);
const arcs: Arc[] = [];
// Find all arc links in the table
@@ -247,15 +255,22 @@ async function saveArcsToCSV(arcs: Arc[]): Promise<void> {
}
/**
* Fetch all canon characters from One Piece fandom
* Fetch all canon characters from One Piece fandom using API
*/
async function fetchAllCharactersUrl(): Promise<CharacterListItem[]> {
try {
const url = `${FANDOM_BASE_URL}/Liste_des_Personnages_Canon`;
console.log('Fetching character list...');
const response = await fetchWithRetry(url);
const data = await response.text();
const $ = cheerio.load(data);
const apiUrl = `${FANDOM_API_BASE}Liste_des_Personnages_Canon`;
console.log('Fetching character list via API...');
const response = await fetchWithRetry(apiUrl);
const jsonData = await response.json() as any;
// Extract HTML from API response
const htmlContent = jsonData.parse?.text?.['*'];
if (!htmlContent) {
throw new Error('Unable to extract HTML content from API response');
}
const $ = cheerio.load(htmlContent);
const characters: CharacterListItem[] = [];
$('table.wikitable tbody tr').each((index, element) => {
if (index === 0) return; // Skip header row
@@ -303,43 +318,47 @@ async function fetchCharacter(
try {
console.log(`Fetching: ${characterName}...`);
const response = await fetchWithRetry(`${FANDOM_BASE_URL}/${characterUrl}`, {
redirect: 'follow'
});
// Use API to fetch character page
const apiUrl = `${FANDOM_API_BASE}${characterUrl}`;
let response = await fetchWithRetry(apiUrl);
// Use final URL after redirects (canonical character page)
let jsonData = await response.json() as any;
// Use the final page name from the API: if parse.links contains exactly one element, the original page was a redirect, so element 0 is used as the final URL; otherwise the original URL is kept.
let finalCharacterUrl = characterUrl;
let finalCharacterId = normalizeId(characterUrl);
try {
const finalUrl = new URL(response.url);
const characterUrlPath = finalUrl.pathname.replace('/fr/wiki/', '');
if (characterUrlPath) {
finalCharacterUrl = characterUrlPath;
finalCharacterId = normalizeId(characterUrlPath);
}
} catch {
// If HTTP is not ok or redirected URL, throw an error to be caught in the outer block
if (!response.ok) {
throw new Error(`HTTP ${response.status}: ${response.statusText}`);
}
if (jsonData.parse?.links?.length === 1) {
finalCharacterUrl = jsonData.parse.links[0]['*'];
// Query the API again with the final URL to get the correct HTML content (in case of redirect)
response = await fetchWithRetry(`${FANDOM_API_BASE}${finalCharacterUrl}`);
jsonData = await response.json() as any;
}
const categories = jsonData.parse?.categories || [];
// Extract HTML from API response
const htmlContent = jsonData.parse?.text?.['*'];
if (!htmlContent) {
throw new Error('Unable to extract HTML content from API response');
}
const data = await response.text();
const $ = cheerio.load(htmlContent);
const $ = cheerio.load(data);
// Extract character name
const name = $('h1.mw-page-title-main').text().trim() || characterName.replace(/_/g, ' ');
const name = characterName;
// Generate character ID from URL + name combination
finalCharacterId = normalizeId(finalCharacterUrl + '_' + name);
const finalCharacterId = normalizeId(finalCharacterUrl + '_' + name);
// Extract gender from the specific categories link
// Extract gender from JSON categories
let gender: string | null = null;
if ($('.page-header__categories a[title="Catégorie:Personnages Masculins"]').length > 0) {
gender = 'Male';
} else if ($('.page-header__categories a[title="Catégorie:Personnages Féminins"]').length > 0) {
gender = 'Female';
for (const cat of categories) {
const catName = cat['*'] || '';
if (catName === 'Personnages_Masculins') {
gender = 'Male';
break;
} else if (catName === 'Personnages_Féminins') {
gender = 'Female';
break;
}
}
// Extract age
@@ -356,10 +375,20 @@ async function fetchCharacter(
const devilFruitId = devilFruitData?.devilFruitId || null;
const devilFruitUrl = devilFruitData?.devilFruitUrl || null;
// Extract haki
const hakiObservation = $('.page-header__categories a[title="Catégorie:Utilisateurs du Haki de l\'observation"]').length > 0;
const hakiArmament = $('.page-header__categories a[title="Catégorie:Utilisateurs du Haki de l\'armement"]').length > 0;
const hakiConqueror = $('.page-header__categories a[title="Catégorie:Utilisateurs du Haki des rois"]').length > 0;
// Extract haki from JSON categories
let hakiObservation = false;
let hakiArmament = false;
let hakiConqueror = false;
for (const cat of categories) {
const catName = cat['*'] || '';
if (catName === 'Utilisateurs_du_Haki_de_l\'observation') {
hakiObservation = true;
} else if (catName === 'Utilisateurs_du_Haki_de_l\'armement') {
hakiArmament = true;
} else if (catName === 'Utilisateurs_du_Haki_des_rois') {
hakiConqueror = true;
}
}
// Extract bounty
const bounty = extractBounty($);
@@ -499,16 +528,18 @@ async function extractDevilFruit($: cheerio.CheerioAPI): Promise<DevilFruitData
const cleanUrl = href.replace('/fr/wiki/', '');
try {
// Fetch the page to follow redirects
const response = await fetchWithRetry(`${FANDOM_BASE_URL}/${cleanUrl}`, {
redirect: 'follow' // Explicitly follow redirects
});
// Use the final URL after redirects
const finalUrl = new URL(response.url);
const pathname = finalUrl.pathname;
const finalPath = pathname.replace('/fr/wiki/', '');
// Fetch the page via API to follow redirects
const apiUrl = `${FANDOM_API_BASE}${decodeURIComponent(cleanUrl)}`;
const response = await fetchWithRetry(apiUrl);
const jsonData = await response.json() as any;
// Use the final page name from the API: if parse.links contains exactly one element, the original page was a redirect, so element 0 is used as the final URL; otherwise the original URL is kept.
let finalPath = cleanUrl;
if (jsonData.parse?.links?.length === 1) {
finalPath = jsonData.parse.links[0]['*'];
}
if (finalPath) {
return {
devilFruitId: normalizeId(finalPath),
@@ -628,9 +659,9 @@ function extractOrigin($: cheerio.CheerioAPI): string | null {
/**
* Extract status from infobox
*/
function extractStatus($: cheerio.CheerioAPI): string | null {
function extractStatus($: cheerio.CheerioAPI): string {
const div = $('[data-source="statut"] .pi-data-value');
if (div.length === 0) return null;
if (div.length === 0) return 'Alive';
const statusText = div.text().trim().toLowerCase();
@@ -640,7 +671,7 @@ function extractStatus($: cheerio.CheerioAPI): string | null {
return 'Dead';
}
return null;
return 'Alive';
}
@@ -714,25 +745,35 @@ async function saveToCSV(characters: Character[]): Promise<void> {
*/
async function fetchDevilFruit(devilFruitUrl: string, devilFruitId: string): Promise<DevilFruit | null> {
try {
console.log(`Fetching devil fruit: ${devilFruitId}...`);
console.log(`Fetching devil fruit: ${devilFruitUrl}...`);
const response = await fetchWithRetry(`${FANDOM_BASE_URL}/${devilFruitUrl}`);
const data = await response.text();
const $ = cheerio.load(data);
// Use API to fetch devil fruit page
const apiUrl = `${FANDOM_API_BASE}${devilFruitUrl}`;
const response = await fetchWithRetry(apiUrl);
const jsonData = await response.json() as any;
// Extract HTML from API response
const htmlContent = jsonData.parse?.text?.['*'];
if (!htmlContent) {
throw new Error('Unable to extract HTML content from API response');
}
const name = $('span.mw-page-title-main').text().trim();
const name = jsonData.parse?.title || devilFruitId.replace(/_/g, ' ');
// Extract type from label in infobox
let type: string | null = null;
const typeDiv = $('[data-source="type"] .pi-data-value');
if (typeDiv.length > 0) {
const typeText = typeDiv.text().trim().toLowerCase();
if (typeText.includes('zoan')) {
type = 'Zoan';
} else if (typeText.includes('paramecia')) {
// Determine type based on categories (if categories contain "Paramecia", "Zoan", "Logia" or "Smile")
if (jsonData.parse?.categories) {
const categories = jsonData.parse.categories
.map((cat: any) => String(cat['*'] || '').toLowerCase());
if (categories.some((category: string) => category.includes('paramecia'))) {
type = 'Paramecia';
} else if (typeText.includes('logia')) {
} else if (categories.some((category: string) => category.includes('zoan'))) {
type = 'Zoan';
} else if (categories.some((category: string) => category.includes('logia'))) {
type = 'Logia';
} else if (categories.some((category: string) => category.includes('smile'))) {
type = 'Smile';
}
}
@@ -838,50 +879,53 @@ async function main(): Promise<void> {
const nextFailedCharacters: CharacterListItem[] = [];
console.log(`\nFetching ${failedCharacters.length} characters...`);
for (let i = 0; i < failedCharacters.length; i++) {
const char = failedCharacters[i];
const data = await fetchCharacter(char.url, char.name, char.pictureUrl, char.chapter);
if (data) {
console.table({
ID: data.id,
Name: data.name,
Gender: data.gender,
Age: data.age,
Status: data.status,
Epithets: data.epithets.join(', '),
Affiliations: data.affiliations.join(', '),
DevilFruitId: data.devilFruitId,
DevilFruitUrl: data.devilFruitUrl,
HakiObservation: data.hakiObservation ? 'Yes' : 'No',
HakiArmament: data.hakiArmament ? 'Yes' : 'No',
HakiConqueror: data.hakiConqueror ? 'Yes' : 'No',
Height: data.height,
Bounty: data.bounty,
Origin: data.origin,
FirstAppearance: data.firstAppearance,
pictureUrl: data.pictureUrl,
FandomURL: data.url
});
for (let i = 0; i < failedCharacters.length; i += CHARACTER_FETCH_CONCURRENCY) {
const batch = failedCharacters.slice(i, i + CHARACTER_FETCH_CONCURRENCY);
const batchResults = await Promise.all(
batch.map(async (char) => {
const data = await fetchCharacter(char.url, char.name, char.pictureUrl, char.chapter);
return { char, data };
})
);
// Collect devil fruit URLs
if (data.devilFruitUrl) {
devilFruitUrls.add(data.devilFruitUrl);
}
for (const { char, data } of batchResults) {
if (data) {
console.table({
ID: data.id,
Name: data.name,
Gender: data.gender,
Age: data.age,
Status: data.status,
Epithets: data.epithets.join(', '),
Affiliations: data.affiliations.join(', '),
DevilFruitId: data.devilFruitId,
DevilFruitUrl: data.devilFruitUrl,
HakiObservation: data.hakiObservation ? 'Yes' : 'No',
HakiArmament: data.hakiArmament ? 'Yes' : 'No',
HakiConqueror: data.hakiConqueror ? 'Yes' : 'No',
Height: data.height,
Bounty: data.bounty,
Origin: data.origin,
FirstAppearance: data.firstAppearance,
pictureUrl: data.pictureUrl,
FandomURL: data.url
});
// Add arc IDs to character data
if (data.firstAppearance) {
const arc = arcsList.find(a => a.startChapter <= data.firstAppearance && (a.endChapter === null || a.endChapter >= data.firstAppearance));
if (arc) {
data.arcId = arc.id;
if (data.devilFruitUrl) {
devilFruitUrls.add(data.devilFruitUrl);
}
}
characters.push(data);
} else {
// Add to retry list and wait before next character
nextFailedCharacters.push(char);
await new Promise(resolve => setTimeout(resolve, 1000));
if (data.firstAppearance) {
const arc = arcsList.find(a => a.startChapter <= data.firstAppearance && (a.endChapter === null || a.endChapter >= data.firstAppearance));
if (arc) {
data.arcId = arc.id;
}
}
characters.push(data);
} else {
nextFailedCharacters.push(char);
}
}
}

View File

@@ -2,7 +2,7 @@ import { integer, sqliteTable, text, real, unique } from 'drizzle-orm/sqlite-cor
import { user } from './auth.schema';
// Define devil fruit types
export type DevilFruitType = 'Paramecia' | 'Zoan' | 'Logia' | 'Unknown';
export type DevilFruitType = 'Paramecia' | 'Zoan' | 'Logia' | 'Smile' | 'Unknown';
// Define the site config table schema
export const config = sqliteTable('config', {