Refactor database schema and update scraping logic for One Piece characters and arcs

- Updated database schema to include French names and adjusted field names for consistency.
- Modified scraping script to fetch and store French names for arcs and characters.
- Improved API calls to handle redirects and fetch additional data for characters.
- Enhanced data extraction methods for character attributes and devil fruits.
- Cleaned up code for better readability and maintainability.
2026-03-14 01:23:29 +01:00
parent a91b298ee5
commit a041a8caf5
24 changed files with 844 additions and 11193 deletions
--- a/scripts/scrape-onepiece.ts
+++ b/scripts/scrape-onepiece.ts
@@ -6,6 +6,7 @@ import { createObjectCsvWriter } from 'csv-writer';
 interface Arc {
    id: string;
    name: string;
+    frName: string | null;
    startChapter: number;
    endChapter: number | null;
    url: string;
@@ -14,30 +15,34 @@ interface Arc {
 interface Character {
    id: string;
    name: string;
+    frName: string | null;
    gender: string | null;
    age: number | null;
    height: number | null;
    origin: string | null;
+    frOrigin: string | null;
    devilFruitId: string | null;
    devilFruitUrl: string | null;
    affiliations: string[];
+    frAffiliations: string[] | null;
    bounty: number | null;
    hakiObservation: boolean;
    hakiArmament: boolean;
    hakiConqueror: boolean;
    epithets: string[];
+    frEpithets: string[] | null;
    firstAppearance: number;
    status: string | null;
    pictureUrl: string | null;
    url: string;
-    arcId?: string;
+    frUrl: string | null;
+    arcId: string;
 }

 interface CharacterListItem {
    name: string;
    url: string;
-    pictureUrl: string | null;
-    chapter: string;
+    chapter: number;
 }

 interface DevilFruitData {
@@ -52,30 +57,13 @@ interface DevilFruit {
    url: string;
 }

-const FANDOM_API_BASE = 'https://onepiece.fandom.com/fr/api.php?action=parse&format=json&page=';
+const FANDOM_API_BASE = 'https://onepiece.fandom.com/api.php?action=parse&redirects=true&format=json&page=';
+const FR_FANDOM_API_BASE = 'https://onepiece.fandom.com/fr/api.php?action=parse&redirects=true&format=json&page=';
 const OUTPUT_DIR = './scraped-data';
 const MAX_RETRIES = 0; // Set to 0 to disable retries, can be increased if needed
 const INITIAL_RETRY_DELAY = 1000;
 const FETCH_CONCURRENCY = 50;

-// Store cookies across requests (simulate browser behavior)
-const cookies = new Map<string, string>();
-
-function getCookieHeader(): string {
-    const cookieArray = Array.from(cookies.values()).map(c => c.split(';')[0]);
-    return cookieArray.length > 0 ? cookieArray.join('; ') : '';
-}
-
-function saveCookies(setCookieHeader: string | string[] | null): void {
-    if (setCookieHeader) {
-        const cookiesList = Array.isArray(setCookieHeader) ? setCookieHeader : [setCookieHeader];
-        cookiesList.forEach(cookie => {
-            const [nameValue] = cookie.split(';');
-            const [name] = nameValue.split('=');
-            if (name) cookies.set(name, cookie);
-        });
-    }
-}

 // Create output directory
 if (!fs.existsSync(OUTPUT_DIR)) {
@@ -94,23 +82,11 @@ async function fetchWithRetry(url: string, options: RequestInit = {}, retries: n
            'Connection': 'keep-alive',
            ...((options.headers as Record<string, string>) || {})
        };
-        
-        // Add cookies from previous requests
-        const cookieHeader = getCookieHeader();
-        if (cookieHeader) {
-            headers['Cookie'] = cookieHeader;
-        }

        const response = await fetch(url, {
            headers,
            ...options
-        } as any);
-        
-        // Save cookies from response
-        const setCookie = response.headers.get('set-cookie');
-        if (setCookie) {
-            saveCookies(setCookie);
-        }
+        });

        // Check if response is OK (status 200-299)
        if (response.ok) {
@@ -141,6 +117,16 @@ async function fetchWithRetry(url: string, options: RequestInit = {}, retries: n
    }
 }

+/**
+ * Get the French link from the API response links array
+ */
+
+function getFrLink(links: { lang: string; ['*']: string; url: string }[]): { url: string } | null {
+    // Find the entry whose lang is "fr" (callers pass parse.langlinks) and return its URL, or null when there is no French link
+    const frLink = links.find((link: { lang: string; ['*']: string; url: string }) => link.lang === 'fr');
+    return frLink ? { url: frLink['url'] } : null;
+}
+

 /**
 * Normalize string by decoding URI components, punctuation, and replacing spaces with underscores
@@ -148,7 +134,7 @@ async function fetchWithRetry(url: string, options: RequestInit = {}, retries: n
 function normalizeId(str: string): string {
    return decodeURIComponent(str)
        .normalize('NFD')
-        .replace(/[,:.\(\)]/g, '')
+        .replace(/[,:.()]/g, '')
        .replace(/\s+/g, '_')
        .toLowerCase();
 }
@@ -158,10 +144,10 @@ function normalizeId(str: string): string {
 */
 async function fetchAllArcs(): Promise<Arc[]> {
    try {
-        const apiUrl = `${FANDOM_API_BASE}Chapitres_et_Tomes`;
+        const apiUrl = `${FANDOM_API_BASE}Chapters_and_Volumes`;
        console.log('Fetching arcs list via API...');
        const response = await fetchWithRetry(apiUrl);
-        const jsonData = await response.json() as any;
+        const jsonData = await response.json();
        
        // Extract HTML from API response
        const htmlContent = jsonData.parse?.text?.['*'];
@@ -172,40 +158,66 @@ async function fetchAllArcs(): Promise<Arc[]> {
        const $ = cheerio.load(htmlContent);
        const arcs: Arc[] = [];

-        // Find all arc links in the table
-        $('table.wikitable td a').each((index, element) => {
-            const text = $(element).text().trim();
-            const href = $(element).attr('href');
-            
-            // Check if it's an arc link (contains "Arc" and chapter info)
-            if (text.includes('Arc') && text.includes('Ch.') && href) {
-                // Extract arc name and chapter range
-                // Example text: "Arc Ville d'Orange(Ch.8 à 21)[T.1 à 3]"
-                console.log(`Processing arc link: ${text} (${href})`);
-                const nameMatch = text.match(/^(.*?Arc.*?)\s*\(Ch\.(\d+)(?:\s*à\s*(?:(\d+)|(?:...)))?\)/);
-                if (nameMatch) {
-                    let arcName = nameMatch[1].trim();
-                    // Remove "Arc " from the name
-                    arcName = arcName.replace(/^Arc\s+/i, '');
-                    
-                    const startChapter = parseInt(nameMatch[2]);
-                    const endChapter = nameMatch[3] ? parseInt(nameMatch[3]) : null;
+        const seenArcUrls = new Set<string>();

-                    // Generate arc ID by normalizing the url 
-                    let arcId = normalizeId(href.replace('/fr/wiki/', ''));
-                    // Remove "Arc_" from the id
-                    arcId = arcId.replace(/^arc_/i, '');
+        // Arc rows are in table cells where first link points to a *_Arc page and text includes chapter range.
+        const arcCells = $('table.wikitable td').toArray();
+        for (const element of arcCells) {
+            const cell = $(element);
+            const firstLink = cell.find('a').first();
+            const href = firstLink.attr('href') || '';
+            let arcName = firstLink.text().trim();

-                    arcs.push({
-                        id: arcId,
-                        name: arcName,
-                        startChapter,
-                        endChapter,
-                        url: href.replace('/fr/wiki/', '')
-                    });
-                }
+            if (!href.startsWith('/wiki/') || !/_Arc$/i.test(href)) {
+                continue;
            }
-        });
+
+            if (!arcName || !/\bArc\b/i.test(arcName)) {
+                continue;
+            }
+
+            arcName = arcName.replace(/\bArc\b/i, '').trim();
+
+            const cleanUrl = href.replace('/wiki/', '');
+            if (seenArcUrls.has(cleanUrl)) {
+                continue;
+            }
+
+            const cellText = cell.text().replace(/\s+/g, ' ').trim();
+            const chapterMatch = cellText.match(/Chapters\s+(\d+)\s+to\s+(\d+|Current)/i);
+            if (!chapterMatch) {
+                continue;
+            }
+
+            const startChapter = parseInt(chapterMatch[1], 10);
+            const endChapter = /current/i.test(chapterMatch[2])
+                ? null
+                : parseInt(chapterMatch[2], 10);
+
+            let arcId = normalizeId(cleanUrl);
+            arcId = arcId.replace(/_arc$/i, '');
+
+            // Query the href page via API to get the correct HTML content (in case of redirect) and extract the French name from there
+            const arcResponse = await fetchWithRetry(`${FANDOM_API_BASE}${cleanUrl}`);
+            const arcJsonData = await arcResponse.json();
+            let frArcName: string | null = arcJsonData.parse?.langlinks.find((link: { lang: string; ['*']: string }) => link.lang === 'fr')?.['*'] || null;
+
+            // Remove the first standalone word "Arc" from the French name, if present, to keep it consistent with the English names (e.g. "Arc Alabasta" becomes "Alabasta")
+            if (frArcName && /\bArc\b/i.test(frArcName)) {
+                frArcName = frArcName.replace(/\bArc\b/i, '').trim();
+            }
+
+            arcs.push({
+                id: arcId,
+                name: arcName,
+                frName: frArcName,
+                startChapter,
+                endChapter,
+                url: cleanUrl
+            });
+
+            seenArcUrls.add(cleanUrl);
+        }

        console.log(`Found ${arcs.length} arcs.`);
        return arcs;
@@ -234,6 +246,7 @@ async function saveArcsToCSV(arcs: Arc[]): Promise<void> {
        header: [
            { id: 'id', title: 'ID' },
            { id: 'name', title: 'Name' },
+            { id: 'frName', title: 'French Name' },
            { id: 'startChapter', title: 'Start Chapter' },
            { id: 'endChapter', title: 'End Chapter' },
            { id: 'url', title: 'URL' }
@@ -245,6 +258,7 @@ async function saveArcsToCSV(arcs: Arc[]): Promise<void> {
        .map((arc) => ({
            id: arc.id || '',
            name: arc.name || '',
+            frName: arc.frName || '',
            startChapter: arc.startChapter || '',
            endChapter: arc.endChapter || '',
            url: arc.url || ''
@@ -259,10 +273,10 @@ async function saveArcsToCSV(arcs: Arc[]): Promise<void> {
 */
 async function fetchAllCharactersUrl(): Promise<CharacterListItem[]> {
    try {
-        const apiUrl = `${FANDOM_API_BASE}Liste_des_Personnages_Canon`;
+        const apiUrl = `${FANDOM_API_BASE}List_of_Canon_Characters`;
        console.log('Fetching character list via API...');
        const response = await fetchWithRetry(apiUrl);
-        const jsonData = await response.json() as any;
+        const jsonData = await response.json();
        
        // Extract HTML from API response
        const htmlContent = jsonData.parse?.text?.['*'];
@@ -272,11 +286,10 @@ async function fetchAllCharactersUrl(): Promise<CharacterListItem[]> {
        
        const $ = cheerio.load(htmlContent);
        const characters: CharacterListItem[] = [];
-        $('table.wikitable tbody tr').each((index, element) => {
+        $('table.fandom-table tbody tr').each((index, element) => {
            if (index === 0) return; // Skip header row
-            let charpictureUrl = $(element).find('td:nth-child(1) a img').attr('data-src') || $(element).find('td:nth-child(1) a img').attr('src');
            let charUrl = $(element).find('td:nth-child(2) a').attr('href');
-            let charName = $(element).find('td:nth-child(2) a').text().trim();
+            const charName = $(element).find('td:nth-child(2) a').text().trim();
            let charChapter = $(element).find('td:nth-child(3)').text().trim();

            // Remove parentheses and their content from chapter info (e.g. "1 (flashback)" becomes "1")
@@ -288,13 +301,16 @@ async function fetchAllCharactersUrl(): Promise<CharacterListItem[]> {
                return;
            }

+            if (parseInt(charChapter, 10) === 0) {
+                return;
+            }
+
            if (charUrl) {
-                charUrl = charUrl.replace('/fr/wiki/', '');
+                charUrl = charUrl.replace('/wiki/', '');
                characters.push({
                    name: charName,
                    url: charUrl,
-                    pictureUrl: charpictureUrl || null,
-                    chapter: charChapter,
+                    chapter: parseInt(charChapter, 10)
                });
            }
        });
@@ -312,27 +328,17 @@ async function fetchAllCharactersUrl(): Promise<CharacterListItem[]> {
 async function fetchCharacter(
    characterUrl: string,
    characterName: string,
-    characterpictureUrl: string | null,
-    characterChapter: string
+    characterChapter: number,
+    arcsList: Arc[],
 ): Promise<Character | null> {
    try {
        console.log(`Fetching: ${characterName}...`);

        // Use API to fetch character page
        const apiUrl = `${FANDOM_API_BASE}${characterUrl}`;
-        let response = await fetchWithRetry(apiUrl);
+        const response = await fetchWithRetry(apiUrl);
+        const jsonData = await response.json();

-        let jsonData = await response.json() as any;
-
-        // Use final page name from API (if parse.limks contains one element, it means the original page was a redirect, so we use the the element 0 as the final URL, otherwise we use the original URL)
-        let finalCharacterUrl = characterUrl;
-        if (jsonData.parse?.links?.length === 1) {
-            finalCharacterUrl = jsonData.parse.links[0]['*'];
-            // Query the API again with the final URL to get the correct HTML content (in case of redirect)
-            response = await fetchWithRetry(`${FANDOM_API_BASE}${finalCharacterUrl}`);
-            jsonData = await response.json() as any;
-        }
-        
        const categories = jsonData.parse?.categories || [];

        // Extract HTML from API response
@@ -346,16 +352,16 @@ async function fetchCharacter(
        const name = characterName;

        // Generate character ID from URL + name combination
-        const finalCharacterId = normalizeId(finalCharacterUrl + '_' + name);
+        const finalCharacterId = normalizeId(characterUrl + '_' + name);

        // Extract gender from JSON categories
        let gender: string | null = null;
        for (const cat of categories) {
            const catName = cat['*'] || '';
-            if (catName === 'Personnages_Masculins') {
+            if (catName === 'Male_Characters') {
                gender = 'Male';
                break;
-            } else if (catName === 'Personnages_Féminins') {
+            } else if (catName === 'Female_Characters') {
                gender = 'Female';
                break;
            }
@@ -381,11 +387,11 @@ async function fetchCharacter(
        let hakiConqueror = false;
        for (const cat of categories) {
            const catName = cat['*'] || '';
-            if (catName === 'Utilisateurs_du_Haki_de_l\'observation') {
+            if (catName === 'Observation_Haki_Users') {
                hakiObservation = true;
-            } else if (catName === 'Utilisateurs_du_Haki_de_l\'armement') {
+            } else if (catName === 'Armament_Haki_Users') {
                hakiArmament = true;
-            } else if (catName === 'Utilisateurs_du_Haki_des_rois') {
+            } else if (catName === 'Supreme_King_Haki_Users') {
                hakiConqueror = true;
            }
        }
@@ -397,7 +403,7 @@ async function fetchCharacter(
        const height = extractHeight($);

        // Use chapter from character list, cast to int 
-        let firstAppearance = parseInt(characterChapter);
+        const firstAppearance = characterChapter;

        // Extract origin
        const origin = extractOrigin($);
@@ -405,31 +411,51 @@ async function fetchCharacter(
        // Extract status
        const status = extractStatus($);

-        // Extract image URL and clean it
-        let pictureUrl = characterpictureUrl;
-        if (pictureUrl && pictureUrl.includes('Image_Non_Disponible')) {
-            pictureUrl = null;
+        let arcId = '';
+        const arc = arcsList.find(a => a.startChapter <= firstAppearance && (a.endChapter === null || a.endChapter >= firstAppearance));
+        if (!arc) {
+            return null;
        }
+        arcId = arc.id;
+
+        const frLink = getFrLink(jsonData.parse?.langlinks || []);
+        const frUrl = frLink ? frLink.url.replace('https://onepiece.fandom.com/fr/wiki/', '') : null;
+        const frjsonData = frUrl ? await fetchWithRetry(`${FR_FANDOM_API_BASE}${frUrl}`).then(res => res.json()) : null;
+
+        const frName = frjsonData?.parse?.title || null;
+
+        const frAffiliations = frjsonData ? extractAffiliations(cheerio.load(frjsonData.parse?.text?.['*'] || '')) : null;
+
+        const frEpithets = frjsonData ? extractEpithets(cheerio.load(frjsonData.parse?.text?.['*'] || '')) : null;
+
+        const frOrigin = frjsonData ? extractOrigin(cheerio.load(frjsonData.parse?.text?.['*'] || '')) : null;
+

        return {
            id: finalCharacterId,
            name,
+            frName,
            gender,
            age,
            height,
            origin,
+            frOrigin,
            devilFruitId,
            devilFruitUrl,
            affiliations,
+            frAffiliations,
            bounty,
            hakiObservation,
            hakiArmament,
            hakiConqueror,
            epithets,
+            frEpithets,
            firstAppearance,
+            arcId,
            status,
-            pictureUrl,
-            url: finalCharacterUrl
+            pictureUrl: "Image_Non_Disponible",
+            url: characterUrl,
+            frUrl
        };
    } catch (error) {
        console.error(`Error fetching ${characterName}:`, (error as Error).message);
@@ -442,7 +468,7 @@ async function fetchCharacter(
 * Extract age from infobox
 */
 function extractAge($: cheerio.CheerioAPI): number | null {
-    const div = $('[data-source="âge"] .pi-data-value');
+    const div = $('[data-source="age"] .pi-data-value');
    if (div.length === 0) return null;

    let text = div.html();
@@ -473,11 +499,11 @@ function extractAffiliations($: cheerio.CheerioAPI): string[] {
    const cleanedDiv = div.clone();
    cleanedDiv.find('sup').remove();

-    let text = cleanedDiv.html();
+    const text = cleanedDiv.html();
    if (!text) return [];

    // Extract all link values
-    const linkValues = cleanedDiv.find('a').map((i, el) => $(el).text().trim()).get();
+    const linkValues = cleanedDiv.find('a').map((_i, el) => $(el).text().trim()).get();
    if (linkValues.length > 0) {
        return linkValues;
    }
@@ -490,28 +516,46 @@ function extractAffiliations($: cheerio.CheerioAPI): string[] {

 /**
 * Extract epithets from infobox
- * Epithets are always between double quotes
+ * Handles both quoted and unquoted epithets, keeping only the main/latest readable values.
 */
 function extractEpithets($: cheerio.CheerioAPI): string[] {
-    const div = $('[data-source="épithète"] .pi-data-value');
+    const div = $('[data-source="epithet"] .pi-data-value');
    if (div.length === 0) return [];

    const cleanedDiv = div.clone();
    cleanedDiv.find('sup').remove();

-    let text = cleanedDiv.text();
-    if (!text) return [];
+    const html = cleanedDiv.html();
+    if (!html) return [];

-    // Extract all text between double quotes (both straight and curly quotes)
-    const matches = text.match(/["«"]([^"»"]+)["»"]/g);
-    if (!matches) return [];
+    const plainText = html
+        .replace(/<br\s*\/?\s*>/gi, '\n')
+        .replace(/<[^>]*>/g, '');

-    // Remove the quotes and trim
-    const epithets = matches.map(match => 
-        match.replace(/^["«"]|["»"]$/g, '').trim()
-    ).filter(Boolean);
+    const lines = plainText
+        .split('\n')
+        .map((line) => line.trim())
+        .filter(Boolean);

-    return epithets;
+    const epithets = lines
+        .map((line) => {
+            const normalized = line.replace(/\s+/g, ' ').trim();
+
+            // Prefer explicit quoted epithet if present.
+            const quotedMatch = normalized.match(/["«“](.*?)["»”]/);
+            if (quotedMatch?.[1]) {
+                return quotedMatch[1].trim();
+            }
+
+            // Otherwise keep only the base epithet text before extra notes/translations.
+            return normalized
+                .split(/[;(]/)[0]
+                .replace(/["'«»“”]/g, '')
+                .trim();
+        })
+        .filter(Boolean);
+
+    return Array.from(new Set(epithets));
 }

 /**
@@ -519,49 +563,27 @@ function extractEpithets($: cheerio.CheerioAPI): string[] {
 * Returns both normalized ID and URL
 */
 async function extractDevilFruit($: cheerio.CheerioAPI): Promise<DevilFruitData | null> {
-    const link = $('[data-source="dfnom"] .pi-data-value a').first();
+    const link = $('[data-source="dfname"] .pi-data-value a').first();
    if (link.length === 0) return null;

    const href = link.attr('href');
-    if (!href || !href.startsWith('/fr/wiki/')) return null;
+    if (!href || !href.startsWith('/wiki/')) return null;

-    const cleanUrl = href.replace('/fr/wiki/', '');
-    
-    try {
-        // Fetch the page via API to follow redirects
-        const apiUrl = `${FANDOM_API_BASE}${decodeURIComponent(cleanUrl)}`;
-        const response = await fetchWithRetry(apiUrl);
-        const jsonData = await response.json() as any;
-        
-        // Use final page name from API (if parse.links contains one element, it means the original page was a redirect, so we use the the element 0 as the final URL, otherwise we use the original URL)
-        let finalPath = cleanUrl;
-        if (jsonData.parse?.links?.length === 1) {
-            finalPath = jsonData.parse.links[0]['*'];
-        }
+    const cleanUrl = href.replace('/wiki/', '');


-        if (finalPath) {
-            return {
-                devilFruitId: normalizeId(finalPath),
-                devilFruitUrl: finalPath
-            };
-        }
-    } catch (error) {
-        console.error(`Error fetching devil fruit page: ${(error as Error).message}`);
-    }
-    
-    // Fallback to the original href
    return {
        devilFruitId: normalizeId(cleanUrl),
        devilFruitUrl: cleanUrl
    };
 }

+
 /**
 * Extract bounty from infobox
 */
 function extractBounty($: cheerio.CheerioAPI): number | null {
-    const div = $('[data-source="prime"] .pi-data-value');
+    const div = $('[data-source="bounty"] .pi-data-value');
    if (div.length === 0) return 0;

    let text = div.html();
@@ -593,7 +615,7 @@ function extractBounty($: cheerio.CheerioAPI): number | null {
 * Extract height from infobox
 */
 function extractHeight($: cheerio.CheerioAPI): number | null {
-    const div = $('[data-source="taille"] .pi-data-value');
+    const div = $('[data-source="height"] .pi-data-value');
    if (div.length === 0) return null;

    let text = div.html();
@@ -602,43 +624,44 @@ function extractHeight($: cheerio.CheerioAPI): number | null {
    // Remove all sup blocks (citations)
    text = text.replace(/<sup[^>]*>.*?<\/sup>/gi, '');

-    // Check if there's a <p> tag - if yes, use content from <p>
-    let content;
-    const pMatch = text.match(/<p[^>]*>(.*?)<\/p>/i);
-    if (pMatch) {
-        // Extract content from the <p> tag
-        content = pMatch[1];
-    } else {
-        // Use the last value method (after any <br> tag)
-        content = text.split('<br>').pop();
-    }
-    
-    let cleanText = (content || '').replace(/<[^>]*>/g, '').trim();
-    
-    // Remove content with parentheses
-    cleanText = cleanText.replace(/\([^)]*\)/g, '');
-    
-    // Normalize units for meters or centimeters
+    // Convert line breaks to new lines so we can reliably pick the latest value.
+    const textWithNewLines = text.replace(/<br\s*\/?\s*>/gi, '\n');
+    const lines = textWithNewLines
+        .replace(/<[^>]*>/g, '')
+        .split('\n')
+        .map((line) => line.trim())
+        .filter(Boolean);
+
+    // Keep only lines that look like a height value, then pick the latest one.
+    const heightLines = lines.filter((line) => /\d/.test(line) && /(cm|m)/i.test(line));
+    const latestLine = heightLines.length > 0 ? heightLines[heightLines.length - 1] : lines[lines.length - 1];
+    if (!latestLine) return null;
+
+    // Remove descriptive suffixes like "(post-timeskip)".
+    const cleanText = latestLine.replace(/\([^)]*\)/g, '').trim();
    const normalized = cleanText.toLowerCase().replace(/\s/g, '');
-    if (normalized.includes('cm')) {
-        const digitsOnly = normalized.replace(/\D/g, '');
-        const cm = parseFloat(digitsOnly);
-        return cm ? cm / 100 : null;
+
+    // Values are stored in meters in this dataset.
+    const cmMatch = normalized.match(/(\d+(?:[.,]\d+)?)cm/);
+    if (cmMatch) {
+        const cm = parseFloat(cmMatch[1].replace(',', '.'));
+        return Number.isFinite(cm) ? cm / 100 : null;
    }

-    if (normalized.includes('m')) {
-        const parts = normalized.split('m').filter(Boolean);
-        return parts.length > 0 ? parseFloat(parts.join('.')) : null;
+    const mMatch = normalized.match(/(\d+(?:[.,]\d+)?)m/);
+    if (mMatch) {
+        const meters = parseFloat(mMatch[1].replace(',', '.'));
+        return Number.isFinite(meters) ? meters : null;
    }
-    
-    return normalized.length > 0 ? parseFloat(normalized.replace(/\D/g, '')) : null;
+
+    return null;
 }

 /**
 * Extract origin from infobox
 */
 function extractOrigin($: cheerio.CheerioAPI): string | null {
-    const div = $('[data-source="origine"] .pi-data-value');
+    const div = $('[data-source="origin"] .pi-data-value, [data-source="origine"] .pi-data-value').first();
    if (div.length === 0) return null;

    let text = div.html();
@@ -661,16 +684,16 @@ function extractOrigin($: cheerio.CheerioAPI): string | null {
 * Extract status from infobox
 */
 function extractStatus($: cheerio.CheerioAPI): string | null {
-    const div = $('[data-source="statut"] .pi-data-value');
+    const div = $('[data-source="status"] .pi-data-value');
    if (div.length === 0) return null;

    const statusText = div.text().trim().toLowerCase();
    
-    if (statusText.includes('vivant')) {
+    if (statusText.includes('Alive')) {
        return 'Alive';
-    } else if (statusText.includes('décédé')) {
+    } else if (statusText.includes('Dead')) {
        return 'Dead';
-    } else if (statusText.includes('inconnu')) {
+    } else if (statusText.includes('Unknown')) {
        return 'Unknown';
    }
    
@@ -753,7 +776,7 @@ async function fetchDevilFruit(devilFruitUrl: string, devilFruitId: string): Pro
        // Use API to fetch devil fruit page
        const apiUrl = `${FANDOM_API_BASE}${devilFruitUrl}`;
        const response = await fetchWithRetry(apiUrl);
-        const jsonData = await response.json() as any;
+        const jsonData = await response.json();
        
        // Extract HTML from API response
        const htmlContent = jsonData.parse?.text?.['*'];
@@ -767,7 +790,7 @@ async function fetchDevilFruit(devilFruitUrl: string, devilFruitId: string): Pro
        // Determine type based on categories (if categories contain "Paramecia", "Zoan", "Logia" or "Smile")
        if (jsonData.parse?.categories) {
            const categories = jsonData.parse.categories
-                .map((cat: any) => String(cat['*'] || '').toLowerCase());
+                .map((cat: { ['*']: string }) => String(cat['*'] || '').toLowerCase());

            if (categories.some((category: string) => category.includes('paramecia'))) {
                type = 'Paramecia';
@@ -847,6 +870,7 @@ async function main(): Promise<void> {
            console.table({
                ID: arc.id,
                Name: arc.name,
+                FrenchName: arc.frName || '',
                StartChapter: arc.startChapter,
                EndChapter: arc.endChapter || 'Ongoing',
                URL: arc.url
@@ -886,7 +910,7 @@ async function main(): Promise<void> {
            const batch = failedCharacters.slice(i, i + FETCH_CONCURRENCY);
            const batchResults = await Promise.all(
                batch.map(async (char) => {
-                    const data = await fetchCharacter(char.url, char.name, char.pictureUrl, char.chapter);
+                    const data = await fetchCharacter(char.url, char.name, char.chapter, arcsList);
                    return { char, data };
                })
            );
@@ -918,12 +942,6 @@ async function main(): Promise<void> {
                        devilFruitUrls.add(data.devilFruitUrl);
                    }

-                    if (data.firstAppearance) {
-                        const arc = arcsList.find(a => a.startChapter <= data.firstAppearance && (a.endChapter === null || a.endChapter >= data.firstAppearance));
-                        if (arc) {
-                            data.arcId = arc.id;
-                        }
-                    }

                    characters.push(data);
                } else {