// OnePieceDle/scripts/scrape-onepiece.ts

import * as cheerio from 'cheerio';
import fs from 'fs';
import { createObjectCsvWriter } from 'csv-writer';

// Type definitions
/**
 * A story arc scraped from the wiki's "Chapters_and_Volumes" page.
 */
interface Arc {
    /** Normalized identifier derived from the arc page slug, with the trailing "_arc" removed. */
    id: string;
    /** English arc name with the word "Arc" removed. */
    name: string;
    /** French arc name read from the arc page's "fr" language link, or null when none is found. */
    frName: string | null;
    /** First chapter of the arc. */
    startChapter: number;
    /** Last chapter of the arc; null when the page lists the range as ending at "Current". */
    endChapter: number | null;
    /** Wiki page slug (the link href with the "/wiki/" prefix removed). */
    url: string;
}

/**
 * A character record assembled by fetchCharacter from the English page and,
 * when a French language link exists, the matching French page.
 */
interface Character {
    /** Normalized identifier built from the page slug and the listed name. */
    id: string;
    /** Name as shown in the canon character list. */
    name: string;
    /** Title of the French page, or null when there is no French page. */
    frName: string | null;
    /** 'Male' or 'Female' according to the page categories, otherwise null. */
    gender: string | null;
    /** Age parsed from the infobox, or null when absent or not numeric. */
    age: number | null;
    /** Height in meters parsed from the infobox, or null when absent. */
    height: number | null;
    /** First origin listed in the English infobox, or null. */
    origin: string | null;
    /** First origin listed in the French infobox, or null. */
    frOrigin: string | null;
    /** Normalized identifier of the devil fruit page, or null when the character has none. */
    devilFruitId: string | null;
    /** Wiki page slug of the devil fruit, or null when the character has none. */
    devilFruitUrl: string | null;
    /** Affiliations read from the English infobox. */
    affiliations: string[];
    /** Affiliations read from the French infobox; null when there is no French page. */
    frAffiliations: string[] | null;
    /** Bounty parsed from the infobox; extractBounty yields 0 when none is found. */
    bounty: number | null;
    /** True when the page is in the "Observation_Haki_Users" category. */
    hakiObservation: boolean;
    /** True when the page is in the "Armament_Haki_Users" category. */
    hakiArmament: boolean;
    /** True when the page is in the "Supreme_King_Haki_Users" category. */
    hakiConqueror: boolean;
    /** Epithets read from the English infobox, without duplicates. */
    epithets: string[];
    /** Epithets read from the French infobox; null when there is no French page. */
    frEpithets: string[] | null;
    /** Chapter number taken from the canon character list. */
    firstAppearance: number;
    /** Status label produced by extractStatus, or null when the infobox has no status field. */
    status: string | null;
    /** Picture reference; fetchCharacter currently always stores a placeholder value here. */
    pictureUrl: string | null;
    /** English wiki page slug. */
    url: string;
    /** French wiki page slug, or null when there is no French language link. */
    frUrl: string | null;
    /** Identifier of the arc whose chapter range contains firstAppearance. */
    arcId: string;
}

/**
 * One row of the "List_of_Canon_Characters" table, as collected by fetchAllCharactersUrl.
 */
interface CharacterListItem {
    /** Link text of the character cell. */
    name: string;
    /** Wiki page slug (href with the "/wiki/" prefix removed). */
    url: string;
    /** Chapter number parsed from the third column of the row. */
    chapter: number;
}

/**
 * Devil fruit reference extracted from a character infobox by extractDevilFruit.
 */
interface DevilFruitData {
    /** Normalized identifier derived from the devil fruit page slug. */
    devilFruitId: string;
    /** Devil fruit page slug (href with the "/wiki/" prefix removed). */
    devilFruitUrl: string;
}

/**
 * A devil fruit record produced by fetchDevilFruit.
 */
interface DevilFruit {
    /** Normalized identifier passed in by the caller. */
    id: string;
    /** Page title, or the identifier with underscores replaced by spaces when no title is returned. */
    name: string;
    /** 'Paramecia', 'Zoan', 'Logia' or 'Smile' based on the page categories; null when none matches. */
    type: string | null;
    /** Wiki page slug of the devil fruit. */
    url: string;
}

// English wiki "parse" API endpoint; the page name is appended directly after "page=".
const FANDOM_API_BASE = 'https://onepiece.fandom.com/api.php?action=parse&redirects=true&format=json&page=';
// Same endpoint on the French wiki.
const FR_FANDOM_API_BASE = 'https://onepiece.fandom.com/fr/api.php?action=parse&redirects=true&format=json&page=';
// Directory that receives the generated JSON and CSV files.
const OUTPUT_DIR = './scraped-data';
const MAX_RETRIES = 0; // Set to 0 to disable retries, can be increased if needed
// Delay in milliseconds before the first retry; fetchWithRetry doubles it on each further attempt.
const INITIAL_RETRY_DELAY = 1000;
// Number of pages requested in parallel per batch in main().
const FETCH_CONCURRENCY = 50;


// Create output directory
// Runs at module load time, before any scraping starts.
if (!fs.existsSync(OUTPUT_DIR)) {
    fs.mkdirSync(OUTPUT_DIR, { recursive: true });
}

/**
 * Fetch a URL with browser-like default headers, retrying with exponential backoff.
 *
 * Non-2xx responses and network errors are retried while `retries < MAX_RETRIES`,
 * waiting `INITIAL_RETRY_DELAY * 2^retries` milliseconds before the next attempt.
 *
 * @param url - URL to request.
 * @param options - Extra `fetch` options; headers given here override the defaults one by one.
 * @param retries - Number of retries already performed (used by the recursion; callers omit it).
 * @returns The first response whose status is in the 200-299 range.
 * @throws Error `HTTP <status>: <statusText>` when a non-OK response is received and no retry
 *         is left, or the underlying network error in the same situation.
 */
async function fetchWithRetry(url: string, options: RequestInit = {}, retries: number = 0): Promise<Response> {
    // Start from the defaults and overlay the caller's headers. `Headers` accepts every
    // HeadersInit shape (plain object, tuple array, Headers instance), unlike an object spread.
    const headers = new Headers({
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:150.0) Firefox/150.0',
        'Accept-Language': 'en-US,en;q=0.9',
        'Accept-Encoding': 'gzip, deflate, br',
        'Connection': 'keep-alive',
    });
    new Headers(options.headers).forEach((value, name) => headers.set(name, value));

    // Wait for the backoff delay of the current attempt, then issue the next attempt.
    const retryAfterBackoff = async (reason: string): Promise<Response> => {
        const delay = INITIAL_RETRY_DELAY * Math.pow(2, retries);
        console.log(`⚠️  ${reason}, retrying in ${delay}ms...`);
        await new Promise(resolve => setTimeout(resolve, delay));
        return fetchWithRetry(url, options, retries + 1);
    };

    let response: Response;
    try {
        // `headers` comes last so that `options.headers` cannot replace the merged set.
        response = await fetch(url, { ...options, headers });
    } catch (error) {
        // Network-level failure (DNS, connection reset, ...).
        if (retries < MAX_RETRIES) {
            return retryAfterBackoff(`Network error: ${(error as Error).message}`);
        }
        throw error;
    }

    // Check if response is OK (status 200-299)
    if (response.ok) {
        return response;
    }

    if (retries < MAX_RETRIES) {
        return retryAfterBackoff(`HTTP ${response.status} for ${url}`);
    }

    // Retries exhausted: surface the HTTP failure to the caller.
    throw new Error(`HTTP ${response.status}: ${response.statusText}`);
}

/**
 * Look up the French entry in a `parse.langlinks` array returned by the API.
 *
 * @param links - Language links of a page.
 * @returns An object carrying the URL of the first link whose `lang` is "fr", or null when
 *          the page has no French link.
 */
function getFrLink(links: { lang: string; ['*']: string; url: string }[]): { url: string } | null {
    for (const candidate of links) {
        if (candidate.lang === 'fr') {
            return { url: candidate.url };
        }
    }
    return null;
}


/**
 * Turn a wiki page slug or free text into a lowercase identifier.
 *
 * Steps: URI-decode, apply Unicode NFD normalization (combining marks are kept),
 * drop the characters `, : . ( )`, collapse each whitespace run into one underscore,
 * then lowercase.
 *
 * @param str - Slug or text to normalize; must be valid input for decodeURIComponent.
 * @returns The normalized identifier.
 */
function normalizeId(str: string): string {
    const decomposed = decodeURIComponent(str).normalize('NFD');
    const withoutPunctuation = decomposed.replace(/[,:.()]/g, '');
    const underscored = withoutPunctuation.replace(/\s+/g, '_');
    return underscored.toLowerCase();
}

/**
 * Fetch all story arcs from the wiki's "Chapters_and_Volumes" page via the API.
 *
 * A table cell yields an arc when its first link targets a `/wiki/*_Arc` page, the link text
 * contains the word "Arc", and the cell text contains a "Chapters X to Y" range (Y may be
 * "Current"). Each arc page is then queried to read its French name from the "fr" language
 * link; a failure of that lookup only leaves `frName` null for that arc.
 *
 * @returns The arcs in page order, or an empty array when the list page cannot be fetched
 *          or parsed (the error is logged).
 */
async function fetchAllArcs(): Promise<Arc[]> {
    try {
        const apiUrl = `${FANDOM_API_BASE}Chapters_and_Volumes`;
        console.log('Fetching arcs list via API...');
        const response = await fetchWithRetry(apiUrl);
        const jsonData = await response.json();

        // Extract HTML from API response
        const htmlContent = jsonData.parse?.text?.['*'];
        if (!htmlContent) {
            throw new Error('Unable to extract HTML content from API response');
        }

        const $ = cheerio.load(htmlContent);
        const arcs: Arc[] = [];

        // The same arc can be linked from several cells; keep only its first occurrence.
        const seenArcUrls = new Set<string>();

        // Arc rows are in table cells where first link points to a *_Arc page and text includes chapter range.
        const arcCells = $('table.wikitable td').toArray();
        for (const element of arcCells) {
            const cell = $(element);
            const firstLink = cell.find('a').first();
            const href = firstLink.attr('href') || '';
            let arcName = firstLink.text().trim();

            if (!href.startsWith('/wiki/') || !/_Arc$/i.test(href)) {
                continue;
            }

            if (!arcName || !/\bArc\b/i.test(arcName)) {
                continue;
            }

            arcName = arcName.replace(/\bArc\b/i, '').trim();

            const cleanUrl = href.replace('/wiki/', '');
            if (seenArcUrls.has(cleanUrl)) {
                continue;
            }

            const cellText = cell.text().replace(/\s+/g, ' ').trim();
            const chapterMatch = cellText.match(/Chapters\s+(\d+)\s+to\s+(\d+|Current)/i);
            if (!chapterMatch) {
                continue;
            }

            const startChapter = parseInt(chapterMatch[1], 10);
            // "Current" marks the ongoing arc, which has no end chapter yet.
            const endChapter = /current/i.test(chapterMatch[2])
                ? null
                : parseInt(chapterMatch[2], 10);

            let arcId = normalizeId(cleanUrl);
            arcId = arcId.replace(/_arc$/i, '');

            // Query the arc page via API (following redirects) and read the French name from
            // its language links. This lookup is best-effort: a failed request or a response
            // without `langlinks` must not discard the arcs collected so far.
            let frArcName: string | null = null;
            try {
                const arcResponse = await fetchWithRetry(`${FANDOM_API_BASE}${cleanUrl}`);
                const arcJsonData = await arcResponse.json();
                const langLinks: { lang: string; ['*']: string }[] = arcJsonData.parse?.langlinks ?? [];
                frArcName = langLinks.find((link) => link.lang === 'fr')?.['*'] || null;
            } catch (error) {
                console.warn(`Could not resolve French name for ${cleanUrl}:`, (error as Error).message);
            }

            // Remove the word "Arc" from the French name if present to keep it consistent with English names
            if (frArcName && /\bArc\b/i.test(frArcName)) {
                frArcName = frArcName.replace(/\bArc\b/i, '').trim();
            }

            arcs.push({
                id: arcId,
                name: arcName,
                frName: frArcName,
                startChapter,
                endChapter,
                url: cleanUrl
            });

            seenArcUrls.add(cleanUrl);
        }

        console.log(`Found ${arcs.length} arcs.`);
        return arcs;
    } catch (error) {
        console.error('Error fetching arcs list:', (error as Error).message);
        return [];
    }
}

/**
 * Write the arcs to `arcs.json` inside OUTPUT_DIR as 2-space indented JSON.
 *
 * @param arcs - Arcs to serialize.
 */
async function saveArcsToJSON(arcs: Arc[]): Promise<void> {
    const destination = `${OUTPUT_DIR}/arcs.json`;
    const serialized = JSON.stringify(arcs, null, 2);
    fs.writeFileSync(destination, serialized);
    console.log(`✓ Saved to ${destination}`);
}

/**
 * Write the arcs to `arcs.csv` inside OUTPUT_DIR.
 *
 * Null entries are skipped and falsy field values (including a null end chapter) are
 * written as empty cells.
 *
 * @param arcs - Arcs to export.
 */
async function saveArcsToCSV(arcs: Arc[]): Promise<void> {
    const destination = `${OUTPUT_DIR}/arcs.csv`;
    const columns = [
        { id: 'id', title: 'ID' },
        { id: 'name', title: 'Name' },
        { id: 'frName', title: 'French Name' },
        { id: 'startChapter', title: 'Start Chapter' },
        { id: 'endChapter', title: 'End Chapter' },
        { id: 'url', title: 'URL' }
    ];
    const writer = createObjectCsvWriter({ path: destination, header: columns });

    const rows: Record<string, string | number>[] = [];
    for (const arc of arcs) {
        if (arc === null) {
            continue;
        }
        rows.push({
            id: arc.id || '',
            name: arc.name || '',
            frName: arc.frName || '',
            startChapter: arc.startChapter || '',
            endChapter: arc.endChapter || '',
            url: arc.url || ''
        });
    }

    await writer.writeRecords(rows);
    console.log(`✓ Saved to ${destination}`);
}

/**
 * Fetch the list of canon characters from the wiki's "List_of_Canon_Characters" page.
 *
 * Each table row after the first contributes one entry when its second column holds a link
 * and its third column yields a non-zero chapter number.
 *
 * @returns The collected entries, or an empty array when the page cannot be fetched or
 *          parsed (the error is logged).
 */
async function fetchAllCharactersUrl(): Promise<CharacterListItem[]> {
    try {
        const listUrl = `${FANDOM_API_BASE}List_of_Canon_Characters`;
        console.log('Fetching character list via API...');
        const listResponse = await fetchWithRetry(listUrl);
        const payload = await listResponse.json();

        // The rendered page HTML sits under parse.text['*'] in the API response.
        const pageHtml = payload.parse?.text?.['*'];
        if (!pageHtml) {
            throw new Error('Unable to extract HTML content from API response');
        }

        const $ = cheerio.load(pageHtml);
        const found: CharacterListItem[] = [];

        // slice(1) drops the very first matched row, which is the header row.
        const rows = $('table.fandom-table tbody tr').toArray().slice(1);
        for (const row of rows) {
            const nameLink = $(row).find('td:nth-child(2) a');
            const href = nameLink.attr('href');
            const label = nameLink.text().trim();

            // Drop parenthesised notes (e.g. "1 (flashback)" becomes "1"), then keep digits only.
            const chapterDigits = $(row)
                .find('td:nth-child(3)')
                .text()
                .trim()
                .replace(/\([^)]*\)/g, '')
                .replace(/\D/g, '');

            // No chapter means the entry is only mentioned in the list and has no proper page.
            if (!chapterDigits) {
                continue;
            }

            const chapter = parseInt(chapterDigits, 10);
            if (chapter === 0 || !href) {
                continue;
            }

            found.push({
                name: label,
                url: href.replace('/wiki/', ''),
                chapter
            });
        }

        console.log(`Found ${found.length} characters.`);
        return found;
    } catch (error) {
        console.error('Error fetching character list:', (error as Error).message);
        return [];
    }
}

/**
 * Fetch one character page and build its Character record.
 *
 * The arc is resolved first from `characterChapter`; when no arc covers that chapter the
 * character is skipped without any network request. Gender and haki flags come from the
 * page categories, the remaining fields from the infobox. When the page has a French
 * language link, the French page is fetched once to fill the `fr*` fields.
 *
 * @param characterUrl - English wiki page slug.
 * @param characterName - Name from the character list; stored as-is.
 * @param characterChapter - Chapter of first appearance from the character list.
 * @param arcsList - Known arcs used to resolve `arcId`.
 * @returns The character, or null when no arc contains the chapter or when fetching or
 *          parsing fails (the error is logged).
 */
async function fetchCharacter(
    characterUrl: string,
    characterName: string,
    characterChapter: number,
    arcsList: Arc[],
): Promise<Character | null> {
    try {
        console.log(`Fetching: ${characterName}...`);

        // Use chapter from character list
        const firstAppearance = characterChapter;

        // Resolve the arc before touching the network: a character outside every known arc
        // is dropped regardless of what its page contains.
        const arc = arcsList.find(a => a.startChapter <= firstAppearance && (a.endChapter === null || a.endChapter >= firstAppearance));
        if (!arc) {
            console.warn(`No arc covers chapter ${firstAppearance} for ${characterName}, skipping.`);
            return null;
        }
        const arcId = arc.id;

        // Use API to fetch character page
        const apiUrl = `${FANDOM_API_BASE}${characterUrl}`;
        const response = await fetchWithRetry(apiUrl);
        const jsonData = await response.json();

        const categories = jsonData.parse?.categories || [];

        // Extract HTML from API response
        const htmlContent = jsonData.parse?.text?.['*'];
        if (!htmlContent) {
            throw new Error('Unable to extract HTML content from API response');
        }

        const $ = cheerio.load(htmlContent);

        const name = characterName;

        // Generate character ID from URL + name combination
        const finalCharacterId = normalizeId(characterUrl + '_' + name);

        // Derive gender and haki flags from the categories in a single pass.
        // For gender, the first matching category wins.
        let gender: string | null = null;
        let hakiObservation = false;
        let hakiArmament = false;
        let hakiConqueror = false;
        for (const cat of categories) {
            switch (cat['*'] || '') {
                case 'Male_Characters':
                    if (gender === null) gender = 'Male';
                    break;
                case 'Female_Characters':
                    if (gender === null) gender = 'Female';
                    break;
                case 'Observation_Haki_Users':
                    hakiObservation = true;
                    break;
                case 'Armament_Haki_Users':
                    hakiArmament = true;
                    break;
                case 'Supreme_King_Haki_Users':
                    hakiConqueror = true;
                    break;
            }
        }

        // Infobox fields of the English page.
        const age = extractAge($);
        const affiliations = extractAffiliations($);
        const epithets = extractEpithets($);
        const devilFruitData = await extractDevilFruit($);
        const devilFruitId = devilFruitData?.devilFruitId || null;
        const devilFruitUrl = devilFruitData?.devilFruitUrl || null;
        const bounty = extractBounty($);
        const height = extractHeight($);
        const origin = extractOrigin($);
        const status = extractStatus($);

        // French counterpart, when the page links to one.
        const frLink = getFrLink(jsonData.parse?.langlinks || []);
        const frUrl = frLink ? frLink.url.replace('https://onepiece.fandom.com/fr/wiki/', '') : null;

        let frName: string | null = null;
        let frAffiliations: string[] | null = null;
        let frEpithets: string[] | null = null;
        let frOrigin: string | null = null;
        if (frUrl) {
            const frResponse = await fetchWithRetry(`${FR_FANDOM_API_BASE}${frUrl}`);
            const frJsonData = await frResponse.json();
            if (frJsonData) {
                // Parse the French HTML once and reuse it for every French field.
                const $fr = cheerio.load(frJsonData.parse?.text?.['*'] || '');
                frName = frJsonData.parse?.title || null;
                frAffiliations = extractAffiliations($fr);
                frEpithets = extractEpithets($fr);
                frOrigin = extractOrigin($fr);
            }
        }

        return {
            id: finalCharacterId,
            name,
            frName,
            gender,
            age,
            height,
            origin,
            frOrigin,
            devilFruitId,
            devilFruitUrl,
            affiliations,
            frAffiliations,
            bounty,
            hakiObservation,
            hakiArmament,
            hakiConqueror,
            epithets,
            frEpithets,
            firstAppearance,
            arcId,
            status,
            // Placeholder: no picture is scraped here.
            pictureUrl: "Image_Non_Disponible",
            url: characterUrl,
            frUrl
        };
    } catch (error) {
        console.error(`Error fetching ${characterName}:`, (error as Error).message);
        return null;
    }
}


/**
 * Read the character's age from the infobox field `data-source="age"`.
 *
 * Citations (`<sup>` blocks) are dropped, the text after the last `<br` is kept, parenthesised
 * notes are removed, and all remaining digits are parsed as one integer.
 *
 * @param $ - Loaded page.
 * @returns The age, or null when the field is missing, empty, non-numeric or zero.
 */
function extractAge($: cheerio.CheerioAPI): number | null {
    const valueNode = $('[data-source="age"] .pi-data-value');
    if (valueNode.length === 0) return null;

    const rawHtml = valueNode.html();
    if (!rawHtml) return null;

    // Citations go first so their reference numbers are not mistaken for the age.
    const segments = rawHtml
        .replace(/<sup[^>]*>.*?<\/sup>/gi, '')
        .split('<br');

    // The last segment holds the most recent value.
    const latest = segments[segments.length - 1]
        .replace(/<[^>]*>/g, '')
        .trim()
        .replace(/\([^)]*\)/g, '');

    const parsed = parseInt(latest.replace(/\D/g, ''), 10);
    return parsed || null;
}

/**
 * Read the affiliations from the infobox field `data-source="affiliation"`.
 *
 * When the field contains links, the trimmed link texts are returned. Otherwise the plain
 * text is split on newlines, semicolons and commas.
 *
 * @param $ - Loaded page.
 * @returns The affiliations, or an empty array when the field is missing or empty.
 */
function extractAffiliations($: cheerio.CheerioAPI): string[] {
    const source = $('[data-source="affiliation"] .pi-data-value');
    if (source.length === 0) return [];

    // Strip citations on a copy so the loaded document itself is left untouched.
    const working = source.clone();
    working.find('sup').remove();

    const markup = working.html();
    if (!markup) return [];

    // Preferred source: one affiliation per link.
    const linked = working
        .find('a')
        .toArray()
        .map((anchor) => $(anchor).text().trim());
    if (linked.length > 0) {
        return linked;
    }

    // Fallback to parsing text
    return markup
        .replace(/<[^>]*>/g, '')
        .trim()
        .split(/\s*\n\s*|\s*;\s*|\s*,\s*/)
        .filter(Boolean);
}

/**
 * Read the epithets from the infobox field `data-source="epithet"`.
 *
 * The field is split into lines at `<br>` tags. For each line the first quoted span is used
 * when there is one; otherwise the text before the first `;` or `(` is kept with quote
 * characters removed. Duplicates are dropped while keeping first-seen order.
 *
 * @param $ - Loaded page.
 * @returns The epithets, or an empty array when the field is missing or empty.
 */
function extractEpithets($: cheerio.CheerioAPI): string[] {
    const source = $('[data-source="epithet"] .pi-data-value');
    if (source.length === 0) return [];

    // Strip citations on a copy so the loaded document itself is left untouched.
    const working = source.clone();
    working.find('sup').remove();

    const markup = working.html();
    if (!markup) return [];

    const withoutTags = markup
        .replace(/<br\s*\/?\s*>/gi, '\n')
        .replace(/<[^>]*>/g, '');

    // A Set keeps insertion order, which gives de-duplication for free.
    const unique = new Set<string>();
    for (const rawLine of withoutTags.split('\n')) {
        const trimmed = rawLine.trim();
        if (!trimmed) continue;

        const collapsed = trimmed.replace(/\s+/g, ' ').trim();

        // Prefer explicit quoted epithet if present.
        const quoted = collapsed.match(/["«“](.*?)["»”]/);
        const epithet = quoted?.[1]
            ? quoted[1].trim()
            // Otherwise keep only the base epithet text before extra notes/translations.
            : collapsed.split(/[;(]/)[0].replace(/["'«»“”]/g, '').trim();

        if (epithet) {
            unique.add(epithet);
        }
    }

    return Array.from(unique);
}

/**
 * Read the devil fruit reference from the infobox field `data-source="dfname"`.
 *
 * Only the first link of the field is considered, and only when it points to a `/wiki/` page.
 *
 * @param $ - Loaded page.
 * @returns The page slug and its normalized identifier, or null when there is no usable link.
 */
async function extractDevilFruit($: cheerio.CheerioAPI): Promise<DevilFruitData | null> {
    // attr() yields undefined for an empty selection, so one check covers "no link" too.
    const target = $('[data-source="dfname"] .pi-data-value a').first().attr('href');
    if (!target?.startsWith('/wiki/')) {
        return null;
    }

    const slug = target.replace('/wiki/', '');
    return { devilFruitId: normalizeId(slug), devilFruitUrl: slug };
}


/**
 * Read the bounty from the infobox field `data-source="bounty"`.
 *
 * Citations (`<sup>` blocks) are dropped and the field is split at `<br>` tags in any
 * spelling (`<br>`, `<br/>`, `<br />`, with attributes). The first segment that contains a
 * digit is used, and all of its digits are parsed as one integer.
 *
 * @param $ - Loaded page.
 * @returns The bounty, or 0 when the field is missing, empty or holds no number.
 */
function extractBounty($: cheerio.CheerioAPI): number | null {
    const div = $('[data-source="bounty"] .pi-data-value');
    if (div.length === 0) return 0;

    const html = div.html();
    if (!html) return 0;

    const segments = html
        // Remove all sup blocks (citations) so reference numbers are not read as a bounty.
        .replace(/<sup[^>]*>.*?<\/sup>/gi, '')
        .split(/<br\b[^>]*>/i)
        .map((segment) => segment.replace(/<[^>]*>/g, '').trim());

    // Skip leading non-numeric segments such as "Unknown" or "None".
    const firstNumeric = segments.find((segment) => /\d/.test(segment));
    if (!firstNumeric) return 0;

    // Remove all non-digits (thousands separators, currency symbols, notes).
    return parseInt(firstNumeric.replace(/\D/g, ''), 10);
}

/**
 * Read the character's height from the infobox field `data-source="height"`.
 *
 * The field is split into lines at `<br>` tags. The last line that contains a digit and an
 * "m" is used (or simply the last line when none qualifies). A value in centimeters is
 * converted to meters; a value in meters is returned as-is. Both `.` and `,` are accepted
 * as decimal separator.
 *
 * @param $ - Loaded page.
 * @returns The height in meters, or null when the field is missing or cannot be parsed.
 */
function extractHeight($: cheerio.CheerioAPI): number | null {
    const valueNode = $('[data-source="height"] .pi-data-value');
    if (valueNode.length === 0) return null;

    const rawHtml = valueNode.html();
    if (!rawHtml) return null;

    const candidates = rawHtml
        // Citations first, so reference numbers never look like a measurement.
        .replace(/<sup[^>]*>.*?<\/sup>/gi, '')
        // Line breaks become new lines so the latest value can be picked reliably.
        .replace(/<br\s*\/?\s*>/gi, '\n')
        .replace(/<[^>]*>/g, '')
        .split('\n')
        .map((entry) => entry.trim())
        .filter(Boolean);

    // Prefer lines that look like a height value; otherwise fall back to every line.
    const withUnit = candidates.filter((entry) => /\d/.test(entry) && /(cm|m)/i.test(entry));
    const pool = withUnit.length > 0 ? withUnit : candidates;
    const chosen = pool[pool.length - 1];
    if (!chosen) return null;

    // Drop notes such as "(post-timeskip)", then remove all whitespace.
    const compact = chosen
        .replace(/\([^)]*\)/g, '')
        .trim()
        .toLowerCase()
        .replace(/\s/g, '');

    // Values are stored in meters in this dataset.
    const centimeters = /(\d+(?:[.,]\d+)?)cm/.exec(compact);
    if (centimeters) {
        const value = parseFloat(centimeters[1].replace(',', '.'));
        return Number.isFinite(value) ? value / 100 : null;
    }

    const meters = /(\d+(?:[.,]\d+)?)m/.exec(compact);
    if (!meters) return null;

    const value = parseFloat(meters[1].replace(',', '.'));
    return Number.isFinite(value) ? value : null;
}

/**
 * Read the origin from the infobox field `data-source="origin"` (or `"origine"`).
 *
 * Only the text before the first `<br` is kept; citations, tags and parenthesised notes
 * are removed.
 *
 * @param $ - Loaded page.
 * @returns The origin, or null when the field is missing or ends up empty.
 */
function extractOrigin($: cheerio.CheerioAPI): string | null {
    const valueNode = $('[data-source="origin"] .pi-data-value, [data-source="origine"] .pi-data-value').first();
    if (valueNode.length === 0) return null;

    const rawHtml = valueNode.html();
    if (!rawHtml) return null;

    // Citations are removed before splitting; the first segment is the value of interest.
    const [leading] = rawHtml
        .replace(/<sup[^>]*>.*?<\/sup>/gi, '')
        .split('<br');

    const cleaned = leading
        .trim()
        .replace(/<[^>]*>/g, '')
        .trim()
        .replace(/\([^)]*\)/g, '')
        .trim();

    return cleaned || null;
}

/**
 * Read the character's status from the infobox field `data-source="status"`.
 *
 * The field text is lowercased and matched against lowercase keywords, checked in the
 * order alive, dead/deceased, unknown. Any other text is treated as 'Alive'.
 *
 * @param $ - Loaded page.
 * @returns 'Alive', 'Dead' or 'Unknown', or null when the infobox has no status field.
 */
function extractStatus($: cheerio.CheerioAPI): string | null {
    const div = $('[data-source="status"] .pi-data-value');
    if (div.length === 0) return null;

    // The text is lowercased, so every keyword below must be lowercase as well.
    const statusText = div.text().trim().toLowerCase();

    if (statusText.includes('alive')) {
        return 'Alive';
    }
    if (statusText.includes('dead') || statusText.includes('deceased')) {
        return 'Dead';
    }
    if (statusText.includes('unknown')) {
        return 'Unknown';
    }

    // Unrecognized wording defaults to alive.
    return 'Alive';
}


/**
 * Write the characters to `characters.json` inside OUTPUT_DIR as 2-space indented JSON.
 *
 * @param characters - Characters to serialize.
 */
async function saveToJSON(characters: Character[]): Promise<void> {
    const destination = `${OUTPUT_DIR}/characters.json`;
    const serialized = JSON.stringify(characters, null, 2);
    fs.writeFileSync(destination, serialized);
    console.log(`✓ Saved to ${destination}`);
}

/**
 * Write the characters to `characters.csv` inside OUTPUT_DIR.
 *
 * Null entries are skipped, list fields are joined with ", ", haki flags are written as
 * 1/0, a missing bounty is written as 0 and other falsy values as empty cells.
 *
 * @param characters - Characters to export.
 */
async function saveToCSV(characters: Character[]): Promise<void> {
    const destination = `${OUTPUT_DIR}/characters.csv`;
    const columns = [
        { id: 'id', title: 'ID' },
        { id: 'name', title: 'Name' },
        { id: 'gender', title: 'Gender' },
        { id: 'age', title: 'Age' },
        { id: 'height', title: 'Height' },
        { id: 'origin', title: 'Origin' },
        { id: 'status', title: 'Status' },
        { id: 'epithets', title: 'Epithets' },
        { id: 'devilFruitId', title: 'Devil Fruit ID' },
        { id: 'affiliations', title: 'Affiliations' },
        { id: 'bounty', title: 'Bounty' },
        { id: 'hakiObservation', title: 'Haki Observation' },
        { id: 'hakiArmament', title: 'Haki Armament' },
        { id: 'hakiConqueror', title: 'Haki Conqueror' },
        { id: 'firstAppearance', title: 'First Appearance' },
        { id: 'arcId', title: 'Arc ID' },
        { id: 'pictureUrl', title: 'Image URL' },
        { id: 'url', title: 'Fandom URL' }
    ];
    const writer = createObjectCsvWriter({ path: destination, header: columns });

    // Flattens a list field into a single cell.
    const joinList = (values: string[]): string | string[] =>
        Array.isArray(values) ? values.join(', ') : (values || '');
    const asFlag = (value: boolean): number => (value ? 1 : 0);

    const rows = [];
    for (const character of characters) {
        if (character === null) {
            continue;
        }
        rows.push({
            id: character.id || '',
            name: character.name || '',
            gender: character.gender || '',
            age: character.age || '',
            height: character.height || '',
            origin: character.origin || '',
            status: character.status || '',
            epithets: joinList(character.epithets),
            devilFruitId: character.devilFruitId || '',
            affiliations: joinList(character.affiliations),
            bounty: character.bounty ?? 0,
            hakiObservation: asFlag(character.hakiObservation),
            hakiArmament: asFlag(character.hakiArmament),
            hakiConqueror: asFlag(character.hakiConqueror),
            firstAppearance: character.firstAppearance || '',
            arcId: character.arcId || '',
            pictureUrl: character.pictureUrl || '',
            url: character.url || ''
        });
    }

    await writer.writeRecords(rows);
    console.log(`✓ Saved to ${destination}`);
}

/**
 * Fetch one devil fruit page and build its DevilFruit record.
 *
 * The name is the page title (or the identifier with underscores turned into spaces). The
 * type is the first of Paramecia, Zoan, Logia, Smile whose lowercase name appears inside
 * any page category.
 *
 * @param devilFruitUrl - Wiki page slug of the devil fruit.
 * @param devilFruitId - Identifier to store on the record.
 * @returns The record, or null when fetching or parsing fails (the error is logged).
 */
async function fetchDevilFruit(devilFruitUrl: string, devilFruitId: string): Promise<DevilFruit | null> {
    try {
        console.log(`Fetching devil fruit: ${devilFruitUrl}...`);

        const pageResponse = await fetchWithRetry(`${FANDOM_API_BASE}${devilFruitUrl}`);
        const payload = await pageResponse.json();

        // A response without rendered HTML is treated as a failed fetch.
        if (!payload.parse?.text?.['*']) {
            throw new Error('Unable to extract HTML content from API response');
        }

        const name = payload.parse?.title || devilFruitId.replace(/_/g, ' ');

        // Listed in priority order: the first label found in any category wins.
        const typeLabels = ['Paramecia', 'Zoan', 'Logia', 'Smile'];
        let type: string | null = null;
        if (payload.parse?.categories) {
            const lowered: string[] = payload.parse.categories
                .map((cat: { ['*']: string }) => String(cat['*'] || '').toLowerCase());
            const matched = typeLabels.find((label) =>
                lowered.some((category) => category.includes(label.toLowerCase())));
            type = matched ?? null;
        }

        return { id: devilFruitId, name, type, url: devilFruitUrl };
    } catch (error) {
        console.error(`Error fetching devil fruit ${devilFruitUrl}:`, (error as Error).message);
        return null;
    }
}

/**
 * Write the devil fruits to `devil-fruits.json` inside OUTPUT_DIR as 2-space indented JSON.
 *
 * @param devilFruits - Devil fruits to serialize.
 */
async function saveDevilFruitsToJSON(devilFruits: DevilFruit[]): Promise<void> {
    const destination = `${OUTPUT_DIR}/devil-fruits.json`;
    const serialized = JSON.stringify(devilFruits, null, 2);
    fs.writeFileSync(destination, serialized);
    console.log(`✓ Saved to ${destination}`);
}

/**
 * Write the devil fruits to `devil-fruits.csv` inside OUTPUT_DIR.
 *
 * Null entries are skipped and falsy field values are written as empty cells.
 *
 * @param devilFruits - Devil fruits to export.
 */
async function saveDevilFruitsToCSV(devilFruits: DevilFruit[]): Promise<void> {
    const destination = `${OUTPUT_DIR}/devil-fruits.csv`;
    const columns = [
        { id: 'id', title: 'ID' },
        { id: 'name', title: 'Name' },
        { id: 'type', title: 'Type' },
        { id: 'url', title: 'URL' }
    ];
    const writer = createObjectCsvWriter({ path: destination, header: columns });

    const rows: Record<string, string>[] = [];
    for (const fruit of devilFruits) {
        if (fruit === null) {
            continue;
        }
        rows.push({
            id: fruit.id || '',
            name: fruit.name || '',
            type: fruit.type || '',
            url: fruit.url || ''
        });
    }

    await writer.writeRecords(rows);
    console.log(`✓ Saved to ${destination}`);
}

/**
 * Main execution: scrape arcs, characters and devil fruits, then save them.
 *
 * The output format comes from the first command-line argument ("json", "csv" or "all",
 * defaulting to "all"). Characters that fail to load are retried in further rounds; retrying
 * stops once a round recovers nobody or after MAX_FETCH_ROUNDS rounds, so characters that
 * can never be loaded (missing page, no matching arc) do not keep the script running forever.
 */
async function main(): Promise<void> {
    const format = process.argv[2] || 'all'; // json, csv, or all
    // Upper bound on the number of passes over the characters that are still missing.
    const MAX_FETCH_ROUNDS = 5;

    console.log(`\nOne Piece Scraper - Mode: ${format}\n`);

    // Step 1: Scraping Arcs
    console.log('=== Step 1: Scraping Arcs ===\n');
    const arcsList = await fetchAllArcs();

    if (arcsList.length > 0) {
        // Display arcs in table format
        arcsList.forEach((arc) => {
            console.table({
                ID: arc.id,
                Name: arc.name,
                FrenchName: arc.frName || '',
                StartChapter: arc.startChapter,
                EndChapter: arc.endChapter || 'Ongoing',
                URL: arc.url
            });
        });

        console.log(`\n✓ Found ${arcsList.length} arcs\n`);

        if (format === 'json' || format === 'all') {
            await saveArcsToJSON(arcsList);
        }
        if (format === 'csv' || format === 'all') {
            await saveArcsToCSV(arcsList);
        }
    } else {
        console.warn('No arcs found, continuing...\n');
    }

    // Step 2: Scraping Characters
    console.log('=== Step 2: Scraping Characters ===\n');
    const characterList = await fetchAllCharactersUrl();

    if (characterList.length === 0) {
        console.error('No characters found. Exiting.');
        return;
    }

    const characters: Character[] = [];
    const devilFruitUrls = new Set<string>();
    let pendingCharacters: CharacterListItem[] = [...characterList];

    for (let round = 1; pendingCharacters.length > 0; round++) {
        const stillFailing: CharacterListItem[] = [];
        console.log(`\nFetching ${pendingCharacters.length} characters...`);

        // Fetch in batches so at most FETCH_CONCURRENCY requests are in flight at once.
        for (let i = 0; i < pendingCharacters.length; i += FETCH_CONCURRENCY) {
            const batch = pendingCharacters.slice(i, i + FETCH_CONCURRENCY);
            const batchResults = await Promise.all(
                batch.map(async (char) => {
                    const data = await fetchCharacter(char.url, char.name, char.chapter, arcsList);
                    return { char, data };
                })
            );

            for (const { char, data } of batchResults) {
                if (data) {
                    console.table({
                        ID: data.id,
                        Name: data.name,
                        Gender: data.gender,
                        Age: data.age,
                        Status: data.status,
                        Epithets: data.epithets.join(', '),
                        Affiliations: data.affiliations.join(', '),
                        DevilFruitId: data.devilFruitId,
                        DevilFruitUrl: data.devilFruitUrl,
                        HakiObservation: data.hakiObservation ? 'Yes' : 'No',
                        HakiArmament: data.hakiArmament ? 'Yes' : 'No',
                        HakiConqueror: data.hakiConqueror ? 'Yes' : 'No',
                        Height: data.height,
                        Bounty: data.bounty,
                        Origin: data.origin,
                        FirstAppearance: data.firstAppearance,
                        pictureUrl: data.pictureUrl,
                        FandomURL: data.url
                    });

                    if (data.devilFruitUrl) {
                        devilFruitUrls.add(data.devilFruitUrl);
                    }

                    characters.push(data);
                } else {
                    stillFailing.push(char);
                }
            }
        }

        const madeProgress = stillFailing.length < pendingCharacters.length;
        pendingCharacters = stillFailing;
        if (pendingCharacters.length === 0) {
            break;
        }

        // fetchCharacter returns null for permanent problems too, so another round is only
        // worthwhile while rounds keep recovering characters and the cap is not reached.
        if (!madeProgress || round >= MAX_FETCH_ROUNDS) {
            const skippedNames = pendingCharacters.map((char) => char.name).join(', ');
            console.warn(`⚠️  Giving up on ${pendingCharacters.length} characters: ${skippedNames}`);
            break;
        }

        console.log(`⚠️  ${pendingCharacters.length} characters failed. Retrying...`);
    }

    console.log(`\n✓ Scraped ${characters.length} characters\n`);
    console.log(`✓ Found ${devilFruitUrls.size} unique devil fruits\n`);

    // Step 3: Scraping Devil Fruits
    console.log('=== Step 3: Scraping Devil Fruits ===\n');

    if (devilFruitUrls.size === 0) {
        console.warn('No devil fruits found from characters, skipping...\n');
    } else {
        const devilFruits: DevilFruit[] = [];
        const devilFruitUrlArray = Array.from(devilFruitUrls);

        for (let i = 0; i < devilFruitUrlArray.length; i += FETCH_CONCURRENCY) {
            const batch = devilFruitUrlArray.slice(i, i + FETCH_CONCURRENCY);
            const batchResults = await Promise.all(
                batch.map(async (url) => {
                    const data = await fetchDevilFruit(url, normalizeId(url));
                    return { url, data };
                })
            );

            for (const { data } of batchResults) {
                if (data) {
                    console.table({
                        ID: data.id,
                        Name: data.name,
                        Type: data.type,
                        URL: data.url
                    });

                    devilFruits.push(data);
                }
            }
        }

        console.log(`\n✓ Scraped ${devilFruits.length} devil fruits\n`);

        if (format === 'json' || format === 'all') {
            await saveDevilFruitsToJSON(devilFruits);
        }
        if (format === 'csv' || format === 'all') {
            await saveDevilFruitsToCSV(devilFruits);
        }

        // Update characters with normalized devil fruit IDs. Devil fruit records use
        // normalizeId(url) as their id, so the same call yields the matching key.
        characters.forEach(char => {
            if (char.devilFruitUrl) {
                char.devilFruitId = normalizeId(char.devilFruitUrl);
            }
        });
    }

    // Save characters after devil fruit IDs are updated
    if (format === 'json' || format === 'all') {
        await saveToJSON(characters);
    }
    if (format === 'csv' || format === 'all') {
        await saveToCSV(characters);
    }

    console.log('\n✓ Done!\n');
}

// Script entry point: run the scraper and print any rejection that escapes main().
main().catch(console.error);