// OnePieceDle/scripts/scrape-onepiece.js

import * as cheerio from 'cheerio';
import fs from 'fs';
import { createObjectCsvWriter } from 'csv-writer';

// Base URL of the French One Piece Fandom wiki; page names are appended to it
const FANDOM_BASE_URL = 'https://onepiece.fandom.com/fr/wiki';
// Directory that receives the generated JSON / CSV / SQL files
const OUTPUT_DIR = './scraped-data';
// Number of devil fruit pages requested in parallel per batch
const DEVIL_FRUIT_CONCURRENCY = 5;
// Number of character pages requested in parallel per batch
const CHARACTER_CONCURRENCY = 10;

// Create the output directory when the module loads so the save helpers can write into it
if (!fs.existsSync(OUTPUT_DIR)) {
    fs.mkdirSync(OUTPUT_DIR, { recursive: true });
}

/**
 * Build a stable identifier from a wiki page name.
 *
 * Percent-escapes are decoded, accents (combining diacritical marks) are
 * stripped, commas and colons are removed and the result is lower-cased.
 *
 * @param {string} str - Page name as found in a wiki URL (may be percent-encoded).
 * @returns {string} The normalized identifier.
 */
function normalizeId(str) {
    let decoded;
    try {
        decoded = decodeURIComponent(str);
    } catch (error) {
        if (!(error instanceof URIError)) throw error;
        // A stray '%' (e.g. "100%_Fruit") is not valid percent-encoding;
        // keep the raw text instead of failing the caller's whole scrape.
        decoded = String(str);
    }

    return decoded
        .normalize('NFD')
        .replace(/[\u0300-\u036f]/g, '')
        .replace(/[,:]/g, '')
        .toLowerCase();
}

/**
 * Fetch the list of devil fruits from the "Fruits du Démon" navigation box.
 *
 * Only the navibox whose main header is "Fruits du Démon" is read. Each
 * collapsible sub-table gives the fruit type (Paramecia, Zoan, Logia or
 * Unknown) and only the rows whose header mentions one of the accepted
 * categories are kept.
 *
 * @returns {Promise<Array<{id: string, name: string, type: string, url: string}>>}
 *   One entry per link found; an empty array when the request or parsing fails
 *   (the error is logged).
 */
async function fetchAllDevilFruitsUrl() {
    const wikiPrefix = '/fr/wiki/';
    // Header keyword -> exported type label, tested in this order
    const typeByHeaderKeyword = [
        ['Paramecia', 'Paramecia'],
        ['Zoan', 'Zoan'],
        ['Logia', 'Logia'],
        ['Type Inconnu', 'Unknown'],
    ];
    const acceptedCategories = ['Canon', 'Standards', 'Antiques', 'Mythiques', 'Hors-Série'];

    // hrefs may be percent-encoded, so the accented names below are compared
    // against the decoded page name; malformed escapes fall back to the raw text
    const decodePageName = (raw) => {
        try {
            return decodeURIComponent(raw);
        } catch {
            return raw;
        }
    };
    // Classification, category and overview pages are not individual fruits
    const isSkippedPage = (pageName) =>
        pageName.includes('Classification') ||
        pageName.includes('Catégorie:') ||
        pageName === 'Fruits_du_Démon_Artificiels' ||
        pageName === 'SMILE';

    try {
        const url = `${FANDOM_BASE_URL}/Fruits_du_Démon`;
        console.log('Fetching devil fruits list...');
        const response = await fetch(url, {
            headers: {
                'User-Agent': 'Mozilla/5.0 (compatible;  MSIE 7.01; Windows NT 5.0)',
            },
        });
        // Without this check an HTTP error page would be parsed and silently yield zero fruits
        if (!response.ok) {
            throw new Error(`HTTP ${response.status} ${response.statusText} for ${url}`);
        }

        const $ = cheerio.load(await response.text());
        const devilFruits = [];

        $('table.navibox.toccolours').each((mainTableIndex, mainTable) => {
            const mainHeader = $(mainTable).find('th[colspan="3"]').first().find('span').last().text().trim();
            if (mainHeader !== 'Fruits du Démon') return;

            $(mainTable).find('table.collapsible').each((typeTableIndex, typeTable) => {
                const typeHeader = $(typeTable).find('th[colspan="3"]').first().text().trim();
                const typeEntry = typeByHeaderKeyword.find(([keyword]) => typeHeader.includes(keyword));
                if (!typeEntry) return;
                const type = typeEntry[1];

                $(typeTable).find('tr.navibox-row').each((rowIndex, row) => {
                    const categoryHeader = $(row).find('th').text().trim();
                    if (!acceptedCategories.some((category) => categoryHeader.includes(category))) {
                        return;
                    }

                    $(row).find('td .hlist ul li a').each((linkIndex, link) => {
                        const name = $(link).text().trim();
                        const href = $(link).attr('href');
                        if (!name || !href || !href.startsWith(wikiPrefix)) return;

                        const cleanUrl = href.slice(wikiPrefix.length);
                        if (isSkippedPage(decodePageName(cleanUrl))) return;

                        devilFruits.push({
                            id: normalizeId(cleanUrl),
                            name,
                            type,
                            url: cleanUrl,
                        });
                    });
                });
            });
        });

        console.log(`Found ${devilFruits.length} devil fruits.`);
        return devilFruits;
    } catch (error) {
        console.error('Error fetching devil fruits list:', error.message);
        return [];
    }
}

/**
 * Download a single devil fruit page and build its record.
 *
 * The name comes from the page title when one is present, otherwise from the
 * list entry; the id and type are passed through unchanged from the list.
 *
 * @param {string} devilFruitUrl - Page name appended to the wiki base URL.
 * @param {string} devilFruitId - Identifier to store on the record.
 * @param {string} devilFruitName - Name from the list page, used as fallback.
 * @param {string} devilFruitType - Type from the list page.
 * @returns {Promise<{id: string, name: string, type: string}|null>}
 *   The record, or null when fetching or parsing throws (the error is logged).
 */
async function fetchDevilFruit(devilFruitUrl, devilFruitId, devilFruitName, devilFruitType) {
    try {
        console.log(`Fetching: ${devilFruitName}...`);

        const requestOptions = {
            headers: {
                'User-Agent': 'Mozilla/5.0 (compatible;  MSIE 7.01; Windows NT 5.0)',
            },
        };
        const page = await fetch(`${FANDOM_BASE_URL}/${devilFruitUrl}`, requestOptions);
        const $ = cheerio.load(await page.text());

        // Prefer the title shown on the page itself; fall back to the list name
        const pageTitle = $('h1.mw-page-title-main').text().trim();

        return {
            id: devilFruitId,
            name: pageTitle || devilFruitName,
            type: devilFruitType,
        };
    } catch (err) {
        console.error(`Error fetching ${devilFruitName}:`, err.message);
        return null;
    }
}

/**
 * Write the devil fruit records to `devil-fruits.json` in the output directory.
 *
 * @param {Array<object>} devilFruits - Records to serialize (2-space indented JSON).
 * @returns {Promise<void>}
 */
async function saveDevilFruitsToJSON(devilFruits) {
    const target = [OUTPUT_DIR, 'devil-fruits.json'].join('/');
    const payload = JSON.stringify(devilFruits, null, 2);

    fs.writeFileSync(target, payload);
    console.log(`✓ Saved to ${target}`);
}

/**
 * Write the devil fruit records to `devil-fruits.sql` as upsert statements.
 *
 * Each record becomes one `INSERT ... ON CONFLICT(id) DO UPDATE` statement on
 * the `devilFruit` table. Falsy values are written as NULL; other values are
 * single-quoted with embedded quotes doubled.
 *
 * @param {Array<{id: string, name: string, type: string}>} devilFruits - Records to export.
 * @returns {void}
 */
function saveDevilFruitsToSQL(devilFruits) {
    const filepath = `${OUTPUT_DIR}/devil-fruits.sql`;

    const quote = (value) => {
        if (!value) return 'NULL';
        return `'${String(value).replace(/'/g, "''")}'`;
    };

    // The trailing spaces before the line breaks are part of the emitted file
    const statements = devilFruits.map((fruit) => [
        'INSERT INTO devilFruit (id, name, type) ',
        `VALUES (${quote(fruit.id)}, ${quote(fruit.name)}, ${quote(fruit.type)}) `,
        'ON CONFLICT(id) DO UPDATE SET ',
        '  name = excluded.name,',
        '  type = excluded.type;',
        '',
        '',
    ].join('\n'));

    fs.writeFileSync(filepath, statements.join(''));
    console.log(`✓ Saved to ${filepath}`);
}

/**
 * Fetch the list of canon characters from the "Liste des Personnages Canon" page.
 *
 * Every row of every `table.wikitable` is read: the first cell provides the
 * picture, the second cell the link to the character page. Rows without a
 * link in the second cell are ignored.
 *
 * @returns {Promise<Array<{id: string, name: string, url: string, pictureUrl: (string|undefined)}>>}
 *   One entry per character row; an empty array when the request or parsing
 *   fails (the error is logged).
 */
async function fetchAllCharactersUrl() {
    try {
        const url = `${FANDOM_BASE_URL}/Liste_des_Personnages_Canon`;
        console.log('Fetching character list...');
        const response = await fetch(url, {
            headers: {
                'User-Agent': 'Mozilla/5.0 (compatible;  MSIE 7.01; Windows NT 5.0)',
            },
        });
        // Without this check an HTTP error page would be parsed and silently yield zero characters
        if (!response.ok) {
            throw new Error(`HTTP ${response.status} ${response.statusText} for ${url}`);
        }

        const $ = cheerio.load(await response.text());
        const characters = [];

        $('table.wikitable tbody tr').each((index, row) => {
            if (index === 0) return; // Skip header row

            const picture = $(row).find('td:nth-child(1) a img');
            const nameLink = $(row).find('td:nth-child(2) a');
            const charLink = nameLink.attr('href');
            if (!charLink) return;

            const cleanUrl = charLink.replace('/fr/wiki/', '');
            characters.push({
                id: normalizeId(cleanUrl),
                name: nameLink.text().trim(),
                url: cleanUrl,
                // `data-src` is tried first, then the plain `src`
                pictureUrl: picture.attr('data-src') || picture.attr('src'),
            });
        });

        console.log(`Found ${characters.length} characters.`);
        return characters;
    } catch (error) {
        console.error('Error fetching character list:', error.message);
        return [];
    }
}

/**
 * Fetch one character page and extract its profile.
 *
 * Gender and haki come from the page-header category links; the remaining
 * fields are read from the infobox by the `extract*` helpers. The picture URL
 * is passed through unchanged from the list page.
 *
 * @param {string} characterUrl - Page name appended to the wiki base URL.
 * @param {string} characterId - Identifier to store on the record.
 * @param {string} characterName - Name from the list page, used for logging and as fallback.
 * @param {string} [characterpictureUrl] - Picture URL found on the list page.
 * @returns {Promise<object|null>} The character record, or null when the page
 *   cannot be fetched (including HTTP error statuses) or parsed; the error is logged.
 */
async function fetchCharacter(characterUrl, characterId, characterName, characterpictureUrl) {
    try {
        console.log(`Fetching: ${characterName}...`);

        const pageUrl = `${FANDOM_BASE_URL}/${characterUrl}`;
        const response = await fetch(pageUrl, {
            headers: {
                'User-Agent': 'Mozilla/5.0 (compatible;  MSIE 7.01; Windows NT 5.0)',
            },
        });
        // An error page would otherwise produce a record full of nulls that
        // later overwrites good data through the SQL upsert
        if (!response.ok) {
            throw new Error(`HTTP ${response.status} ${response.statusText} for ${pageUrl}`);
        }

        const $ = cheerio.load(await response.text());

        // True when the page header links to the given category
        const hasCategory = (category) =>
            $(`.page-header__categories a[title="Catégorie:${category}"]`).length > 0;

        let gender = null;
        if (hasCategory('Personnages Masculins')) {
            gender = 'Male';
        } else if (hasCategory('Personnages Féminins')) {
            gender = 'Female';
        }

        // Category name -> exported haki label, kept in this output order
        const hakiCategories = [
            ["Utilisateurs du Haki de l'observation", 'Observation'],
            ["Utilisateurs du Haki de l'armement", 'Armament'],
            ['Utilisateurs du Haki des rois', 'Conqueror'],
        ];
        const haki = hakiCategories
            .filter(([category]) => hasCategory(category))
            .map(([, label]) => label);

        return {
            id: characterId,
            // Page title first; otherwise the list name with underscores as spaces
            name: $('h1.mw-page-title-main').text().trim() || characterName.replace(/_/g, ' '),
            gender,
            age: extractAge($),
            height: extractHeight($),
            origin: extractOrigin($),
            devilFruit: await extractDevilFruit($),
            affiliations: extractAffiliations($),
            bounty: extractBounty($),
            haki,
            firstAppearance: extractFirstAppearance($),
            pictureUrl: characterpictureUrl,
        };
    } catch (error) {
        console.error(`Error fetching ${characterName}:`, error.message);
        return null;
    }
}


/**
 * Read the age from the infobox.
 *
 * Citations (`<sup>` blocks) are dropped, only the text after the last `<br`
 * is kept, parenthesised notes are removed and every remaining digit is
 * concatenated.
 *
 * @param {Function} $ - Cheerio root function of the character page.
 * @returns {string|null} The digits of the last listed age, or null when the
 *   field is missing, empty or contains no digit.
 */
function extractAge($) {
    const valueNode = $('[data-source="âge"] .pi-data-value');
    const rawHtml = valueNode.length === 0 ? null : valueNode.html();
    if (!rawHtml) return null;

    // The last <br-separated entry is the one that gets exported
    const latestEntry = rawHtml
        .replace(/<sup[^>]*>.*?<\/sup>/gi, '')
        .split('<br')
        .pop();

    const digits = latestEntry
        .replace(/<[^>]*>/g, '')
        .trim()
        .replace(/\([^)]*\)/g, '')
        .replace(/\D/g, '');

    return digits === '' ? null : digits;
}

/**
 * Read the affiliations from the infobox.
 *
 * Citations are removed first. When the field contains links, the trimmed
 * text of each link is returned; otherwise the tag-stripped text is split on
 * line breaks, semicolons and commas.
 *
 * @param {Function} $ - Cheerio root function of the character page.
 * @returns {string[]} The affiliations, or an empty array when the field is
 *   missing or empty.
 */
function extractAffiliations($) {
    const valueNode = $('[data-source="affiliation"] .pi-data-value');
    if (valueNode.length === 0) return [];

    // Work on a copy so removing citations does not alter the parsed page
    const withoutCitations = valueNode.clone();
    withoutCitations.find('sup').remove();

    const markup = withoutCitations.html();
    if (!markup) return [];

    const linkTexts = withoutCitations
        .find('a')
        .map((_, anchor) => $(anchor).text().trim())
        .get();
    if (linkTexts.length > 0) return linkTexts;

    // No links: split the plain text on the separators used in the infobox
    return markup
        .replace(/<[^>]*>/g, '')
        .trim()
        .split(/\s*\n\s*|\s*;\s*|\s*,\s*/)
        .filter(Boolean);
}

/**
 * Resolve the identifier of the character's devil fruit.
 *
 * The first link of the infobox "dfnom" field is requested with redirects
 * followed, and the identifier is derived from the final URL so that links
 * pointing at redirect pages resolve to the target page. When the request
 * fails, the identifier is derived from the original link instead.
 *
 * @param {Function} $ - Cheerio root function of the character page.
 * @returns {Promise<string|null>} The normalized devil fruit id, or null when
 *   the infobox has no devil fruit link under `/fr/wiki/`.
 */
async function extractDevilFruit($) {
    const wikiPrefix = '/fr/wiki/';

    const link = $('[data-source="dfnom"] .pi-data-value a').first();
    if (link.length === 0) return null;

    const href = link.attr('href');
    if (!href || !href.startsWith(wikiPrefix)) return null;

    const cleanUrl = href.replace(wikiPrefix, '');

    try {
        const response = await fetch(`${FANDOM_BASE_URL}/${cleanUrl}`, {
            headers: {
                'User-Agent': 'Mozilla/5.0 (compatible; MSIE 7.01; Windows NT 5.0)',
            },
            redirect: 'follow',
        });

        // With `redirect: 'follow'` the response is always the final one (its
        // status is never 301/302), so `response.url` is the resolved address
        // whether or not a redirect happened.
        const finalPath = new URL(response.url).pathname.replace(wikiPrefix, '');

        // Only the URL is needed: discard the unread body (best effort) so the
        // connection is released instead of waiting for garbage collection.
        await response.body?.cancel().catch(() => {});

        if (finalPath) {
            return normalizeId(finalPath);
        }
    } catch (error) {
        console.error(`Error fetching devil fruit page: ${error.message}`);
    }

    // Fallback to the original href
    return normalizeId(cleanUrl);
}

/**
 * Read the bounty from the infobox.
 *
 * Citations are dropped, only the text before the first `<br` is kept, tags
 * are stripped and all whitespace and dots are removed.
 *
 * @param {Function} $ - Cheerio root function of the character page.
 * @returns {string|null} The first listed bounty without separators, or null
 *   when the field is missing or empty.
 */
function extractBounty($) {
    const valueNode = $('[data-source="prime"] .pi-data-value');
    if (valueNode.length === 0) return null;

    const markup = valueNode.html();
    if (!markup) return null;

    // The first <br-separated entry is the one that gets exported
    const [firstEntry] = markup
        .replace(/<sup[^>]*>.*?<\/sup>/gi, '')
        .split('<br');

    const compact = firstEntry
        .trim()
        .replace(/<[^>]*>/g, '')
        .trim()
        .replace(/[\s.]/g, '');

    return compact === '' ? null : compact;
}

/**
 * Read the height from the infobox.
 *
 * Citations are dropped, only the entry after the last line break is kept and
 * parenthesised notes are removed. The value is then normalized:
 * - a value containing "cm" yields its digits (e.g. "174 cm" -> "174");
 * - a value containing "m" yields its parts joined with a dot
 *   (e.g. "1 m 74" -> "1.74");
 * - anything else yields its digits.
 *
 * @param {Function} $ - Cheerio root function of the character page.
 * @returns {string|null} The normalized height, or null when the field is
 *   missing, empty or has no usable value.
 */
function extractHeight($) {
    const valueNode = $('[data-source="taille"] .pi-data-value');
    if (valueNode.length === 0) return null;

    const markup = valueNode.html();
    if (!markup) return null;

    // Split on any serialization of a line break (<br>, <br/>, <br />, with
    // attributes); matching only the literal "<br>" would glue entries
    // together, e.g. "170 cm<br />174 cm" -> "170174".
    const latestEntry = markup
        .replace(/<sup[^>]*>.*?<\/sup>/gi, '')
        .split(/<br\b[^>]*>/i)
        .pop();

    const normalized = latestEntry
        .replace(/<[^>]*>/g, '')
        .trim()
        .replace(/\([^)]*\)/g, '')
        .toLowerCase()
        .replace(/\s/g, '');

    // "cm" must be tested before "m" because it contains an "m" too
    if (normalized.includes('cm')) {
        return normalized.replace(/\D/g, '') || null;
    }

    if (normalized.includes('m')) {
        const parts = normalized.split('m').filter(Boolean);
        return parts.length > 0 ? parts.join('.') : null;
    }

    return normalized.replace(/\D/g, '') || null;
}

/**
 * Read the chapter of first appearance from the infobox.
 *
 * Citations and tags are stripped, then the number following the word
 * "Chapitre" (case-insensitive) is returned.
 *
 * @param {Function} $ - Cheerio root function of the character page.
 * @returns {string|null} The chapter number as a string, or null when the
 *   field is missing, empty or mentions no chapter.
 */
function extractFirstAppearance($) {
    const valueNode = $('[data-source="première"] .pi-data-value');
    if (valueNode.length === 0) return null;

    const markup = valueNode.html();
    if (!markup) return null;

    const plainText = markup
        .replace(/<sup[^>]*>.*?<\/sup>/gi, '')
        .replace(/<[^>]*>/g, '')
        .trim();

    const chapter = /Chapitre\s+(\d+)/i.exec(plainText);
    return chapter === null ? null : chapter[1];
}

/**
 * Read the origin from the infobox.
 *
 * Citations are dropped, only the text before the first `<br` is kept, tags
 * are stripped and parenthesised notes are removed.
 *
 * @param {Function} $ - Cheerio root function of the character page.
 * @returns {string|null} The first listed origin, or null when the field is
 *   missing or empty.
 */
function extractOrigin($) {
    const valueNode = $('[data-source="origine"] .pi-data-value');
    if (valueNode.length === 0) return null;

    const markup = valueNode.html();
    if (!markup) return null;

    // The first <br-separated entry is the one that gets exported
    const [firstEntry] = markup
        .replace(/<sup[^>]*>.*?<\/sup>/gi, '')
        .split('<br');

    const origin = firstEntry
        .trim()
        .replace(/<[^>]*>/g, '')
        .trim()
        .replace(/\([^)]*\)/g, '')
        .trim();

    return origin === '' ? null : origin;
}


/**
 * Write the character records to `characters.json` in the output directory.
 *
 * @param {Array<object>} characters - Records to serialize (2-space indented JSON).
 * @returns {Promise<void>}
 */
async function saveToJSON(characters) {
    const target = [OUTPUT_DIR, 'characters.json'].join('/');
    const payload = JSON.stringify(characters, null, 2);

    fs.writeFileSync(target, payload);
    console.log(`✓ Saved to ${target}`);
}

/**
 * Write the character records to `characters.csv` in the output directory.
 *
 * Null entries are skipped. Falsy fields become empty strings and the
 * `affiliations` / `haki` arrays are joined with ", ".
 *
 * @param {Array<object|null>} characters - Records to export.
 * @returns {Promise<void>} Resolves once the CSV writer has written all rows.
 */
async function saveToCSV(characters) {
    const filepath = `${OUTPUT_DIR}/characters.csv`;

    // [record key, column title] in output order
    const columns = [
        ['id', 'ID'],
        ['name', 'Name'],
        ['gender', 'Gender'],
        ['age', 'Age'],
        ['height', 'Height'],
        ['origin', 'Origin'],
        ['devilFruit', 'Devil Fruit'],
        ['affiliations', 'Affiliations'],
        ['bounty', 'Bounty'],
        ['haki', 'Haki'],
        ['firstAppearance', 'First Appearance'],
        ['pictureUrl', 'Image URL'],
    ];
    const writer = createObjectCsvWriter({
        path: filepath,
        header: columns.map(([id, title]) => ({ id, title })),
    });

    const textOf = (value) => value || '';
    const listOf = (value) => (Array.isArray(value) ? value.join(', ') : textOf(value));

    const rows = [];
    for (const character of characters) {
        if (character === null) continue;
        rows.push({
            id: textOf(character.id),
            name: textOf(character.name),
            gender: textOf(character.gender),
            age: textOf(character.age),
            height: textOf(character.height),
            origin: textOf(character.origin),
            devilFruit: textOf(character.devilFruit),
            affiliations: listOf(character.affiliations),
            bounty: textOf(character.bounty),
            haki: listOf(character.haki),
            firstAppearance: textOf(character.firstAppearance),
            pictureUrl: textOf(character.pictureUrl),
        });
    }

    await writer.writeRecords(rows);
    console.log(`✓ Saved to ${filepath}`);
}

/**
 * Write the character records to `characters.sql` as upsert statements.
 *
 * Null entries are skipped. Each record becomes one
 * `INSERT ... ON CONFLICT(id) DO UPDATE` statement on the `character` table.
 * Falsy values are written as NULL; other values are single-quoted with
 * embedded quotes doubled. `affiliations` arrays are joined with ", " and a
 * non-empty `haki` array is stored as a JSON string.
 *
 * @param {Array<object|null>} characters - Records to export.
 * @returns {void}
 */
function saveToSQL(characters) {
    const filepath = `${OUTPUT_DIR}/characters.sql`;
    const columns = [
        'id', 'name', 'gender', 'age', 'height', 'origin', 'devilFruit',
        'affiliations', 'bounty', 'haki', 'firstAppearance', 'pictureUrl',
    ];

    const quote = (value) => {
        if (!value) return 'NULL';
        return `'${String(value).replace(/'/g, "''")}'`;
    };

    // Every column except the conflict key is refreshed on conflict
    const updateClause = columns
        .slice(1)
        .map((column) => `  ${column} = excluded.${column}`)
        .join(',\n');

    const statements = [];
    for (const character of characters) {
        if (character === null) continue;

        const row = {
            ...character,
            affiliations: Array.isArray(character.affiliations)
                ? character.affiliations.join(', ')
                : character.affiliations,
            haki: Array.isArray(character.haki) && character.haki.length > 0
                ? JSON.stringify(character.haki)
                : null,
        };
        const values = columns.map((column) => quote(row[column])).join(', ');

        // The trailing spaces before the line breaks are part of the emitted file
        statements.push(
            `INSERT INTO character (${columns.join(', ')}) \n` +
            `VALUES (${values}) \n` +
            `ON CONFLICT(id) DO UPDATE SET \n` +
            `${updateClause};\n\n`
        );
    }

    fs.writeFileSync(filepath, statements.join(''));
    console.log(`✓ Saved to ${filepath}`);
}

/**
 * Main execution: scrape devil fruits, then characters, and save the results.
 *
 * The output format is read from the first command-line argument: `json`,
 * `csv`, `sql` or `all` (default). Devil fruits are written as JSON and/or
 * SQL only; characters are written in every requested format. An unknown
 * format is rejected before any request is made and the process exit code is
 * set to 1.
 *
 * @returns {Promise<void>}
 */
async function main() {
    const supportedFormats = ['json', 'csv', 'sql', 'all'];
    const format = process.argv[2] || 'all';

    // Fail fast: otherwise the whole scrape would run and nothing would be saved
    if (!supportedFormats.includes(format)) {
        console.error(`Unknown format "${format}". Expected one of: ${supportedFormats.join(', ')}.`);
        process.exitCode = 1;
        return;
    }
    const wants = (kind) => format === kind || format === 'all';

    // Fetch `items` in consecutive batches of `batchSize` parallel requests,
    // print each successful result as a table and collect it; failed (null)
    // results are dropped.
    const scrapeInBatches = async (items, batchSize, fetchOne, describe) => {
        const scraped = [];
        for (let start = 0; start < items.length; start += batchSize) {
            const batch = items.slice(start, start + batchSize);
            const results = await Promise.all(batch.map((item) => fetchOne(item)));
            for (const data of results) {
                if (!data) continue;
                console.table(describe(data));
                scraped.push(data);
            }
        }
        return scraped;
    };

    console.log(`\nOne Piece Scraper - Mode: ${format}\n`);

    // Step 1: Scraping Devil Fruits
    console.log('=== Step 1: Scraping Devil Fruits ===\n');
    const devilFruitList = await fetchAllDevilFruitsUrl();

    if (devilFruitList.length === 0) {
        console.warn('No devil fruits found, continuing with characters...\n');
    } else {
        const devilFruits = await scrapeInBatches(
            devilFruitList,
            DEVIL_FRUIT_CONCURRENCY,
            (df) => fetchDevilFruit(df.url, df.id, df.name, df.type),
            (data) => ({
                ID: data.id,
                Name: data.name,
                Type: data.type,
            })
        );

        console.log(`\n✓ Scraped ${devilFruits.length} devil fruits\n`);

        if (wants('json')) {
            await saveDevilFruitsToJSON(devilFruits);
        }
        if (wants('sql')) {
            saveDevilFruitsToSQL(devilFruits);
        }
    }

    // Step 2: Scraping Characters
    console.log('=== Step 2: Scraping Characters ===\n');
    const characterList = await fetchAllCharactersUrl();

    if (characterList.length === 0) {
        console.error('No characters found. Exiting.');
        return;
    }

    const characters = await scrapeInBatches(
        characterList,
        CHARACTER_CONCURRENCY,
        (char) => fetchCharacter(char.url, char.id, char.name, char.pictureUrl),
        (data) => ({
            ID: data.id,
            Name: data.name,
            Gender: data.gender,
            Age: data.age,
            Affiliations: data.affiliations.join(', '),
            DevilFruit: data.devilFruit,
            Haki: data.haki.join(', '),
            Height: data.height,
            Bounty: data.bounty,
            Origin: data.origin,
            FirstAppearance: data.firstAppearance,
            pictureUrl: data.pictureUrl,
        })
    );

    console.log(`\n✓ Scraped ${characters.length} characters\n`);

    if (wants('json')) {
        await saveToJSON(characters);
    }
    if (wants('csv')) {
        await saveToCSV(characters);
    }
    if (wants('sql')) {
        saveToSQL(characters);
    }

    console.log('\n✓ Done!\n');
}

// Run the scraper; report an unexpected failure and signal it through the
// process exit code so shells and CI do not treat a crashed run as a success
main().catch((error) => {
    console.error(error);
    process.exitCode = 1;
});