import * as cheerio from 'cheerio';
import fs from 'fs';
import { createObjectCsvWriter } from 'csv-writer';

const FANDOM_BASE_URL = 'https://onepiece.fandom.com/fr/wiki';
const WIKI_PATH_PREFIX = '/fr/wiki/';
const OUTPUT_DIR = './scraped-data';
const DEVIL_FRUIT_CONCURRENCY = 5;
const CHARACTER_CONCURRENCY = 10;

/** Headers sent with every request to the wiki. */
const REQUEST_HEADERS = {
  'User-Agent': 'Mozilla/5.0 (compatible; MSIE 7.01; Windows NT 5.0)',
};

/** Output formats accepted as the first command-line argument. */
const SUPPORTED_FORMATS = ['json', 'csv', 'sql', 'all'];

/** Header keyword of a navbox sub-table -> devil fruit type stored in the output. */
const DEVIL_FRUIT_TYPES = [
  ['Paramecia', 'Paramecia'],
  ['Zoan', 'Zoan'],
  ['Logia', 'Logia'],
  ['Type Inconnu', 'Unknown'],
];

/** Navbox row headers whose links are kept. */
const DEVIL_FRUIT_CATEGORIES = ['Canon', 'Standards', 'Antiques', 'Mythiques', 'Hors-Série'];

/** Pages linked from the navbox that are not individual devil fruits. */
const NON_FRUIT_PAGES = ['Fruits_du_Démon_Artificiels', 'SMILE'];

/** Category link title -> haki name stored in the output. */
const HAKI_CATEGORIES = [
  ["Catégorie:Utilisateurs du Haki de l'observation", 'Observation'],
  ["Catégorie:Utilisateurs du Haki de l'armement", 'Armament'],
  ['Catégorie:Utilisateurs du Haki des rois', 'Conqueror'],
];

/** Column order shared by the character SQL export. */
const CHARACTER_COLUMNS = [
  'id',
  'name',
  'gender',
  'age',
  'height',
  'origin',
  'devilFruit',
  'affiliations',
  'bounty',
  'haki',
  'firstAppearance',
  'pictureUrl',
];

// Patterns used to clean infobox HTML fragments.
const CITATION_PATTERN = /<sup[^>]*>[\s\S]*?<\/sup>/gi;
const LINE_BREAK_PATTERN = /<br\s*\/?>/i;
const HTML_TAG_PATTERN = /<[^>]*>/g;
const PARENTHETICAL_PATTERN = /\([^)]*\)/g;

// Create output directory
if (!fs.existsSync(OUTPUT_DIR)) {
  fs.mkdirSync(OUTPUT_DIR, { recursive: true });
}

/**
 * Normalize a wiki page path into an identifier: percent-decode it, strip
 * combining accents, drop commas and colons, and lowercase the result.
 *
 * @param {string} str - Page path, possibly percent-encoded.
 * @returns {string} Normalized identifier.
 */
function normalizeId(str) {
  let decoded;
  try {
    decoded = decodeURIComponent(str);
  } catch {
    // A malformed percent-escape must not abort a whole scrape; use the raw path.
    decoded = String(str);
  }
  return decoded
    .normalize('NFD')
    .replace(/[\u0300-\u036f]/g, '')
    .replace(/[,:]/g, '')
    .toLowerCase();
}

/**
 * Request a wiki page without inspecting the response.
 *
 * @param {string} pagePath - Path relative to FANDOM_BASE_URL.
 * @returns {Promise<Response>} The fetch response (redirects are followed).
 */
function fetchWikiPage(pagePath) {
  return fetch(`${FANDOM_BASE_URL}/${pagePath}`, {
    headers: REQUEST_HEADERS,
    redirect: 'follow',
  });
}

/**
 * Download a wiki page and parse it with cheerio.
 *
 * @param {string} pagePath - Path relative to FANDOM_BASE_URL.
 * @returns {Promise<Function>} Cheerio root (`$`) for the page.
 * @throws {Error} When the server answers with a non-2xx status, so that
 *   error pages are never scraped as if they were real content.
 */
async function fetchDocument(pagePath) {
  const response = await fetchWikiPage(pagePath);
  if (!response.ok) {
    throw new Error(`HTTP ${response.status} for ${response.url}`);
  }
  return cheerio.load(await response.text());
}

/**
 * Map a navbox sub-table header to a devil fruit type.
 *
 * @param {string} header - Header text of the sub-table.
 * @returns {string|null} The type, or null when the header names no known type.
 */
function resolveDevilFruitType(header) {
  const entry = DEVIL_FRUIT_TYPES.find(([keyword]) => header.includes(keyword));
  return entry ? entry[1] : null;
}

/**
 * Fetch all devil fruits URLs from One Piece fandom.
 *
 * @returns {Promise<Array<{id: string, name: string, type: string, url: string}>>}
 *   Devil fruits listed in the navbox; an empty array when the page cannot be
 *   fetched or parsed (the error is logged).
 */
async function fetchAllDevilFruitsUrl() {
  try {
    console.log('Fetching devil fruits list...');
    const $ = await fetchDocument('Fruits_du_Démon');
    const devilFruits = [];

    // Find the main navibox table
    $('table.navibox.toccolours').each((mainTableIndex, mainTable) => {
      const mainHeader = $(mainTable)
        .find('th[colspan="3"]')
        .first()
        .find('span')
        .last()
        .text()
        .trim();
      if (mainHeader !== 'Fruits du Démon') return;

      $(mainTable)
        .find('table.collapsible')
        .each((typeTableIndex, typeTable) => {
          const typeHeader = $(typeTable).find('th[colspan="3"]').first().text().trim();
          const type = resolveDevilFruitType(typeHeader);
          if (!type) return;

          $(typeTable)
            .find('tr.navibox-row')
            .each((rowIndex, row) => {
              const categoryHeader = $(row).find('th').text().trim();
              const isKeptCategory = DEVIL_FRUIT_CATEGORIES.some((category) =>
                categoryHeader.includes(category)
              );
              if (!isKeptCategory) return;

              // Find all links in the row
              $(row)
                .find('td .hlist ul li a')
                .each((linkIndex, link) => {
                  const name = $(link).text().trim();
                  const href = $(link).attr('href');
                  if (!name || !href || !href.startsWith(WIKI_PATH_PREFIX)) return;

                  const cleanUrl = href.replace(WIKI_PATH_PREFIX, '');

                  // Skip classification pages and category pages
                  if (
                    cleanUrl.includes('Classification') ||
                    cleanUrl.includes('Catégorie:') ||
                    NON_FRUIT_PAGES.includes(cleanUrl)
                  ) {
                    return;
                  }

                  devilFruits.push({
                    id: normalizeId(cleanUrl),
                    name,
                    type,
                    url: cleanUrl,
                  });
                });
            });
        });
    });

    console.log(`Found ${devilFruits.length} devil fruits.`);
    return devilFruits;
  } catch (error) {
    console.error('Error fetching devil fruits list:', error.message);
    return [];
  }
}

/**
 * Fetch devil fruit data from fandom using provided URL.
 *
 * @param {string} devilFruitUrl - Page path relative to FANDOM_BASE_URL.
 * @param {string} devilFruitId - Identifier to store for the fruit.
 * @param {string} devilFruitName - Name from the list page, used as fallback.
 * @param {string} devilFruitType - Type taken from the list page.
 * @returns {Promise<{id: string, name: string, type: string}|null>}
 *   The fruit record, or null when the page could not be fetched (logged).
 */
async function fetchDevilFruit(devilFruitUrl, devilFruitId, devilFruitName, devilFruitType) {
  try {
    console.log(`Fetching: ${devilFruitName}...`);
    const $ = await fetchDocument(devilFruitUrl);

    // Prefer the page title; fall back to the name from the list page.
    const name = $('h1.mw-page-title-main').text().trim() || devilFruitName;

    return { id: devilFruitId, name, type: devilFruitType };
  } catch (error) {
    console.error(`Error fetching ${devilFruitName}:`, error.message);
    return null;
  }
}

/**
 * Render a value as a SQL string literal.
 *
 * @param {*} value - Value to render.
 * @returns {string} `NULL` for null, undefined and the empty string; otherwise
 *   the value as a single-quoted literal with embedded quotes doubled.
 */
function toSqlLiteral(value) {
  if (value === null || value === undefined || value === '') return 'NULL';
  return `'${String(value).replace(/'/g, "''")}'`;
}

/**
 * Build one `INSERT ... ON CONFLICT(id) DO UPDATE` statement.
 *
 * @param {string} table - Target table name.
 * @param {string[]} columns - Column names; the first one must be `id`.
 * @param {Array<*>} values - Raw values, in the same order as `columns`.
 * @returns {string} The statement, terminated by a blank line.
 */
function buildUpsert(table, columns, values) {
  const assignments = columns
    .slice(1) // `id` is the conflict key and is never updated
    .map((column) => ` ${column} = excluded.${column}`)
    .join(',\n');
  return (
    `INSERT INTO ${table} (${columns.join(', ')}) \n` +
    `VALUES (${values.map(toSqlLiteral).join(', ')}) \n` +
    `ON CONFLICT(id) DO UPDATE SET \n` +
    `${assignments};\n\n`
  );
}

/**
 * Save devil fruits to JSON.
 *
 * @param {Array<object>} devilFruits - Records to serialize.
 * @returns {Promise<void>}
 */
async function saveDevilFruitsToJSON(devilFruits) {
  const filepath = `${OUTPUT_DIR}/devil-fruits.json`;
  fs.writeFileSync(filepath, JSON.stringify(devilFruits, null, 2));
  console.log(`✓ Saved to ${filepath}`);
}

/**
 * Save devil fruits to SQL as one upsert statement per fruit.
 *
 * @param {Array<{id: string, name: string, type: string}>} devilFruits
 */
function saveDevilFruitsToSQL(devilFruits) {
  const filepath = `${OUTPUT_DIR}/devil-fruits.sql`;
  const columns = ['id', 'name', 'type'];
  const sql = devilFruits
    .map((df) => buildUpsert('devilFruit', columns, [df.id, df.name, df.type]))
    .join('');
  fs.writeFileSync(filepath, sql);
  console.log(`✓ Saved to ${filepath}`);
}

/**
 * Fetch all canon characters from One Piece fandom.
 *
 * @returns {Promise<Array<{id: string, name: string, url: string, pictureUrl: (string|undefined)}>>}
 *   Characters found in the wikitable rows; an empty array when the page
 *   cannot be fetched or parsed (the error is logged).
 */
async function fetchAllCharactersUrl() {
  try {
    console.log('Fetching character list...');
    const $ = await fetchDocument('Liste_des_Personnages_Canon');
    const characters = [];

    $('table.wikitable tbody tr').each((index, element) => {
      if (index === 0) return; // Skip header row

      const image = $(element).find('td:nth-child(1) a img');
      const nameLink = $(element).find('td:nth-child(2) a');
      const charLink = nameLink.attr('href');
      if (!charLink) return;

      const cleanUrl = charLink.replace(WIKI_PATH_PREFIX, '');
      characters.push({
        id: normalizeId(cleanUrl),
        name: nameLink.text().trim(),
        url: cleanUrl,
        // `data-src` is read first, `src` only when it is missing.
        pictureUrl: image.attr('data-src') || image.attr('src'),
      });
    });

    console.log(`Found ${characters.length} characters.`);
    return characters;
  } catch (error) {
    console.error('Error fetching character list:', error.message);
    return [];
  }
}

/**
 * Tell whether the page header lists a category link with the given title.
 *
 * @param {Function} $ - Cheerio root of a character page.
 * @param {string} title - Exact `title` attribute of the category link.
 * @returns {boolean}
 */
function hasCategory($, title) {
  return $(`.page-header__categories a[title="${title}"]`).length > 0;
}

/**
 * Fetch character data from fandom using provided URL.
 *
 * @param {string} characterUrl - Page path relative to FANDOM_BASE_URL.
 * @param {string} characterId - Identifier to store for the character.
 * @param {string} characterName - Name from the list page, used as fallback.
 * @param {string} [characterpictureUrl] - Picture URL from the list page.
 * @returns {Promise<object|null>} The character record, or null when the page
 *   could not be fetched (the error is logged).
 */
async function fetchCharacter(characterUrl, characterId, characterName, characterpictureUrl) {
  try {
    console.log(`Fetching: ${characterName}...`);
    const $ = await fetchDocument(characterUrl);

    const name = $('h1.mw-page-title-main').text().trim() || characterName.replace(/_/g, ' ');

    // Gender comes from the page's category links.
    let gender = null;
    if (hasCategory($, 'Catégorie:Personnages Masculins')) {
      gender = 'Male';
    } else if (hasCategory($, 'Catégorie:Personnages Féminins')) {
      gender = 'Female';
    }

    const haki = HAKI_CATEGORIES.filter(([title]) => hasCategory($, title)).map(
      ([, label]) => label
    );

    return {
      id: characterId,
      name,
      gender,
      age: extractAge($),
      height: extractHeight($),
      origin: extractOrigin($),
      devilFruit: await extractDevilFruit($),
      affiliations: extractAffiliations($),
      bounty: extractBounty($),
      haki,
      firstAppearance: extractFirstAppearance($),
      pictureUrl: characterpictureUrl,
    };
  } catch (error) {
    console.error(`Error fetching ${characterName}:`, error.message);
    return null;
  }
}

/**
 * Read the inner HTML of an infobox field with citation superscripts removed.
 *
 * @param {Function} $ - Cheerio root of a page.
 * @param {string} dataSource - Value of the field's `data-source` attribute.
 * @returns {string|null} Cleaned HTML, or null when the field is absent or empty.
 */
function readInfoboxHtml($, dataSource) {
  const html = $(`[data-source="${dataSource}"] .pi-data-value`).html();
  return html ? html.replace(CITATION_PATTERN, '') : null;
}

/**
 * Remove every HTML tag from a fragment and trim the result.
 *
 * @param {string} html - HTML fragment.
 * @returns {string} Text with tags removed.
 */
function stripTags(html) {
  return html.replace(HTML_TAG_PATTERN, '').trim();
}

/**
 * Extract age from infobox: the last `<br>`-separated entry, with
 * parenthesized notes removed, reduced to its digits.
 *
 * @param {Function} $ - Cheerio root of a character page.
 * @returns {string|null} Digits of the age, or null when unavailable.
 */
function extractAge($) {
  const html = readInfoboxHtml($, 'âge');
  if (html === null) return null;

  const lastEntry = html.split(LINE_BREAK_PATTERN).pop();
  const cleanText = stripTags(lastEntry).replace(PARENTHETICAL_PATTERN, '');
  return cleanText.replace(/\D/g, '') || null;
}

/**
 * Extract affiliations from infobox. Link texts are preferred; when the field
 * has no usable link, its text is split on newlines, semicolons and commas.
 *
 * @param {Function} $ - Cheerio root of a character page.
 * @returns {string[]} Affiliation names (possibly empty).
 */
function extractAffiliations($) {
  const div = $('[data-source="affiliation"] .pi-data-value');
  if (div.length === 0) return [];

  const cleanedDiv = div.clone();
  cleanedDiv.find('sup').remove();
  const html = cleanedDiv.html();
  if (!html) return [];

  const linkValues = cleanedDiv
    .find('a')
    .map((i, el) => $(el).text().trim())
    .get()
    .filter(Boolean); // links with no text would yield empty names
  if (linkValues.length > 0) return linkValues;

  // Fallback to parsing text
  return stripTags(html)
    .split(/\s*\n\s*|\s*;\s*|\s*,\s*/)
    .filter(Boolean);
}

/**
 * Extract devil fruit from infobox. The linked page is requested so that wiki
 * redirects are followed, and the identifier is derived from the final URL.
 *
 * @param {Function} $ - Cheerio root of a character page.
 * @returns {Promise<string|null>} Normalized devil fruit id, or null when the
 *   infobox has no wiki link for it.
 */
async function extractDevilFruit($) {
  const href = $('[data-source="dfnom"] .pi-data-value a').first().attr('href');
  if (!href || !href.startsWith(WIKI_PATH_PREFIX)) return null;

  const cleanUrl = href.replace(WIKI_PATH_PREFIX, '');

  try {
    // With redirect: 'follow', response.url is the URL after any redirects.
    const response = await fetchWikiPage(cleanUrl);
    const finalPath = new URL(response.url).pathname.replace(WIKI_PATH_PREFIX, '');

    // Only the final URL is needed. Best effort: discard the body instead of
    // leaving it unread; a failure to cancel does not affect the result.
    response.body?.cancel().catch(() => {});

    if (finalPath) return normalizeId(finalPath);
  } catch (error) {
    console.error(`Error fetching devil fruit page: ${error.message}`);
  }

  // Fallback to the original href
  return normalizeId(cleanUrl);
}

/**
 * Extract bounty from infobox: the first `<br>`-separated entry with tags,
 * whitespace and dots removed.
 *
 * @param {Function} $ - Cheerio root of a character page.
 * @returns {string|null} Bounty text, or null when unavailable.
 */
function extractBounty($) {
  const html = readInfoboxHtml($, 'prime');
  if (html === null) return null;

  const firstValue = html.split(LINE_BREAK_PATTERN)[0];
  return stripTags(firstValue).replace(/[\s.]/g, '') || null;
}

/**
 * Extract height from infobox, using the last `<br>`-separated entry.
 * A value containing "cm" is reduced to its digits; otherwise a value
 * containing "m" is split on "m" and rejoined with "." (e.g. "1m74" -> "1.74").
 *
 * @param {Function} $ - Cheerio root of a character page.
 * @returns {string|null} Height text, or null when unavailable.
 */
function extractHeight($) {
  const html = readInfoboxHtml($, 'taille');
  if (html === null) return null;

  const lastValue = html.split(LINE_BREAK_PATTERN).pop().trim();
  const normalized = stripTags(lastValue)
    .replace(PARENTHETICAL_PATTERN, '')
    .toLowerCase()
    .replace(/\s/g, '');

  // "cm" must be tested before "m" because it contains it.
  if (normalized.includes('cm')) {
    return normalized.replace(/\D/g, '') || null;
  }
  if (normalized.includes('m')) {
    const parts = normalized.split('m').filter(Boolean);
    return parts.length > 0 ? parts.join('.') : null;
  }
  return normalized.replace(/\D/g, '') || null;
}

/**
 * Extract first appearance from infobox: the number following "Chapitre".
 *
 * @param {Function} $ - Cheerio root of a character page.
 * @returns {string|null} Chapter number, or null when unavailable.
 */
function extractFirstAppearance($) {
  const html = readInfoboxHtml($, 'première');
  if (html === null) return null;

  const match = stripTags(html).match(/Chapitre\s+(\d+)/i);
  return match ? match[1] : null;
}

/**
 * Extract origin from infobox: the first `<br>`-separated entry with tags and
 * parenthesized notes removed.
 *
 * @param {Function} $ - Cheerio root of a character page.
 * @returns {string|null} Origin text, or null when unavailable.
 */
function extractOrigin($) {
  const html = readInfoboxHtml($, 'origine');
  if (html === null) return null;

  const firstValue = html.split(LINE_BREAK_PATTERN)[0];
  return stripTags(firstValue).replace(PARENTHETICAL_PATTERN, '').trim() || null;
}

/**
 * Save data to JSON.
 *
 * @param {Array<object>} characters - Records to serialize.
 * @returns {Promise<void>}
 */
async function saveToJSON(characters) {
  const filepath = `${OUTPUT_DIR}/characters.json`;
  fs.writeFileSync(filepath, JSON.stringify(characters, null, 2));
  console.log(`✓ Saved to ${filepath}`);
}

/**
 * Join an array field with ", "; pass any other value through, using an empty
 * string for falsy values.
 *
 * @param {*} value - Array or scalar field value.
 * @returns {*} Joined string, the original value, or ''.
 */
function joinList(value) {
  return Array.isArray(value) ? value.join(', ') : value || '';
}

/**
 * Save data to CSV.
 *
 * @param {Array<object|null>} characters - Records; null entries are skipped.
 * @returns {Promise<void>}
 */
async function saveToCSV(characters) {
  const filepath = `${OUTPUT_DIR}/characters.csv`;
  const csvWriter = createObjectCsvWriter({
    path: filepath,
    header: [
      { id: 'id', title: 'ID' },
      { id: 'name', title: 'Name' },
      { id: 'gender', title: 'Gender' },
      { id: 'age', title: 'Age' },
      { id: 'height', title: 'Height' },
      { id: 'origin', title: 'Origin' },
      { id: 'devilFruit', title: 'Devil Fruit' },
      { id: 'affiliations', title: 'Affiliations' },
      { id: 'bounty', title: 'Bounty' },
      { id: 'haki', title: 'Haki' },
      { id: 'firstAppearance', title: 'First Appearance' },
      { id: 'pictureUrl', title: 'Image URL' },
    ],
  });

  const records = characters
    .filter((c) => c !== null)
    .map((c) => ({
      id: c.id || '',
      name: c.name || '',
      gender: c.gender || '',
      age: c.age || '',
      height: c.height || '',
      origin: c.origin || '',
      devilFruit: c.devilFruit || '',
      affiliations: joinList(c.affiliations),
      bounty: c.bounty || '',
      haki: joinList(c.haki),
      firstAppearance: c.firstAppearance || '',
      pictureUrl: c.pictureUrl || '',
    }));

  await csvWriter.writeRecords(records);
  console.log(`✓ Saved to ${filepath}`);
}

/**
 * Save data to SQL as one upsert statement per character. Affiliations are
 * joined with ", "; haki is stored as a JSON array string, or NULL when empty.
 *
 * @param {Array<object|null>} characters - Records; null entries are skipped.
 */
function saveToSQL(characters) {
  const filepath = `${OUTPUT_DIR}/characters.sql`;

  const sql = characters
    .filter((c) => c !== null)
    .map((c) => {
      const row = {
        ...c,
        affiliations: Array.isArray(c.affiliations) ? c.affiliations.join(', ') : c.affiliations,
        haki: Array.isArray(c.haki) && c.haki.length > 0 ? JSON.stringify(c.haki) : null,
      };
      return buildUpsert(
        'character',
        CHARACTER_COLUMNS,
        CHARACTER_COLUMNS.map((column) => row[column])
      );
    })
    .join('');

  fs.writeFileSync(filepath, sql);
  console.log(`✓ Saved to ${filepath}`);
}

/**
 * Run `task` over `items` in consecutive batches of `batchSize`; items within
 * a batch run concurrently. Falsy results are dropped; every other result is
 * reported through `onResult` and collected.
 *
 * @param {Array<*>} items - Work items.
 * @param {number} batchSize - Number of items processed concurrently.
 * @param {function(*): Promise<*>} task - Async worker for one item.
 * @param {function(*): void} onResult - Called once per kept result.
 * @returns {Promise<Array<*>>} Kept results, in input order.
 */
async function scrapeInBatches(items, batchSize, task, onResult) {
  const collected = [];
  for (let start = 0; start < items.length; start += batchSize) {
    const batch = items.slice(start, start + batchSize);
    const results = await Promise.all(batch.map(task));
    for (const data of results.filter(Boolean)) {
      onResult(data);
      collected.push(data);
    }
  }
  return collected;
}

/**
 * Main execution: scrape devil fruits, then characters, and write the
 * requested output format(s). The format is read from the first command-line
 * argument (json, csv, sql, or all; defaults to all).
 *
 * @returns {Promise<void>}
 */
async function main() {
  const format = process.argv[2] || 'all';
  if (!SUPPORTED_FORMATS.includes(format)) {
    // Fail before scraping: an unknown format would write no file at all.
    console.error(`Unknown format "${format}". Expected one of: ${SUPPORTED_FORMATS.join(', ')}`);
    process.exitCode = 1;
    return;
  }
  const wants = (kind) => format === kind || format === 'all';

  console.log(`\nOne Piece Scraper - Mode: ${format}\n`);

  // Step 1: Scraping Devil Fruits
  console.log('=== Step 1: Scraping Devil Fruits ===\n');
  const devilFruitList = await fetchAllDevilFruitsUrl();

  if (devilFruitList.length === 0) {
    console.warn('No devil fruits found, continuing with characters...\n');
  } else {
    const devilFruits = await scrapeInBatches(
      devilFruitList,
      DEVIL_FRUIT_CONCURRENCY,
      (df) => fetchDevilFruit(df.url, df.id, df.name, df.type),
      (data) => console.table({ ID: data.id, Name: data.name, Type: data.type })
    );

    console.log(`\n✓ Scraped ${devilFruits.length} devil fruits\n`);

    if (wants('json')) await saveDevilFruitsToJSON(devilFruits);
    if (wants('sql')) saveDevilFruitsToSQL(devilFruits);
  }

  // Step 2: Scraping Characters
  console.log('=== Step 2: Scraping Characters ===\n');
  const characterList = await fetchAllCharactersUrl();

  if (characterList.length === 0) {
    console.error('No characters found. Exiting.');
    return;
  }

  const characters = await scrapeInBatches(
    characterList,
    CHARACTER_CONCURRENCY,
    (char) => fetchCharacter(char.url, char.id, char.name, char.pictureUrl),
    (data) =>
      console.table({
        ID: data.id,
        Name: data.name,
        Gender: data.gender,
        Age: data.age,
        Affiliations: data.affiliations.join(', '),
        DevilFruit: data.devilFruit,
        Haki: data.haki.join(', '),
        Height: data.height,
        Bounty: data.bounty,
        Origin: data.origin,
        FirstAppearance: data.firstAppearance,
        pictureUrl: data.pictureUrl,
      })
  );

  console.log(`\n✓ Scraped ${characters.length} characters\n`);

  if (wants('json')) await saveToJSON(characters);
  if (wants('csv')) await saveToCSV(characters);
  if (wants('sql')) saveToSQL(characters);

  console.log('\n✓ Done!\n');
}

main().catch((error) => {
  console.error(error);
  // Signal failure to the calling shell instead of exiting with status 0.
  process.exitCode = 1;
});