import * as cheerio from 'cheerio';
import fs from 'fs';
import https from 'https';
import { createObjectCsvWriter } from 'csv-writer';

const FANDOM_BASE_URL = 'https://onepiece.fandom.com/fr/wiki';
const WIKI_PATH_PREFIX = '/fr/wiki/';
const OUTPUT_DIR = './scraped-data';
const MAX_RETRIES = 0; // Set to 0 to disable retries, can be increased if needed
const INITIAL_RETRY_DELAY = 1000;
// Upper bound on full passes over the character list; without it a page that
// fails permanently (e.g. a 404) would make the scraper loop forever.
const MAX_CHARACTER_PASSES = 5;
const CHARACTER_RETRY_PAUSE_MS = 1000;
const VALID_FORMATS = ['json', 'csv', 'all'];

// Matches e.g. "Arc Ville d'Orange(Ch.8 à 21)[T.1 à 3]".
// Group 1: arc name, group 2: first chapter, group 3: last chapter (optional).
// NOTE(review): the trailing `(?:...)` matches ANY three characters; it was
// presumably meant for a literal "..." placeholder on ongoing arcs - confirm.
const ARC_LINK_PATTERN = /^(.*?Arc.*?)\s*\(Ch\.(\d+)(?:\s*à\s*(?:(\d+)|(?:...)))?\)/;

// Keep same HTTP session like a normal browser - maintain connection pool and
// allow cookie persistence.
// NOTE(review): Node's built-in fetch is understood to ignore the `agent`
// option (it takes a `dispatcher` instead); the agent is still passed so the
// code keeps working with fetch implementations that honour it - confirm.
const httpsAgent = new https.Agent({
  keepAlive: true,
  keepAliveMsecs: 1000,
  maxFreeSockets: 10,
  maxSockets: 50,
  timeout: 30000,
});

// Store cookies across requests (simulate browser behavior).
// Key: cookie name, value: the raw Set-Cookie string.
const cookies = new Map();

// Devil fruit href -> resolved { devilFruitId, devilFruitUrl }, so the same
// fruit page is not re-fetched for every character that links to it.
const devilFruitResolutionCache = new Map();

/**
 * Resolve after the given number of milliseconds.
 * @param {number} ms
 * @returns {Promise<void>}
 */
function sleep(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

/**
 * Build the Cookie request header from the stored cookies.
 * @returns {string} "name=value; name2=value2", or '' when no cookie is stored
 */
function getCookieHeader() {
  return Array.from(cookies.values(), (cookie) => cookie.split(';')[0]).join('; ');
}

/**
 * Remember the cookies of a response, keyed by cookie name.
 * @param {string|string[]|null|undefined} setCookieHeader one or several raw Set-Cookie values
 */
function saveCookies(setCookieHeader) {
  if (!setCookieHeader) return;

  const cookieList = Array.isArray(setCookieHeader) ? setCookieHeader : [setCookieHeader];
  for (const cookie of cookieList) {
    const [nameValue] = cookie.split(';');
    const name = nameValue.split('=')[0].trim();
    if (name) cookies.set(name, cookie.trim());
  }
}

/**
 * Read the Set-Cookie header(s) of a response.
 * Prefers `getSetCookie()`, which returns one entry per cookie, and falls back
 * to the single combined string on runtimes that do not provide it.
 * @param {Response} response
 * @returns {string[]|string|null}
 */
function readSetCookie(response) {
  const { headers } = response;
  if (typeof headers.getSetCookie === 'function') {
    return headers.getSetCookie();
  }
  return headers.get('set-cookie');
}

// Create output directory (recursive mkdir is a no-op when it already exists)
fs.mkdirSync(OUTPUT_DIR, { recursive: true });

/**
 * Fetch a URL, retrying with exponential backoff on HTTP or network failure.
 * @param {string} url
 * @param {object} [options] extra fetch options; `options.headers` are merged over the defaults
 * @param {number} [retries] number of attempts already made (internal)
 * @returns {Promise<Response>} a response whose `ok` flag is true
 * @throws {Error} the last HTTP/network error once MAX_RETRIES is exhausted
 */
async function fetchWithRetry(url, options = {}, retries = 0) {
  const headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:150.0) Firefox/150.0',
    'Accept-Language': 'en-US,en;q=0.9',
    'Accept-Encoding': 'gzip, deflate, br',
    'Connection': 'keep-alive',
    ...options.headers,
  };

  // Add cookies from previous requests
  const cookieHeader = getCookieHeader();
  if (cookieHeader) {
    headers['Cookie'] = cookieHeader;
  }

  let failure;
  let reason;
  try {
    // `headers` goes last so the merged headers are not replaced by a raw
    // `options.headers` object.
    const response = await fetch(url, { agent: httpsAgent, ...options, headers });

    // Save cookies from response
    saveCookies(readSetCookie(response));

    // Status 200-299
    if (response.ok) {
      return response;
    }

    failure = new Error(`HTTP ${response.status}: ${response.statusText}`);
    reason = `HTTP ${response.status} for ${url}`;
  } catch (error) {
    failure = error;
    reason = `Network error: ${error.message}`;
  }

  if (retries >= MAX_RETRIES) {
    throw failure;
  }

  const delay = INITIAL_RETRY_DELAY * 2 ** retries;
  console.log(`⚠️ ${reason}, retrying in ${delay}ms...`);
  await sleep(delay);
  return fetchWithRetry(url, options, retries + 1);
}

/**
 * Decode percent-escapes, returning the input unchanged when it is malformed.
 * @param {string} str
 * @returns {string}
 */
function safeDecodeURIComponent(str) {
  try {
    return decodeURIComponent(str);
  } catch {
    // A stray "%" must not abort a whole listing; keep the raw text instead.
    return str;
  }
}

/**
 * Normalize a string into an ID: decode URI components, apply NFD
 * normalization, drop `, : . ( )`, replace whitespace runs with underscores
 * and lowercase the result.
 * @param {string} str
 * @returns {string}
 */
function normalizeId(str) {
  return safeDecodeURIComponent(str)
    .normalize('NFD')
    .replace(/[,:.\(\)]/g, '')
    .replace(/\s+/g, '_')
    .toLowerCase();
}

/**
 * Strip the wiki path prefix from an href or pathname.
 * @param {string} path
 * @returns {string}
 */
function toWikiPath(path) {
  return path.replace(WIKI_PATH_PREFIX, '');
}

/**
 * Wiki path of the page a response finally landed on (after redirects).
 * @param {Response} response
 * @param {string} fallback value returned when the final URL is unusable
 * @returns {string}
 */
function resolveFinalWikiPath(response, fallback) {
  try {
    return toWikiPath(new URL(response.url).pathname) || fallback;
  } catch {
    return fallback;
  }
}

/**
 * Fetch all arcs from One Piece fandom
 * @returns {Promise<Array<{id: string, name: string, startChapter: number, endChapter: number|null, url: string}>>}
 *   the arcs found; an empty array when the page could not be fetched or parsed
 */
async function fetchAllArcs() {
  try {
    const url = `${FANDOM_BASE_URL}/Chapitres_et_Tomes`;
    console.log('Fetching arcs list...');

    const response = await fetchWithRetry(url);
    const data = await response.text();
    const $ = cheerio.load(data);
    const arcs = [];

    // Find all arc links in the table
    $('table.wikitable td a').each((index, element) => {
      const text = $(element).text().trim();
      const href = $(element).attr('href');

      // An arc link has a target and mentions both "Arc" and chapter info
      if (!href || !text.includes('Arc') || !text.includes('Ch.')) return;

      console.log(`Processing arc link: ${text} (${href})`);
      const nameMatch = text.match(ARC_LINK_PATTERN);
      if (!nameMatch) return;

      const wikiPath = toWikiPath(href);
      arcs.push({
        // Generate arc ID by normalizing the url, without the "arc_" prefix
        id: normalizeId(wikiPath).replace(/^arc_/i, ''),
        // Remove "Arc " from the name
        name: nameMatch[1].trim().replace(/^Arc\s+/i, ''),
        startChapter: Number.parseInt(nameMatch[2], 10),
        // No end chapter captured: the arc has no known last chapter
        endChapter: nameMatch[3] ? Number.parseInt(nameMatch[3], 10) : null,
        url: wikiPath,
      });
    });

    console.log(`Found ${arcs.length} arcs.`);
    return arcs;
  } catch (error) {
    console.error('Error fetching arcs list:', error.message);
    return [];
  }
}

/**
 * Write a value as pretty-printed JSON into the output directory.
 * @param {string} filename file name inside OUTPUT_DIR
 * @param {*} data JSON-serializable value
 */
function writeJsonFile(filename, data) {
  const filepath = `${OUTPUT_DIR}/${filename}`;
  fs.writeFileSync(filepath, JSON.stringify(data, null, 2));
  console.log(`✓ Saved to ${filepath}`);
}

/**
 * Write records as CSV into the output directory.
 * @param {string} filename file name inside OUTPUT_DIR
 * @param {Array<{id: string, title: string}>} header column definitions
 * @param {object[]} records one object per row
 */
async function writeCsvFile(filename, header, records) {
  const filepath = `${OUTPUT_DIR}/${filename}`;
  const csvWriter = createObjectCsvWriter({ path: filepath, header });
  await csvWriter.writeRecords(records);
  console.log(`✓ Saved to ${filepath}`);
}

/**
 * Save arcs to JSON
 * @param {object[]} arcs
 */
async function saveArcsToJSON(arcs) {
  writeJsonFile('arcs.json', arcs);
}

/**
 * Save arcs to CSV
 * @param {Array<object|null>} arcs null entries are skipped
 */
async function saveArcsToCSV(arcs) {
  const header = [
    { id: 'id', title: 'ID' },
    { id: 'name', title: 'Name' },
    { id: 'startChapter', title: 'Start Chapter' },
    { id: 'endChapter', title: 'End Chapter' },
    { id: 'url', title: 'URL' },
  ];

  // `??` rather than `||` so a legitimate 0 is not blanked out
  const records = arcs
    .filter((arc) => arc !== null)
    .map((arc) => ({
      id: arc.id ?? '',
      name: arc.name ?? '',
      startChapter: arc.startChapter ?? '',
      endChapter: arc.endChapter ?? '',
      url: arc.url ?? '',
    }));

  await writeCsvFile('arcs.csv', header, records);
}

/**
 * Fetch the list of all canon characters from One Piece fandom
 * @returns {Promise<Array<{name: string, url: string, pictureUrl: string|undefined, chapter: string}>>}
 *   one entry per table row that has both a chapter number and a link;
 *   an empty array when the page could not be fetched or parsed
 */
async function fetchAllCharactersUrl() {
  try {
    const url = `${FANDOM_BASE_URL}/Liste_des_Personnages_Canon`;
    console.log('Fetching character list...');

    const response = await fetchWithRetry(url);
    const data = await response.text();
    const $ = cheerio.load(data);
    const characters = [];

    $('table.wikitable tbody tr').each((index, element) => {
      if (index === 0) return; // Skip header row

      const row = $(element);
      const picture = row.find('td:nth-child(1) a img');
      const link = row.find('td:nth-child(2) a');

      // Prefer `data-src` over `src` for the picture
      const pictureUrl = picture.attr('data-src') || picture.attr('src');
      const charUrl = link.attr('href');
      const charName = link.text().trim();

      // Remove parentheses and their content from chapter info
      // (e.g. "1 (flashback)" becomes "1"), then keep digits only
      const charChapter = row
        .find('td:nth-child(3)')
        .text()
        .trim()
        .replace(/\([^)]*\)/g, '')
        .replace(/\D/g, '');

      // Rows without a chapter number or without a link are skipped
      if (!charChapter || !charUrl) return;

      characters.push({
        name: charName,
        url: toWikiPath(charUrl),
        pictureUrl,
        chapter: charChapter,
      });
    });

    console.log(`Found ${characters.length} characters.`);
    return characters;
  } catch (error) {
    console.error('Error fetching character list:', error.message);
    return [];
  }
}

/**
 * Fetch character data from fandom using provided URL
 * @param {string} characterUrl wiki path of the character page
 * @param {string} characterName name from the character list (fallback display name)
 * @param {string|undefined} characterpictureUrl picture URL from the character list
 * @param {string} characterChapter digits-only chapter of first appearance
 * @returns {Promise<object|null>} the character record, or null on any failure
 */
async function fetchCharacter(characterUrl, characterName, characterpictureUrl, characterChapter) {
  try {
    console.log(`Fetching: ${characterName}...`);

    const response = await fetchWithRetry(`${FANDOM_BASE_URL}/${characterUrl}`, {
      redirect: 'follow',
    });

    // Use final URL after redirects (canonical character page)
    const finalCharacterUrl = resolveFinalWikiPath(response, characterUrl);

    const data = await response.text();
    const $ = cheerio.load(data);

    // Extract character name, falling back to the list name
    // NOTE(review): the devil fruit page reads `span.mw-page-title-main`;
    // confirm that `h1.mw-page-title-main` matches on character pages.
    const name = $('h1.mw-page-title-main').text().trim() || characterName.replace(/_/g, ' ');

    // Generate character ID from URL + name combination
    const finalCharacterId = normalizeId(`${finalCharacterUrl}_${name}`);

    // Membership in a page category, matched on the category link title
    const hasCategory = (title) =>
      $(`.page-header__categories a[title="Catégorie:${title}"]`).length > 0;

    // Extract gender from the specific categories link
    let gender = null;
    if (hasCategory('Personnages Masculins')) {
      gender = 'Male';
    } else if (hasCategory('Personnages Féminins')) {
      gender = 'Female';
    }

    const age = extractAge($);
    const affiliations = extractAffiliations($);
    const epithets = extractEpithets($);

    // Extract devil fruit
    const devilFruitData = await extractDevilFruit($);
    const devilFruitId = devilFruitData?.devilFruitId || null;
    const devilFruitUrl = devilFruitData?.devilFruitUrl || null;

    // Extract haki
    const hakiObservation = hasCategory("Utilisateurs du Haki de l'observation");
    const hakiArmament = hasCategory("Utilisateurs du Haki de l'armement");
    const hakiConqueror = hasCategory('Utilisateurs du Haki des rois');

    const bounty = extractBounty($);
    const height = extractHeight($);

    // Use chapter from character list, cast to int
    const firstAppearance = Number.parseInt(characterChapter, 10);

    const origin = extractOrigin($);
    const status = extractStatus($);

    // Drop the placeholder picture
    let pictureUrl = characterpictureUrl;
    if (pictureUrl && pictureUrl.includes('Image_Non_Disponible')) {
      pictureUrl = null;
    }

    return {
      id: finalCharacterId,
      name,
      gender,
      age,
      height,
      origin,
      devilFruitId,
      devilFruitUrl,
      affiliations,
      bounty,
      hakiObservation,
      hakiArmament,
      hakiConqueror,
      epithets,
      firstAppearance,
      status,
      pictureUrl,
      url: finalCharacterUrl,
    };
  } catch (error) {
    console.error(`Error fetching ${characterName}:`, error.message);
    return null;
  }
}

/**
 * Value node of an infobox field, cloned with citation `<sup>` blocks removed.
 * @param {Function} $ cheerio root
 * @param {string} source the field's `data-source` attribute value
 * @returns {object|null} the cleaned clone, or null when the field is absent
 */
function getInfoboxNode($, source) {
  const node = $(`[data-source="${source}"] .pi-data-value`);
  if (node.length === 0) return null;

  const cleaned = node.clone();
  cleaned.find('sup').remove();
  return cleaned;
}

/**
 * Inner HTML of an infobox field without citations.
 * @param {Function} $ cheerio root
 * @param {string} source the field's `data-source` attribute value
 * @returns {string|null} null when the field is absent or empty
 */
function getInfoboxHtml($, source) {
  return getInfoboxNode($, source)?.html() || null;
}

/**
 * Split an HTML fragment on `<br>` tags (any spelling).
 * @param {string} html
 * @returns {string[]}
 */
function splitOnLineBreaks(html) {
  return html.split(/<br\s*\/?>/i);
}

/**
 * Remove every HTML tag and trim the result.
 * @param {string} html
 * @returns {string}
 */
function stripTags(html) {
  return html.replace(/<[^>]*>/g, '').trim();
}

/**
 * Remove parenthesized segments, parentheses included.
 * @param {string} text
 * @returns {string}
 */
function removeParenthesized(text) {
  return text.replace(/\([^)]*\)/g, '');
}

/**
 * Extract age from infobox
 * @param {Function} $ cheerio root
 * @returns {number|null} the age listed last in the field, or null when absent or zero
 */
function extractAge($) {
  const html = getInfoboxHtml($, 'âge');
  if (!html) return null;

  // Get the last element and extract only digits
  const lastPart = splitOnLineBreaks(html).pop();
  const digitsOnly = removeParenthesized(stripTags(lastPart)).replace(/\D/g, '');
  return Number.parseInt(digitsOnly, 10) || null;
}

/**
 * Extract affiliations from infobox
 * @param {Function} $ cheerio root
 * @returns {string[]} link texts when the field contains links, otherwise the
 *   plain text split on newlines, ";" and ","
 */
function extractAffiliations($) {
  const node = getInfoboxNode($, 'affiliation');
  if (!node) return [];

  const html = node.html();
  if (!html) return [];

  // Extract all link values
  const linkValues = node
    .find('a')
    .map((i, el) => $(el).text().trim())
    .get();
  if (linkValues.length > 0) {
    return linkValues;
  }

  // Fallback to parsing text
  return stripTags(html)
    .split(/\s*\n\s*|\s*;\s*|\s*,\s*/)
    .filter(Boolean);
}

/**
 * Extract epithets from infobox
 * Epithets are always between double quotes
 * @param {Function} $ cheerio root
 * @returns {string[]}
 */
function extractEpithets($) {
  const node = getInfoboxNode($, 'épithète');
  if (!node) return [];

  const text = node.text();
  if (!text) return [];

  // Extract all text between double quotes: straight ("), guillemets (« »)
  // and curly (\u201C \u201D)
  const matches = text.match(/["«\u201C]([^"»\u201D]+)["»\u201D]/g);
  if (!matches) return [];

  // Remove the quotes and trim
  return matches
    .map((match) => match.replace(/^["«\u201C]|["»\u201D]$/g, '').trim())
    .filter(Boolean);
}

/**
 * Extract devil fruit from infobox
 * Follows redirects so that every alias of a fruit maps to the same page.
 * @param {Function} $ cheerio root
 * @returns {Promise<{devilFruitId: string, devilFruitUrl: string}|null>}
 *   normalized ID and wiki path, or null when the character has no fruit link
 */
async function extractDevilFruit($) {
  const link = $('[data-source="dfnom"] .pi-data-value a').first();
  if (link.length === 0) return null;

  const href = link.attr('href');
  if (!href || !href.startsWith(WIKI_PATH_PREFIX)) return null;

  const cleanUrl = toWikiPath(href);

  const cached = devilFruitResolutionCache.get(cleanUrl);
  if (cached) return { ...cached };

  try {
    // Fetch the page to follow redirects
    const response = await fetchWithRetry(`${FANDOM_BASE_URL}/${cleanUrl}`, {
      redirect: 'follow', // Explicitly follow redirects
    });

    // Use the final URL after redirects
    const finalPath = resolveFinalWikiPath(response, cleanUrl);
    const resolved = { devilFruitId: normalizeId(finalPath), devilFruitUrl: finalPath };

    // Only successful resolutions are cached so a transient failure can be retried
    devilFruitResolutionCache.set(cleanUrl, resolved);
    return { ...resolved };
  } catch (error) {
    console.error(`Error fetching devil fruit page: ${error.message}`);
  }

  // Fallback to the original href
  return { devilFruitId: normalizeId(cleanUrl), devilFruitUrl: cleanUrl };
}

/**
 * Extract bounty from infobox
 * @param {Function} $ cheerio root
 * @returns {string|number} the digits of the bounty as a string, or the number
 *   0 when there is no bounty (mixed types kept so the output format is unchanged)
 */
function extractBounty($) {
  const html = getInfoboxHtml($, 'prime');
  if (!html) return 0;

  // Use the first value before any <br> tag; when it holds no digits, try the
  // second value
  const [firstValue, secondValue] = splitOnLineBreaks(html);
  let cleanText = stripTags(firstValue);
  if (!/\d/.test(cleanText) && secondValue) {
    cleanText = stripTags(secondValue);
  }

  // Remove all non-digits
  return cleanText.replace(/\D/g, '') || 0;
}

/**
 * Extract height from infobox
 * @param {Function} $ cheerio root
 * @returns {string|null} digits for centimeter values, "<m>.<cm>" for meter
 *   values such as "1m74", digits only otherwise; null when absent
 */
function extractHeight($) {
  const html = getInfoboxHtml($, 'taille');
  if (!html) return null;

  // Use the content of the first <p> tag when there is one, otherwise the last
  // value (after any <br> tag)
  const pMatch = html.match(/<p[^>]*>(.*?)<\/p>/i);
  const content = pMatch ? pMatch[1] : splitOnLineBreaks(html).pop();

  // Normalize units for meters or centimeters
  const normalized = removeParenthesized(stripTags(content)).toLowerCase().replace(/\s/g, '');

  if (normalized.includes('cm')) {
    return normalized.replace(/\D/g, '') || null;
  }

  if (normalized.includes('m')) {
    const parts = normalized.split('m').filter(Boolean);
    return parts.length > 0 ? parts.join('.') : null;
  }

  return normalized.replace(/\D/g, '') || null;
}

/**
 * Extract origin from infobox
 * @param {Function} $ cheerio root
 * @returns {string|null} the first listed origin without parenthesized notes
 */
function extractOrigin($) {
  const html = getInfoboxHtml($, 'origine');
  if (!html) return null;

  // Extract the first value before any <br> tag
  const [firstValue] = splitOnLineBreaks(html);
  return removeParenthesized(stripTags(firstValue)).trim() || null;
}

/**
 * Extract status from infobox
 * @param {Function} $ cheerio root
 * @returns {'Alive'|'Dead'|null}
 */
function extractStatus($) {
  const div = $('[data-source="statut"] .pi-data-value');
  if (div.length === 0) return null;

  const statusText = div.text().trim().toLowerCase();
  if (statusText.includes('vivant')) return 'Alive';
  if (statusText.includes('décédé')) return 'Dead';
  return null;
}

/**
 * Save characters to JSON
 * @param {object[]} characters
 */
async function saveToJSON(characters) {
  writeJsonFile('characters.json', characters);
}

/**
 * Save characters to CSV
 * @param {Array<object|null>} characters null entries are skipped
 */
async function saveToCSV(characters) {
  const header = [
    { id: 'id', title: 'ID' },
    { id: 'name', title: 'Name' },
    { id: 'gender', title: 'Gender' },
    { id: 'age', title: 'Age' },
    { id: 'height', title: 'Height' },
    { id: 'origin', title: 'Origin' },
    { id: 'status', title: 'Status' },
    { id: 'epithets', title: 'Epithets' },
    { id: 'devilFruitId', title: 'Devil Fruit ID' },
    { id: 'affiliations', title: 'Affiliations' },
    { id: 'bounty', title: 'Bounty' },
    { id: 'hakiObservation', title: 'Haki Observation' },
    { id: 'hakiArmament', title: 'Haki Armament' },
    { id: 'hakiConqueror', title: 'Haki Conqueror' },
    { id: 'firstAppearance', title: 'First Appearance' },
    { id: 'arcId', title: 'Arc ID' },
    { id: 'pictureUrl', title: 'Image URL' },
    { id: 'url', title: 'Fandom URL' },
  ];

  // Lists become one comma-separated cell
  const joinList = (value) => (Array.isArray(value) ? value.join(', ') : value || '');

  const records = characters
    .filter((c) => c !== null)
    .map((c) => ({
      id: c.id ?? '',
      name: c.name ?? '',
      gender: c.gender ?? '',
      age: c.age ?? '',
      height: c.height ?? '',
      origin: c.origin ?? '',
      status: c.status ?? '',
      epithets: joinList(c.epithets),
      devilFruitId: c.devilFruitId ?? '',
      affiliations: joinList(c.affiliations),
      bounty: c.bounty ?? 0,
      hakiObservation: c.hakiObservation ? 1 : 0,
      hakiArmament: c.hakiArmament ? 1 : 0,
      hakiConqueror: c.hakiConqueror ? 1 : 0,
      firstAppearance: c.firstAppearance || '',
      arcId: c.arcId ?? '',
      pictureUrl: c.pictureUrl ?? '',
      url: c.url ?? '',
    }));

  await writeCsvFile('characters.csv', header, records);
}

/**
 * Fetch devil fruit data from fandom using provided URL
 * @param {string} devilFruitUrl wiki path of the fruit page
 * @param {string} devilFruitId normalized ID to store on the record
 * @returns {Promise<{id: string, name: string, type: string|null, url: string}|null>}
 *   the fruit record, or null on any failure
 */
async function fetchDevilFruit(devilFruitUrl, devilFruitId) {
  try {
    console.log(`Fetching devil fruit: ${devilFruitId}...`);

    const response = await fetchWithRetry(`${FANDOM_BASE_URL}/${devilFruitUrl}`);
    const data = await response.text();
    const $ = cheerio.load(data);

    const name = $('span.mw-page-title-main').text().trim();

    // Extract type from label in infobox; the first keyword found wins
    let type = null;
    const typeDiv = $('[data-source="type"] .pi-data-value');
    if (typeDiv.length > 0) {
      const typeText = typeDiv.text().trim().toLowerCase();
      if (typeText.includes('zoan')) {
        type = 'Zoan';
      } else if (typeText.includes('paramecia')) {
        type = 'Paramecia';
      } else if (typeText.includes('logia')) {
        type = 'Logia';
      }
    }

    return { id: devilFruitId, name, type, url: devilFruitUrl };
  } catch (error) {
    console.error(`Error fetching devil fruit ${devilFruitUrl}:`, error.message);
    return null;
  }
}

/**
 * Save devil fruits to JSON
 * @param {object[]} devilFruits
 */
async function saveDevilFruitsToJSON(devilFruits) {
  writeJsonFile('devil-fruits.json', devilFruits);
}

/**
 * Save devil fruits to CSV
 * @param {Array<object|null>} devilFruits null entries are skipped
 */
async function saveDevilFruitsToCSV(devilFruits) {
  const header = [
    { id: 'id', title: 'ID' },
    { id: 'name', title: 'Name' },
    { id: 'type', title: 'Type' },
    { id: 'url', title: 'URL' },
  ];

  const records = devilFruits
    .filter((df) => df !== null)
    .map((df) => ({
      id: df.id ?? '',
      name: df.name ?? '',
      type: df.type ?? '',
      url: df.url ?? '',
    }));

  await writeCsvFile('devil-fruits.csv', header, records);
}

/**
 * Main execution: scrape arcs, then characters, then the devil fruits the
 * characters reference, saving each data set in the requested format.
 * The format is read from the first CLI argument: json, csv, or all (default).
 */
async function main() {
  const format = process.argv[2] || 'all'; // json, csv, or all
  if (!VALID_FORMATS.includes(format)) {
    // Fail before scraping: an unknown format would otherwise save nothing
    console.error(`Unknown format "${format}". Expected one of: ${VALID_FORMATS.join(', ')}.`);
    process.exitCode = 1;
    return;
  }
  const wantsJson = format === 'json' || format === 'all';
  const wantsCsv = format === 'csv' || format === 'all';

  console.log(`\nOne Piece Scraper - Mode: ${format}\n`);

  // Step 1: Scraping Arcs
  console.log('=== Step 1: Scraping Arcs ===\n');
  const arcsList = await fetchAllArcs();

  if (arcsList.length > 0) {
    // Display arcs in table format
    for (const arc of arcsList) {
      console.table({
        ID: arc.id,
        Name: arc.name,
        StartChapter: arc.startChapter,
        EndChapter: arc.endChapter || 'Ongoing',
        URL: arc.url,
      });
    }
    console.log(`\n✓ Found ${arcsList.length} arcs\n`);

    if (wantsJson) await saveArcsToJSON(arcsList);
    if (wantsCsv) await saveArcsToCSV(arcsList);
  } else {
    console.warn('No arcs found, continuing...\n');
  }

  // Step 2: Scraping Characters
  console.log('=== Step 2: Scraping Characters ===\n');
  const characterList = await fetchAllCharactersUrl();

  if (characterList.length === 0) {
    console.error('No characters found. Exiting.');
    return;
  }

  const characters = [];
  const devilFruitUrls = new Set();
  let pendingCharacters = [...characterList];

  // Characters that fail are retried in later passes, up to a fixed number of
  // passes so that a permanently broken page cannot stall the run.
  for (let pass = 1; pendingCharacters.length > 0 && pass <= MAX_CHARACTER_PASSES; pass++) {
    const failedThisPass = [];
    console.log(`\nFetching ${pendingCharacters.length} characters...`);

    for (const char of pendingCharacters) {
      const data = await fetchCharacter(char.url, char.name, char.pictureUrl, char.chapter);

      if (!data) {
        // Add to retry list and wait before next character
        failedThisPass.push(char);
        await sleep(CHARACTER_RETRY_PAUSE_MS);
        continue;
      }

      console.table({
        ID: data.id,
        Name: data.name,
        Gender: data.gender,
        Age: data.age,
        Status: data.status,
        Epithets: data.epithets.join(', '),
        Affiliations: data.affiliations.join(', '),
        DevilFruitId: data.devilFruitId,
        DevilFruitUrl: data.devilFruitUrl,
        HakiObservation: data.hakiObservation ? 'Yes' : 'No',
        HakiArmament: data.hakiArmament ? 'Yes' : 'No',
        HakiConqueror: data.hakiConqueror ? 'Yes' : 'No',
        Height: data.height,
        Bounty: data.bounty,
        Origin: data.origin,
        FirstAppearance: data.firstAppearance,
        pictureUrl: data.pictureUrl,
        FandomURL: data.url,
      });

      // Collect devil fruit URLs
      if (data.devilFruitUrl) {
        devilFruitUrls.add(data.devilFruitUrl);
      }

      // Attach the arc whose chapter range contains the first appearance;
      // an arc without end chapter is open-ended
      if (data.firstAppearance) {
        const chapter = data.firstAppearance;
        const arc = arcsList.find(
          (a) => a.startChapter <= chapter && (a.endChapter === null || a.endChapter >= chapter),
        );
        if (arc) {
          data.arcId = arc.id;
        }
      }

      characters.push(data);
    }

    pendingCharacters = failedThisPass;
    if (pendingCharacters.length > 0 && pass < MAX_CHARACTER_PASSES) {
      console.log(`⚠️ ${pendingCharacters.length} characters failed. Retrying...`);
    }
  }

  if (pendingCharacters.length > 0) {
    const names = pendingCharacters.map((char) => char.name).join(', ');
    console.warn(
      `⚠️ Giving up on ${pendingCharacters.length} characters after ${MAX_CHARACTER_PASSES} passes: ${names}`,
    );
  }

  console.log(`\n✓ Scraped ${characters.length} characters\n`);
  console.log(`✓ Found ${devilFruitUrls.size} unique devil fruits\n`);

  // Step 3: Scraping Devil Fruits
  console.log('=== Step 3: Scraping Devil Fruits ===\n');

  if (devilFruitUrls.size === 0) {
    console.warn('No devil fruits found from characters, skipping...\n');
  } else {
    const devilFruits = [];

    for (const url of devilFruitUrls) {
      // Same normalization as the characters' devilFruitId, so the IDs line up
      const data = await fetchDevilFruit(url, normalizeId(url));

      if (data) {
        console.table({
          ID: data.id,
          Name: data.name,
          Type: data.type,
          URL: data.url,
        });
        devilFruits.push(data);
      }
    }

    console.log(`\n✓ Scraped ${devilFruits.length} devil fruits\n`);

    if (wantsJson) await saveDevilFruitsToJSON(devilFruits);
    if (wantsCsv) await saveDevilFruitsToCSV(devilFruits);
  }

  // Save characters last, once every related data set has been processed
  if (wantsJson) await saveToJSON(characters);
  if (wantsCsv) await saveToCSV(characters);

  console.log('\n✓ Done!\n');
}

main().catch((error) => {
  console.error(error);
  // Signal the failure to the calling shell / CI
  process.exitCode = 1;
});