// Source: OnePieceDle/scripts/scrape-onepiece.ts (1068 lines, 29 KiB, TypeScript)
// NOTE(review): this comment replaces GitHub UI chrome ("Raw Blame History",
// ambiguous-Unicode warning banner) that was captured along with the file and
// is not valid TypeScript.
import * as cheerio from 'cheerio';
import fs from 'fs';
import { createObjectCsvWriter } from 'csv-writer';
// Type definitions
interface Arc {
id: string;
name: string;
frName: string | null;
startChapter: number;
endChapter: number | null;
url: string;
}
interface Character {
id: string;
name: string;
frName: string | null;
gender: string | null;
age: number | null;
height: number | null;
origin: string | null;
frOrigin: string | null;
devilFruitId: string | null;
devilFruitUrl: string | null;
affiliations: string[];
frAffiliations: string[] | null;
bounty: number | null;
hakiObservation: boolean;
hakiArmament: boolean;
hakiConqueror: boolean;
epithets: string[];
frEpithets: string[] | null;
firstAppearance: number;
status: string | null;
pictureUrl: string | null;
url: string;
frUrl: string | null;
arcId: string;
}
interface CharacterListItem {
name: string;
url: string;
chapter: number;
}
interface DevilFruitData {
devilFruitId: string;
devilFruitUrl: string;
}
interface DevilFruit {
id: string;
name: string;
type: string | null;
url: string;
}
const FANDOM_API_BASE =
'https://onepiece.fandom.com/api.php?action=parse&redirects=true&format=json&page=';
const FR_FANDOM_API_BASE =
'https://onepiece.fandom.com/fr/api.php?action=parse&redirects=true&format=json&page=';
const OUTPUT_DIR = './scraped-data';
const MAX_RETRIES = 0; // Set to 0 to disable retries, can be increased if needed
const INITIAL_RETRY_DELAY = 1000;
const FETCH_CONCURRENCY = 50;
// Create output directory
if (!fs.existsSync(OUTPUT_DIR)) {
fs.mkdirSync(OUTPUT_DIR, { recursive: true });
}
/**
* Retry a fetch request with exponential backoff
*/
async function fetchWithRetry(
url: string,
options: RequestInit = {},
retries: number = 0
): Promise<Response> {
try {
const headers: Record<string, string> = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:150.0) Firefox/150.0',
'Accept-Language': 'en-US,en;q=0.9',
'Accept-Encoding': 'gzip, deflate, br',
Connection: 'keep-alive',
...((options.headers as Record<string, string>) || {})
};
const response = await fetch(url, {
headers,
...options
});
// Check if response is OK (status 200-299)
if (response.ok) {
return response;
}
// If not OK and we have retries left, retry
if (retries < MAX_RETRIES) {
const delay = INITIAL_RETRY_DELAY * Math.pow(2, retries);
console.log(`⚠️ HTTP ${response.status} for ${url}, retrying in ${delay}ms...`);
await new Promise((resolve) => setTimeout(resolve, delay));
return fetchWithRetry(url, options, retries + 1);
}
// If we've exhausted retries, throw error
throw new Error(`HTTP ${response.status}: ${response.statusText}`);
} catch (error) {
// If it's a network error and we have retries left, retry
if (retries < MAX_RETRIES) {
const delay = INITIAL_RETRY_DELAY * Math.pow(2, retries);
console.log(`⚠️ Network error: ${(error as Error).message}, retrying in ${delay}ms...`);
await new Promise((resolve) => setTimeout(resolve, delay));
return fetchWithRetry(url, options, retries + 1);
}
// If we've exhausted retries, throw error
throw error;
}
}
/**
* Get the French link from the API response links array
*/
function getFrLink(links: { lang: string; ['*']: string; url: string }[]): { url: string } | null {
// Get french url by getting parse.langlinks where lang is "fr" and extract the name from there
const frLink = links.find(
(link: { lang: string; ['*']: string; url: string }) => link.lang === 'fr'
);
return frLink ? { url: frLink['url'] } : null;
}
/**
* Normalize string by decoding URI components, punctuation, and replacing spaces with underscores
*/
function normalizeId(str: string): string {
return decodeURIComponent(str)
.normalize('NFD')
.replace(/[,:.()]/g, '')
.replace(/\s+/g, '_')
.toLowerCase();
}
/**
* Fetch all arcs from One Piece fandom using API
*/
async function fetchAllArcs(): Promise<Arc[]> {
try {
const apiUrl = `${FANDOM_API_BASE}Chapters_and_Volumes`;
console.log('Fetching arcs list via API...');
const response = await fetchWithRetry(apiUrl);
const jsonData = await response.json();
// Extract HTML from API response
const htmlContent = jsonData.parse?.text?.['*'];
if (!htmlContent) {
throw new Error('Unable to extract HTML content from API response');
}
const $ = cheerio.load(htmlContent);
const arcs: Arc[] = [];
const seenArcUrls = new Set<string>();
// Arc rows are in table cells where first link points to a *_Arc page and text includes chapter range.
const arcCells = $('table.wikitable td').toArray();
for (const element of arcCells) {
const cell = $(element);
const firstLink = cell.find('a').first();
const href = firstLink.attr('href') || '';
let arcName = firstLink.text().trim();
if (!href.startsWith('/wiki/') || !/_Arc$/i.test(href)) {
continue;
}
if (!arcName || !/\bArc\b/i.test(arcName)) {
continue;
}
arcName = arcName.replace(/\bArc\b/i, '').trim();
const cleanUrl = href.replace('/wiki/', '');
if (seenArcUrls.has(cleanUrl)) {
continue;
}
const cellText = cell.text().replace(/\s+/g, ' ').trim();
const chapterMatch = cellText.match(/Chapters\s+(\d+)\s+to\s+(\d+|Current)/i);
if (!chapterMatch) {
continue;
}
const startChapter = parseInt(chapterMatch[1], 10);
const endChapter = /current/i.test(chapterMatch[2]) ? null : parseInt(chapterMatch[2], 10);
let arcId = normalizeId(cleanUrl);
arcId = arcId.replace(/_arc$/i, '');
// Query the href page via API to get the correct HTML content (in case of redirect) and extract the French name from there
const arcResponse = await fetchWithRetry(`${FANDOM_API_BASE}${cleanUrl}`);
const arcJsonData = await arcResponse.json();
let frArcName: string | null =
arcJsonData.parse?.langlinks.find(
(link: { lang: string; ['*']: string }) => link.lang === 'fr'
)?.['*'] || null;
// Remove "Arc" suffix from French name if present to keep it consistent with English names (e.g. "Arc de Luffy" becomes "Luffy")
if (frArcName && /\bArc\b/i.test(frArcName)) {
frArcName = frArcName.replace(/\bArc\b/i, '').trim();
}
arcs.push({
id: arcId,
name: arcName,
frName: frArcName,
startChapter,
endChapter,
url: cleanUrl
});
seenArcUrls.add(cleanUrl);
}
console.log(`Found ${arcs.length} arcs.`);
return arcs;
} catch (error) {
console.error('Error fetching arcs list:', (error as Error).message);
return [];
}
}
/**
* Save arcs to JSON
*/
async function saveArcsToJSON(arcs: Arc[]): Promise<void> {
const filepath = `${OUTPUT_DIR}/arcs.json`;
fs.writeFileSync(filepath, JSON.stringify(arcs, null, 2));
console.log(`✓ Saved to ${filepath}`);
}
/**
* Save arcs to CSV
*/
async function saveArcsToCSV(arcs: Arc[]): Promise<void> {
const filepath = `${OUTPUT_DIR}/arcs.csv`;
const csvWriter = createObjectCsvWriter({
path: filepath,
header: [
{ id: 'id', title: 'ID' },
{ id: 'name', title: 'Name' },
{ id: 'frName', title: 'French Name' },
{ id: 'startChapter', title: 'Start Chapter' },
{ id: 'endChapter', title: 'End Chapter' },
{ id: 'url', title: 'URL' }
]
});
const records = arcs
.filter((arc) => arc !== null)
.map((arc) => ({
id: arc.id || '',
name: arc.name || '',
frName: arc.frName || '',
startChapter: arc.startChapter || '',
endChapter: arc.endChapter || '',
url: arc.url || ''
}));
await csvWriter.writeRecords(records);
console.log(`✓ Saved to ${filepath}`);
}
/**
* Fetch all cannon characters from One Piece fandom using API
*/
async function fetchAllCharactersUrl(): Promise<CharacterListItem[]> {
try {
const apiUrl = `${FANDOM_API_BASE}List_of_Canon_Characters`;
console.log('Fetching character list via API...');
const response = await fetchWithRetry(apiUrl);
const jsonData = await response.json();
// Extract HTML from API response
const htmlContent = jsonData.parse?.text?.['*'];
if (!htmlContent) {
throw new Error('Unable to extract HTML content from API response');
}
const $ = cheerio.load(htmlContent);
const characters: CharacterListItem[] = [];
$('table.fandom-table tbody tr').each((index, element) => {
if (index === 0) return; // Skip header row
let charUrl = $(element).find('td:nth-child(2) a').attr('href');
const charName = $(element).find('td:nth-child(2) a').text().trim();
let charChapter = $(element).find('td:nth-child(3)').text().trim();
// Remove parentheses and their content from chapter info (e.g. "1 (flashback)" becomes "1")
charChapter = charChapter.replace(/\([^)]*\)/g, '');
charChapter = charChapter.replace(/\D/g, '');
// If charChapter is empty, skip the character as it means they don't have a proper page and are just mentioned in the list
if (!charChapter) {
return;
}
if (parseInt(charChapter, 10) === 0) {
return;
}
if (charUrl) {
charUrl = charUrl.replace('/wiki/', '');
characters.push({
name: charName,
url: charUrl,
chapter: parseInt(charChapter, 10)
});
}
});
console.log(`Found ${characters.length} characters.`);
return characters;
} catch (error) {
console.error('Error fetching character list:', (error as Error).message);
return [];
}
}
/**
* Fetch character data from fandom using provided URL
*/
async function fetchCharacter(
characterUrl: string,
characterName: string,
characterChapter: number,
arcsList: Arc[]
): Promise<Character | null> {
try {
console.log(`Fetching: ${characterName}...`);
// Use API to fetch character page
const apiUrl = `${FANDOM_API_BASE}${characterUrl}`;
const response = await fetchWithRetry(apiUrl);
const jsonData = await response.json();
const categories = jsonData.parse?.categories || [];
// Extract HTML from API response
const htmlContent = jsonData.parse?.text?.['*'];
if (!htmlContent) {
throw new Error('Unable to extract HTML content from API response');
}
const $ = cheerio.load(htmlContent);
const name = characterName;
// Generate character ID from URL + name combination
const finalCharacterId = normalizeId(characterUrl + '_' + name);
// Extract gender from JSON categories
let gender: string | null = null;
for (const cat of categories) {
const catName = cat['*'] || '';
if (catName === 'Male_Characters') {
gender = 'Male';
break;
} else if (catName === 'Female_Characters') {
gender = 'Female';
break;
}
}
// Extract age
const age = extractAge($);
// Extract affiliations
const affiliations = await extractAffiliations($, 'en');
// Extract epithets
const epithets = extractEpithets($);
// Extract devil fruit
const devilFruitData = await extractDevilFruit($);
const devilFruitId = devilFruitData?.devilFruitId || null;
const devilFruitUrl = devilFruitData?.devilFruitUrl || null;
// Extract haki from JSON categories
let hakiObservation = false;
let hakiArmament = false;
let hakiConqueror = false;
for (const cat of categories) {
const catName = cat['*'] || '';
if (catName === 'Observation_Haki_Users') {
hakiObservation = true;
} else if (catName === 'Armament_Haki_Users') {
hakiArmament = true;
} else if (catName === 'Supreme_King_Haki_Users') {
hakiConqueror = true;
}
}
// Extract bounty
const bounty = extractBounty($);
// Extract height
const height = extractHeight($);
// Use chapter from character list, cast to int
const firstAppearance = characterChapter;
// Extract origin
const origin = extractOrigin($);
// Extract status
const status = extractStatus($);
let arcId = '';
const arc = arcsList.find(
(a) =>
a.startChapter <= firstAppearance &&
(a.endChapter === null || a.endChapter >= firstAppearance)
);
if (!arc) {
return null;
}
arcId = arc.id;
const frLink = getFrLink(jsonData.parse?.langlinks || []);
const frUrl = frLink ? frLink.url.replace('https://onepiece.fandom.com/fr/wiki/', '') : null;
const frjsonData = frUrl
? await fetchWithRetry(`${FR_FANDOM_API_BASE}${frUrl}`).then((res) => res.json())
: null;
let frName = frjsonData?.parse?.title || null;
const frAffiliations = frjsonData
? await extractAffiliations(cheerio.load(frjsonData.parse?.text?.['*'] || ''), 'fr')
: null;
const frEpithets = frjsonData
? extractEpithets(cheerio.load(frjsonData.parse?.text?.['*'] || ''))
: null;
const frOrigin = frjsonData
? extractOrigin(cheerio.load(frjsonData.parse?.text?.['*'] || ''))
: null;
if (name !== jsonData.parse?.title) {
frName = name;
}
return {
id: finalCharacterId,
name,
frName,
gender,
age,
height,
origin,
frOrigin,
devilFruitId,
devilFruitUrl,
affiliations,
frAffiliations,
bounty,
hakiObservation,
hakiArmament,
hakiConqueror,
epithets,
frEpithets,
firstAppearance,
arcId,
status,
pictureUrl: 'Image_Non_Disponible',
url: characterUrl,
frUrl
};
} catch (error) {
console.error(`Error fetching ${characterName}:`, (error as Error).message);
return null;
}
}
/**
* Extract age from infobox
*/
function extractAge($: cheerio.CheerioAPI): number | null {
const div = $('[data-source="age"] .pi-data-value');
if (div.length === 0) return null;
let text = div.html();
if (!text) return null;
// Remove all sup blocks (citations)
text = text.replace(/<sup[^>]*>.*?<\/sup>/gi, '');
// Get the last element and extract only digits
const parts = text.split('<br');
const lastPart = parts[parts.length - 1];
let cleanText = lastPart.replace(/<[^>]*>/g, '').trim();
// Remove content with parentheses
cleanText = cleanText.replace(/\([^)]*\)/g, '');
const digitsOnly = cleanText.replace(/\D/g, '');
return parseInt(digitsOnly) || null;
}
/**
* Extract affiliations from infobox
*/
async function extractAffiliations($: cheerio.CheerioAPI, lang: string): Promise<string[]> {
const div = $('[data-source="affiliation"] .pi-data-value');
if (div.length === 0) return [];
const cleanedDiv = div.clone();
cleanedDiv.find('sup').remove();
const text = cleanedDiv.html();
if (!text) return [];
// Resolve affiliations from linked page titles.
const links = cleanedDiv.find('a').toArray();
if (links.length > 0) {
const linkValues = await Promise.all(
links.map(async (el) => {
const href = $(el).attr('href') || '';
const resolvedTitle = await fetchWithRetry(
`${lang === 'fr' ? FR_FANDOM_API_BASE : FANDOM_API_BASE}${href.replace('/fr/wiki/', '').replace('/wiki/', '')}`
)
.then((res) => res.json())
.then((json) => json.parse?.title)
.catch(() => null);
if (resolvedTitle) {
return resolvedTitle;
}
return $(el).text().trim();
})
);
const uniqueLinks = Array.from(new Set(linkValues.filter(Boolean)));
if (uniqueLinks.length > 0) {
return uniqueLinks;
}
}
// Fallback to parsing text
const cleanText = text.replace(/<[^>]*>/g, '').trim();
const parts = cleanText.split(/\s*\n\s*|\s*;\s*|\s*,\s*/).filter(Boolean);
return parts.length > 0 ? parts : [];
}
/**
* Extract epithets from infobox
* Handles both quoted and unquoted epithets, keeping only the main/latest readable values.
*/
function extractEpithets($: cheerio.CheerioAPI): string[] {
const div = $('[data-source="epithet"] .pi-data-value');
if (div.length === 0) return [];
const cleanedDiv = div.clone();
cleanedDiv.find('sup').remove();
const html = cleanedDiv.html();
if (!html) return [];
const plainText = html.replace(/<br\s*\/?\s*>/gi, '\n').replace(/<[^>]*>/g, '');
const lines = plainText
.split('\n')
.map((line) => line.trim())
.filter(Boolean);
const epithets = lines
.map((line) => {
const normalized = line.replace(/\s+/g, ' ').trim();
// Prefer explicit quoted epithet if present.
const quotedMatch = normalized.match(/["«“](.*?)["»”]/);
if (quotedMatch?.[1]) {
return quotedMatch[1].trim();
}
// Otherwise keep only the base epithet text before extra notes/translations.
return normalized
.split(/[;(]/)[0]
.replace(/["'«»“”]/g, '')
.trim();
})
.filter(Boolean);
return Array.from(new Set(epithets));
}
/**
* Extract devil fruit from infobox
* Returns both normalized ID and URL
*/
async function extractDevilFruit($: cheerio.CheerioAPI): Promise<DevilFruitData | null> {
const link = $('[data-source="dfname"] .pi-data-value a').first();
if (link.length === 0) return null;
const href = link.attr('href');
if (!href || !href.startsWith('/wiki/')) return null;
const cleanUrl = href.replace('/wiki/', '');
// Query the devil fruit page via API to get the correct HTML content (in case of redirect) and extract the type from there
const dfResponse = await fetchWithRetry(`${FANDOM_API_BASE}${cleanUrl}`);
const dfJsonData = await dfResponse.json();
const fruitTitle = dfJsonData.parse?.title || '';
return {
devilFruitId: normalizeId(fruitTitle),
devilFruitUrl: fruitTitle
};
}
/**
* Extract bounty from infobox
*/
function extractBounty($: cheerio.CheerioAPI): number | null {
const div = $('[data-source="bounty"] .pi-data-value');
if (div.length === 0) return 0;
const cleanedDiv = div.clone();
// Drop references and old crossed-out bounty values.
cleanedDiv.find('sup, s, del, strike').remove();
const text = cleanedDiv.text().replace(/\s+/g, ' ').trim();
if (!text) return 0;
// Parse the first amount token (e.g. "3,189,000,000"), which is the active bounty.
const amountMatch = text.match(/\d{1,3}(?:[\s,.']\d{3})+|\d+/);
if (!amountMatch) return 0;
const digits = amountMatch[0].replace(/\D/g, '');
if (!digits) return 0;
const value = Number(digits);
return Number.isFinite(value) ? value : 0;
}
/**
* Extract height from infobox
*/
function extractHeight($: cheerio.CheerioAPI): number | null {
const div = $('[data-source="height"] .pi-data-value');
if (div.length === 0) return null;
let text = div.html();
if (!text) return null;
// Remove all sup blocks (citations)
text = text.replace(/<sup[^>]*>.*?<\/sup>/gi, '');
// Convert line breaks to new lines so we can reliably pick the latest value.
const textWithNewLines = text.replace(/<br\s*\/?\s*>/gi, '\n');
const lines = textWithNewLines
.replace(/<[^>]*>/g, '')
.split('\n')
.map((line) => line.trim())
.filter(Boolean);
// Keep only lines that look like a height value, then pick the latest one.
const heightLines = lines.filter((line) => /\d/.test(line) && /(cm|m)/i.test(line));
const latestLine =
heightLines.length > 0 ? heightLines[heightLines.length - 1] : lines[lines.length - 1];
if (!latestLine) return null;
// Remove descriptive suffixes like "(post-timeskip)".
const cleanText = latestLine.replace(/\([^)]*\)/g, '').trim();
const normalized = cleanText.toLowerCase().replace(/\s/g, '');
// Values are stored in meters in this dataset.
const cmMatch = normalized.match(/(\d+(?:[.,]\d+)?)cm/);
if (cmMatch) {
const cm = parseFloat(cmMatch[1].replace(',', '.'));
return Number.isFinite(cm) ? cm / 100 : null;
}
const mMatch = normalized.match(/(\d+(?:[.,]\d+)?)m/);
if (mMatch) {
const meters = parseFloat(mMatch[1].replace(',', '.'));
return Number.isFinite(meters) ? meters : null;
}
return null;
}
/**
* Extract origin from infobox
*/
function extractOrigin($: cheerio.CheerioAPI): string | null {
const div = $(
'[data-source="origin"] .pi-data-value, [data-source="origine"] .pi-data-value'
).first();
if (div.length === 0) return null;
let text = div.html();
if (!text) return null;
// Remove all sup blocks (citations)
text = text.replace(/<sup[^>]*>.*?<\/sup>/gi, '');
// Extract the first value before any <br> tag
const firstValue = text.split('<br')[0].trim();
let cleanText = firstValue.replace(/<[^>]*>/g, '').trim();
// Remove content with parentheses
cleanText = cleanText.replace(/\([^)]*\)/g, '').trim();
return cleanText || null;
}
/**
* Extract status from infobox
*/
function extractStatus($: cheerio.CheerioAPI): string | null {
const div = $('[data-source="status"] .pi-data-value');
if (div.length === 0) return null;
const statusText = div.text().trim().toLowerCase();
if (statusText.includes('Alive')) {
return 'Alive';
} else if (statusText.includes('Dead')) {
return 'Dead';
} else if (statusText.includes('Unknown')) {
return 'Unknown';
}
return 'Alive';
}
/**
* Save data to JSON
*/
async function saveToJSON(characters: Character[]): Promise<void> {
const filepath = `${OUTPUT_DIR}/characters.json`;
fs.writeFileSync(filepath, JSON.stringify(characters, null, 2));
console.log(`✓ Saved to ${filepath}`);
}
/**
* Save data to CSV
*/
async function saveToCSV(characters: Character[]): Promise<void> {
const filepath = `${OUTPUT_DIR}/characters.csv`;
const csvWriter = createObjectCsvWriter({
path: filepath,
header: [
{ id: 'id', title: 'ID' },
{ id: 'name', title: 'Name' },
{ id: 'gender', title: 'Gender' },
{ id: 'age', title: 'Age' },
{ id: 'height', title: 'Height' },
{ id: 'origin', title: 'Origin' },
{ id: 'status', title: 'Status' },
{ id: 'epithets', title: 'Epithets' },
{ id: 'devilFruitId', title: 'Devil Fruit ID' },
{ id: 'affiliations', title: 'Affiliations' },
{ id: 'bounty', title: 'Bounty' },
{ id: 'hakiObservation', title: 'Haki Observation' },
{ id: 'hakiArmament', title: 'Haki Armament' },
{ id: 'hakiConqueror', title: 'Haki Conqueror' },
{ id: 'firstAppearance', title: 'First Appearance' },
{ id: 'arcId', title: 'Arc ID' },
{ id: 'pictureUrl', title: 'Image URL' },
{ id: 'url', title: 'Fandom URL' }
]
});
const records = characters
.filter((c) => c !== null)
.map((c) => ({
id: c.id || '',
name: c.name || '',
gender: c.gender || '',
age: c.age || '',
height: c.height || '',
origin: c.origin || '',
status: c.status || '',
epithets: Array.isArray(c.epithets) ? c.epithets.join(', ') : c.epithets || '',
devilFruitId: c.devilFruitId || '',
affiliations: Array.isArray(c.affiliations)
? c.affiliations.join(', ')
: c.affiliations || '',
bounty: c.bounty ?? 0,
hakiObservation: c.hakiObservation ? 1 : 0,
hakiArmament: c.hakiArmament ? 1 : 0,
hakiConqueror: c.hakiConqueror ? 1 : 0,
firstAppearance: c.firstAppearance || '',
arcId: c.arcId || '',
pictureUrl: c.pictureUrl || '',
url: c.url || ''
}));
await csvWriter.writeRecords(records);
console.log(`✓ Saved to ${filepath}`);
}
/**
* Fetch devil fruit data from fandom using provided URL
*/
async function fetchDevilFruit(
devilFruitUrl: string,
devilFruitId: string
): Promise<DevilFruit | null> {
try {
console.log(`Fetching devil fruit: ${devilFruitUrl}...`);
// Use API to fetch devil fruit page
const apiUrl = `${FANDOM_API_BASE}${devilFruitUrl}`;
const response = await fetchWithRetry(apiUrl);
const jsonData = await response.json();
// Extract HTML from API response
const htmlContent = jsonData.parse?.text?.['*'];
if (!htmlContent) {
throw new Error('Unable to extract HTML content from API response');
}
const name = jsonData.parse?.title || devilFruitId.replace(/_/g, ' ');
let type: string | null = null;
// Determine type based on categories (if categories contain "Paramecia", "Zoan", "Logia" or "Smile")
if (jsonData.parse?.categories) {
const categories = jsonData.parse.categories.map((cat: { ['*']: string }) =>
String(cat['*'] || '').toLowerCase()
);
if (categories.some((category: string) => category.includes('paramecia'))) {
type = 'Paramecia';
} else if (categories.some((category: string) => category.includes('zoan'))) {
type = 'Zoan';
} else if (categories.some((category: string) => category.includes('logia'))) {
type = 'Logia';
} else if (categories.some((category: string) => category.includes('smile'))) {
type = 'Smile';
}
}
return {
id: devilFruitId,
name,
type,
url: devilFruitUrl
};
} catch (error) {
console.error(`Error fetching devil fruit ${devilFruitUrl}:`, (error as Error).message);
return null;
}
}
/**
* Save devil fruits to JSON
*/
async function saveDevilFruitsToJSON(devilFruits: DevilFruit[]): Promise<void> {
const filepath = `${OUTPUT_DIR}/devil-fruits.json`;
fs.writeFileSync(filepath, JSON.stringify(devilFruits, null, 2));
console.log(`✓ Saved to ${filepath}`);
}
/**
* Save devil fruits to CSV
*/
async function saveDevilFruitsToCSV(devilFruits: DevilFruit[]): Promise<void> {
const filepath = `${OUTPUT_DIR}/devil-fruits.csv`;
const csvWriter = createObjectCsvWriter({
path: filepath,
header: [
{ id: 'id', title: 'ID' },
{ id: 'name', title: 'Name' },
{ id: 'type', title: 'Type' },
{ id: 'url', title: 'URL' }
]
});
const records = devilFruits
.filter((df) => df !== null)
.map((df) => ({
id: df.id || '',
name: df.name || '',
type: df.type || '',
url: df.url || ''
}));
await csvWriter.writeRecords(records);
console.log(`✓ Saved to ${filepath}`);
}
/**
* Main execution
*/
async function main(): Promise<void> {
const format = process.argv[2] || 'all'; // json, csv, or all
console.log(`\nOne Piece Scraper - Mode: ${format}\n`);
// Step 1: Scraping Arcs
console.log('=== Step 1: Scraping Arcs ===\n');
const arcsList = await fetchAllArcs();
if (arcsList.length > 0) {
// Display arcs in table format
arcsList.forEach((arc) => {
console.table({
ID: arc.id,
Name: arc.name,
FrenchName: arc.frName || '',
StartChapter: arc.startChapter,
EndChapter: arc.endChapter || 'Ongoing',
URL: arc.url
});
});
console.log(`\n✓ Found ${arcsList.length} arcs\n`);
if (format === 'json' || format === 'all') {
await saveArcsToJSON(arcsList);
}
if (format === 'csv' || format === 'all') {
await saveArcsToCSV(arcsList);
}
} else {
console.warn('No arcs found, continuing...\n');
}
// Step 2: Scraping Characters
console.log('=== Step 1: Scraping Characters ===\n');
const characterList = await fetchAllCharactersUrl();
if (characterList.length === 0) {
console.error('No characters found. Exiting.');
return;
}
const characters: Character[] = [];
const devilFruitUrls = new Set<string>();
let failedCharacters: CharacterListItem[] = [...characterList];
while (failedCharacters.length > 0) {
const nextFailedCharacters: CharacterListItem[] = [];
console.log(`\nFetching ${failedCharacters.length} characters...`);
for (let i = 0; i < failedCharacters.length; i += FETCH_CONCURRENCY) {
const batch = failedCharacters.slice(i, i + FETCH_CONCURRENCY);
const batchResults = await Promise.all(
batch.map(async (char) => {
const data = await fetchCharacter(char.url, char.name, char.chapter, arcsList);
return { char, data };
})
);
for (const { char, data } of batchResults) {
if (data) {
console.table({
ID: data.id,
Name: data.name,
Gender: data.gender,
Age: data.age,
Status: data.status,
Epithets: data.epithets.join(', '),
Affiliations: data.affiliations.join(', '),
DevilFruitId: data.devilFruitId,
DevilFruitUrl: data.devilFruitUrl,
HakiObservation: data.hakiObservation ? 'Yes' : 'No',
HakiArmament: data.hakiArmament ? 'Yes' : 'No',
HakiConqueror: data.hakiConqueror ? 'Yes' : 'No',
Height: data.height,
Bounty: data.bounty,
Origin: data.origin,
FirstAppearance: data.firstAppearance,
pictureUrl: data.pictureUrl,
FandomURL: data.url
});
if (data.devilFruitUrl) {
devilFruitUrls.add(data.devilFruitUrl);
}
characters.push(data);
} else {
nextFailedCharacters.push(char);
}
}
}
failedCharacters = nextFailedCharacters;
if (failedCharacters.length > 0) {
console.log(`⚠️ ${failedCharacters.length} characters failed. Retrying...`);
}
}
console.log(`\n✓ Scraped ${characters.length} characters\n`);
console.log(`✓ Found ${devilFruitUrls.size} unique devil fruits\n`);
// Step 3: Scraping Devil Fruits
console.log('=== Step 2: Scraping Devil Fruits ===\n');
if (devilFruitUrls.size === 0) {
console.warn('No devil fruits found from characters, skipping...\n');
} else {
const devilFruits: DevilFruit[] = [];
const devilFruitUrlArray = Array.from(devilFruitUrls);
for (let i = 0; i < devilFruitUrlArray.length; i += FETCH_CONCURRENCY) {
const batch = devilFruitUrlArray.slice(i, i + FETCH_CONCURRENCY);
const batchResults = await Promise.all(
batch.map(async (url) => {
const data = await fetchDevilFruit(url, normalizeId(url));
return { url, data };
})
);
for (const { data } of batchResults) {
if (data) {
console.table({
ID: data.id,
Name: data.name,
Type: data.type,
URL: data.url
});
devilFruits.push(data);
}
}
}
console.log(`\n✓ Scraped ${devilFruits.length} devil fruits\n`);
if (format === 'json' || format === 'all') {
await saveDevilFruitsToJSON(devilFruits);
}
if (format === 'csv' || format === 'all') {
await saveDevilFruitsToCSV(devilFruits);
}
// Update characters with normalized devil fruit IDs
const devilFruitMap = new Map<string, string>(devilFruits.map((df) => [df.id, df.id]));
characters.forEach((char) => {
if (char.devilFruitUrl) {
const normalizedId = normalizeId(char.devilFruitUrl);
char.devilFruitId = devilFruitMap.get(normalizedId) || normalizedId;
}
});
}
// Save characters after devil fruit IDs are updated
if (format === 'json' || format === 'all') {
await saveToJSON(characters);
}
if (format === 'csv' || format === 'all') {
await saveToCSV(characters);
}
console.log('\n✓ Done!\n');
}
main().catch(console.error);