import * as cheerio from 'cheerio';
import fs from 'fs';
import { createObjectCsvWriter } from 'csv-writer';

// Type definitions
interface Arc {
  id: string;
  name: string;
  frName: string | null;
  startChapter: number;
  endChapter: number | null;
  url: string;
}

interface Character {
  id: string;
  name: string;
  frName: string | null;
  gender: string | null;
  age: number | null;
  height: number | null;
  origin: string | null;
  frOrigin: string | null;
  devilFruitId: string | null;
  devilFruitUrl: string | null;
  affiliations: string[];
  frAffiliations: string[] | null;
  bounty: number | null;
  hakiObservation: boolean;
  hakiArmament: boolean;
  hakiConqueror: boolean;
  epithets: string[];
  frEpithets: string[] | null;
  firstAppearance: number;
  status: string | null;
  pictureUrl: string | null;
  url: string;
  frUrl: string | null;
  arcId: string;
}

interface CharacterListItem {
  name: string;
  url: string;
  chapter: number;
}

interface DevilFruitData {
  devilFruitId: string;
  devilFruitUrl: string;
}

interface DevilFruit {
  id: string;
  name: string;
  type: string | null;
  url: string;
}

const FANDOM_API_BASE =
  'https://onepiece.fandom.com/api.php?action=parse&redirects=true&format=json&page=';
const FR_FANDOM_API_BASE =
  'https://onepiece.fandom.com/fr/api.php?action=parse&redirects=true&format=json&page=';
const OUTPUT_DIR = './scraped-data';
const MAX_RETRIES = 0; // Set to 0 to disable retries, can be increased if needed
const INITIAL_RETRY_DELAY = 1000;
const FETCH_CONCURRENCY = 50;
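// For reference, a request built from these bases looks like the following
// (the page name is illustrative):
//   https://onepiece.fandom.com/api.php?action=parse&redirects=true&format=json&page=Monkey_D._Luffy
// Only a few fields of the MediaWiki parse response are used below:
//   parse.title       - canonical page title after redirect resolution
//   parse.text['*']   - rendered page HTML (loaded into cheerio)
//   parse.langlinks   - interlanguage links (used to locate the French page)
//   parse.categories  - page categories (used for gender, Haki and fruit type)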
// Create output directory
if (!fs.existsSync(OUTPUT_DIR)) {
  fs.mkdirSync(OUTPUT_DIR, { recursive: true });
}

/**
 * Retry a fetch request with exponential backoff
 */
async function fetchWithRetry(
  url: string,
  options: RequestInit = {},
  retries: number = 0
): Promise<Response> {
  try {
    const headers: Record<string, string> = {
      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:150.0) Firefox/150.0',
      'Accept-Language': 'en-US,en;q=0.9',
      'Accept-Encoding': 'gzip, deflate, br',
      Connection: 'keep-alive',
      ...((options.headers as Record<string, string>) || {})
    };

    const response = await fetch(url, {
      headers,
      ...options
    });

    // Check if response is OK (status 200-299)
    if (response.ok) {
      return response;
    }

    // If not OK and we have retries left, retry
    if (retries < MAX_RETRIES) {
      const delay = INITIAL_RETRY_DELAY * Math.pow(2, retries);
      console.log(`⚠️ HTTP ${response.status} for ${url}, retrying in ${delay}ms...`);
      await new Promise((resolve) => setTimeout(resolve, delay));
      return fetchWithRetry(url, options, retries + 1);
    }

    // If we've exhausted retries, throw error
    throw new Error(`HTTP ${response.status}: ${response.statusText}`);
  } catch (error) {
    // If it's a network error and we have retries left, retry
    if (retries < MAX_RETRIES) {
      const delay = INITIAL_RETRY_DELAY * Math.pow(2, retries);
      console.log(`⚠️ Network error: ${(error as Error).message}, retrying in ${delay}ms...`);
      await new Promise((resolve) => setTimeout(resolve, delay));
      return fetchWithRetry(url, options, retries + 1);
    }

    // If we've exhausted retries, throw error
    throw error;
  }
}
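// With MAX_RETRIES raised to 3, for example, a failing request would be retried
// after roughly 1s, 2s and 4s (INITIAL_RETRY_DELAY * 2^attempt) before the last
// error is finally thrown to the caller.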

/**
 * Get the French link from the API response links array
 */
function getFrLink(links: { lang: string; ['*']: string; url: string }[]): { url: string } | null {
  // Find the parse.langlinks entry whose lang is "fr" and return its URL
  const frLink = links.find(
    (link: { lang: string; ['*']: string; url: string }) => link.lang === 'fr'
  );
  return frLink ? { url: frLink['url'] } : null;
}

/**
 * Normalize a string by decoding URI components, stripping punctuation and diacritics,
 * and replacing spaces with underscores
 */
function normalizeId(str: string): string {
  return decodeURIComponent(str)
    .normalize('NFD')
    .replace(/[\u0300-\u036f]/g, '') // drop the combining marks left behind by NFD
    .replace(/[,:.()]/g, '')
    .replace(/\s+/g, '_')
    .toLowerCase();
}
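// A couple of illustrative examples:
//   normalizeId('Monkey_D._Luffy')  -> 'monkey_d_luffy'
//   normalizeId('Gomu Gomu no Mi')  -> 'gomu_gomu_no_mi'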

/**
 * Fetch all arcs from One Piece fandom using API
 */
async function fetchAllArcs(): Promise<Arc[]> {
  try {
    const apiUrl = `${FANDOM_API_BASE}Chapters_and_Volumes`;
    console.log('Fetching arcs list via API...');
    const response = await fetchWithRetry(apiUrl);
    const jsonData = await response.json();

    // Extract HTML from API response
    const htmlContent = jsonData.parse?.text?.['*'];
    if (!htmlContent) {
      throw new Error('Unable to extract HTML content from API response');
    }

    const $ = cheerio.load(htmlContent);
    const arcs: Arc[] = [];

    const seenArcUrls = new Set<string>();

    // Arc rows are in table cells where first link points to a *_Arc page and text includes chapter range.
    const arcCells = $('table.wikitable td').toArray();
    for (const element of arcCells) {
      const cell = $(element);
      const firstLink = cell.find('a').first();
      const href = firstLink.attr('href') || '';
      let arcName = firstLink.text().trim();

      if (!href.startsWith('/wiki/') || !/_Arc$/i.test(href)) {
        continue;
      }

      if (!arcName || !/\bArc\b/i.test(arcName)) {
        continue;
      }

      arcName = arcName.replace(/\bArc\b/i, '').trim();

      const cleanUrl = href.replace('/wiki/', '');
      if (seenArcUrls.has(cleanUrl)) {
        continue;
      }

      const cellText = cell.text().replace(/\s+/g, ' ').trim();
      const chapterMatch = cellText.match(/Chapters\s+(\d+)\s+to\s+(\d+|Current)/i);
      if (!chapterMatch) {
        continue;
      }

      const startChapter = parseInt(chapterMatch[1], 10);
      const endChapter = /current/i.test(chapterMatch[2]) ? null : parseInt(chapterMatch[2], 10);

      let arcId = normalizeId(cleanUrl);
      arcId = arcId.replace(/_arc$/i, '');

      // Query the href page via API to get the correct HTML content (in case of redirect) and extract the French name from there
      const arcResponse = await fetchWithRetry(`${FANDOM_API_BASE}${cleanUrl}`);
      const arcJsonData = await arcResponse.json();
      let frArcName: string | null =
        arcJsonData.parse?.langlinks?.find(
          (link: { lang: string; ['*']: string }) => link.lang === 'fr'
        )?.['*'] || null;

      // Remove "Arc" suffix from French name if present to keep it consistent with English names (e.g. "Arc de Luffy" becomes "Luffy")
      if (frArcName && /\bArc\b/i.test(frArcName)) {
        frArcName = frArcName.replace(/\bArc\b/i, '').trim();
      }

      arcs.push({
        id: arcId,
        name: arcName,
        frName: frArcName,
        startChapter,
        endChapter,
        url: cleanUrl
      });

      seenArcUrls.add(cleanUrl);
    }

    console.log(`Found ${arcs.length} arcs.`);
    return arcs;
  } catch (error) {
    console.error('Error fetching arcs list:', (error as Error).message);
    return [];
  }
}
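// An entry produced by fetchAllArcs looks roughly like this (values illustrative):
//   { id: 'romance_dawn', name: 'Romance Dawn', frName: 'Romance Dawn',
//     startChapter: 1, endChapter: 7, url: 'Romance_Dawn_Arc' }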

/**
 * Save arcs to JSON
 */
async function saveArcsToJSON(arcs: Arc[]): Promise<void> {
  const filepath = `${OUTPUT_DIR}/arcs.json`;
  fs.writeFileSync(filepath, JSON.stringify(arcs, null, 2));
  console.log(`✓ Saved to ${filepath}`);
}

/**
 * Save arcs to CSV
 */
async function saveArcsToCSV(arcs: Arc[]): Promise<void> {
  const filepath = `${OUTPUT_DIR}/arcs.csv`;
  const csvWriter = createObjectCsvWriter({
    path: filepath,
    header: [
      { id: 'id', title: 'ID' },
      { id: 'name', title: 'Name' },
      { id: 'frName', title: 'French Name' },
      { id: 'startChapter', title: 'Start Chapter' },
      { id: 'endChapter', title: 'End Chapter' },
      { id: 'url', title: 'URL' }
    ]
  });

  const records = arcs
    .filter((arc) => arc !== null)
    .map((arc) => ({
      id: arc.id || '',
      name: arc.name || '',
      frName: arc.frName || '',
      startChapter: arc.startChapter || '',
      endChapter: arc.endChapter || '',
      url: arc.url || ''
    }));

  await csvWriter.writeRecords(records);
  console.log(`✓ Saved to ${filepath}`);
}

/**
 * Fetch all canon characters from One Piece fandom, including their full data.
 */
async function fetchAllCharacters(arcsList: Arc[]): Promise<Character[]> {
  try {
    console.log('Fetching character list via API...');
    const response = await fetchWithRetry(`${FANDOM_API_BASE}List_of_Canon_Characters`);
    const jsonData = await response.json();

    // Extract HTML from API response
    const htmlContent = jsonData.parse?.text?.['*'];
    if (!htmlContent) {
      throw new Error('Unable to extract HTML content from API response');
    }

    const $ = cheerio.load(htmlContent);
    const characterList: CharacterListItem[] = [];
    $('table.fandom-table tbody tr').each((index, element) => {
      if (index === 0) return; // Skip header row
      let charUrl = $(element).find('td:nth-child(2) a').attr('href');
      const charName = $(element).find('td:nth-child(2) a').text().trim();
      let charChapter = $(element).find('td:nth-child(3)').text().trim();

      // Remove parentheses and their content from chapter info (e.g. "1 (flashback)" becomes "1")
      charChapter = charChapter.replace(/\([^)]*\)/g, '');
      charChapter = charChapter.replace(/\D/g, '');

      // If charChapter is empty, skip the character as it means they don't have a proper page and are just mentioned in the list
      if (!charChapter || parseInt(charChapter, 10) === 0) {
        return;
      }

      if (charName.toLowerCase().includes('family')) {
        return;
      }

      if (charUrl) {
        charUrl = charUrl.replace('/wiki/', '');
        characterList.push({
          name: charName,
          url: charUrl,
          chapter: parseInt(charChapter, 10)
        });
      }
    });

    if (characterList.length === 0) {
      console.error('No characters found.');
      return [];
    }
    console.log(`Found ${characterList.length} characters.`);

    // Fetch the French character list to get the picture URLs
    console.log('Fetching French character list via API...');
    const frResponse = await fetchWithRetry(`${FR_FANDOM_API_BASE}Liste_des_Personnages_Canon`);
    const frJsonData = await frResponse.json();

    // Create a map of character name to picture URL from the French list
    const frHtmlContent = frJsonData.parse?.text?.['*'];
    const fr$ = cheerio.load(frHtmlContent);
    const frCharacterPictureMap: Record<string, string> = {};
    fr$('table.wikitable tbody tr').each((index, element) => {
      if (index === 0) return; // Skip header row
      const charName = fr$(element).find('td:nth-child(2) a').text().trim();
      const pictureUrl =
        fr$(element).find('td:nth-child(1) img').attr('data-src') ||
        fr$(element).find('td:nth-child(1) img').attr('src') ||
        null;
      if (charName && pictureUrl) {
        frCharacterPictureMap[charName] = pictureUrl;
      }
    });

    const characters: Character[] = [];
    let failedCharacters: CharacterListItem[] = [...characterList];

    // Keep retrying failed characters until every one has been fetched
    while (failedCharacters.length > 0) {
      const nextFailedCharacters: CharacterListItem[] = [];
      console.log(`\nFetching ${failedCharacters.length} characters...`);

      for (let i = 0; i < failedCharacters.length; i += FETCH_CONCURRENCY) {
        const batch = failedCharacters.slice(i, i + FETCH_CONCURRENCY);
        const batchResults = await Promise.all(
          batch.map(async (char) => {
            const data = await fetchCharacter(
              char.url,
              char.name,
              char.chapter,
              arcsList,
              frCharacterPictureMap
            );
            return { char, data };
          })
        );

        for (const { char, data } of batchResults) {
          if (data) {
            console.table({
              ID: data.id,
              Name: data.name,
              Gender: data.gender,
              Age: data.age,
              Status: data.status,
              Epithets: data.epithets.join(', '),
              Affiliations: data.affiliations.join(', '),
              DevilFruitId: data.devilFruitId,
              DevilFruitUrl: data.devilFruitUrl,
              HakiObservation: data.hakiObservation ? 'Yes' : 'No',
              HakiArmament: data.hakiArmament ? 'Yes' : 'No',
              HakiConqueror: data.hakiConqueror ? 'Yes' : 'No',
              Height: data.height,
              Bounty: data.bounty,
              Origin: data.origin,
              FirstAppearance: data.firstAppearance,
              PictureUrl: data.pictureUrl,
              FandomURL: data.url
            });
            characters.push(data);
          } else {
            nextFailedCharacters.push(char);
          }
        }
      }

      failedCharacters = nextFailedCharacters;
      if (failedCharacters.length > 0) {
        console.log(`⚠️ ${failedCharacters.length} characters failed. Retrying...`);
      }
    }

    console.log(`\n✓ Scraped ${characters.length} characters\n`);
    return characters;
  } catch (error) {
    console.error('Error fetching characters:', (error as Error).message);
    return [];
  }
}

/**
 * Fetch character data from fandom using provided URL
 */
async function fetchCharacter(
  characterUrl: string,
  characterName: string,
  characterChapter: number,
  arcsList: Arc[],
  frCharacterPictureMap: Record<string, string>
): Promise<Character | null> {
  try {
    console.log(`Fetching: ${characterName}...`);

    // Use API to fetch character page
    const apiUrl = `${FANDOM_API_BASE}${characterUrl}`;
    const response = await fetchWithRetry(apiUrl);
    const jsonData = await response.json();

    const categories = jsonData.parse?.categories || [];

    // Extract HTML from API response
    const htmlContent = jsonData.parse?.text?.['*'];
    if (!htmlContent) {
      throw new Error('Unable to extract HTML content from API response');
    }

    const $ = cheerio.load(htmlContent);

    const name = characterName;

    // Generate character ID from URL + name combination
    const finalCharacterId = normalizeId(characterUrl + '_' + name);

    // Extract gender from JSON categories
    let gender: string | null = null;
    for (const cat of categories) {
      const catName = cat['*'] || '';
      if (
        catName === 'Male_Characters' ||
        catName === 'Kings' ||
        catName === 'Princes' ||
        catName === 'Former_Kings' ||
        catName === 'Former_Princes'
      ) {
        gender = 'Male';
        break;
      } else if (
        catName === 'Female_Characters' ||
        catName === 'Queens' ||
        catName === 'Princesses' ||
        catName === 'Former_Queens' ||
        catName === 'Former_Princesses'
      ) {
        gender = 'Female';
        break;
      }
    }

    // Extract age
    const age = extractAge($);

    // Extract affiliations
    const affiliations = await extractAffiliations($, 'en');

    // Extract epithets
    const epithets = extractEpithets($);

    // Extract devil fruit
    const devilFruitData = await extractDevilFruit($);
    const devilFruitId = devilFruitData?.devilFruitId || null;
    const devilFruitUrl = devilFruitData?.devilFruitUrl || null;

    // Extract haki from JSON categories
    let hakiObservation = false;
    let hakiArmament = false;
    let hakiConqueror = false;
    for (const cat of categories) {
      const catName = cat['*'] || '';
      if (catName === 'Observation_Haki_Users') {
        hakiObservation = true;
      } else if (catName === 'Armament_Haki_Users') {
        hakiArmament = true;
      } else if (catName === 'Supreme_King_Haki_Users') {
        hakiConqueror = true;
      }
    }

    // Extract bounty
    const bounty = extractBounty($);

    // Extract height
    const height = extractHeight($);

    // Use chapter from character list
    const firstAppearance = characterChapter;

    // Extract origin
    const origin = extractOrigin($);

    // Extract status
    const status = extractStatus($);

    // Match the first appearance chapter to an arc
    let arcId = '';
    const arc = arcsList.find(
      (a) =>
        a.startChapter <= firstAppearance &&
        (a.endChapter === null || a.endChapter >= firstAppearance)
    );
    if (!arc) {
      return null;
    }
    arcId = arc.id;

    const frLink = getFrLink(jsonData.parse?.langlinks || []);
    const frUrl = frLink ? frLink.url.replace('https://onepiece.fandom.com/fr/wiki/', '') : null;
    const frjsonData = frUrl
      ? await fetchWithRetry(`${FR_FANDOM_API_BASE}${frUrl}`).then((res) => res.json())
      : null;

    let frName = frjsonData?.parse?.title || null;

    const frAffiliations = frjsonData
      ? await extractAffiliations(cheerio.load(frjsonData.parse?.text?.['*'] || ''), 'fr')
      : null;

    const frEpithets = frjsonData
      ? extractEpithets(cheerio.load(frjsonData.parse?.text?.['*'] || ''))
      : null;

    const frOrigin = frjsonData
      ? extractOrigin(cheerio.load(frjsonData.parse?.text?.['*'] || ''))
      : null;

    if (name !== jsonData.parse?.title) {
      frName = name;
    }

    const pictureUrl = frCharacterPictureMap[frName || ''] || null;

    return {
      id: finalCharacterId,
      name,
      frName,
      gender,
      age,
      height,
      origin,
      frOrigin,
      devilFruitId,
      devilFruitUrl,
      affiliations,
      frAffiliations,
      bounty,
      hakiObservation,
      hakiArmament,
      hakiConqueror,
      epithets,
      frEpithets,
      firstAppearance,
      arcId,
      status,
      pictureUrl,
      url: characterUrl,
      frUrl
    };
  } catch (error) {
    console.error(`Error fetching ${characterName}:`, (error as Error).message);
    return null;
  }
}

/**
 * Extract age from infobox
 */
function extractAge($: cheerio.CheerioAPI): number | null {
  const div = $('[data-source="age"] .pi-data-value');
  if (div.length === 0) return null;

  let text = div.html();
  if (!text) return null;

  // Remove all sup blocks (citations)
  text = text.replace(/<sup[^>]*>.*?<\/sup>/gi, '');

  // Get the last element and extract only digits
  const parts = text.split('<br');
  const lastPart = parts[parts.length - 1];
  let cleanText = lastPart.replace(/<[^>]*>/g, '').trim();

  // Remove content with parentheses
  cleanText = cleanText.replace(/\([^)]*\)/g, '');

  const digitsOnly = cleanText.replace(/\D/g, '');
  return parseInt(digitsOnly, 10) || null;
}

/**
 * Extract affiliations from infobox
 */
async function extractAffiliations($: cheerio.CheerioAPI, lang: string): Promise<string[]> {
  const div = $('[data-source="affiliation"] .pi-data-value');
  if (div.length === 0) return [];

  const cleanedDiv = div.clone();
  cleanedDiv.find('sup').remove();

  const text = cleanedDiv.html();
  if (!text) return [];

  // Resolve affiliations from linked page titles.
  const links = cleanedDiv.find('a').toArray();
  if (links.length > 0) {
    const linkValues = await Promise.all(
      links.map(async (el) => {
        const href = $(el).attr('href') || '';
        const resolvedTitle = await fetchWithRetry(
          `${lang === 'fr' ? FR_FANDOM_API_BASE : FANDOM_API_BASE}${href.replace('/fr/wiki/', '').replace('/wiki/', '')}`
        )
          .then((res) => res.json())
          .then((json) => json.parse?.title)
          .catch(() => null);

        if (resolvedTitle) {
          return resolvedTitle;
        }

        return $(el).text().trim();
      })
    );

    const uniqueLinks = Array.from(new Set(linkValues.filter(Boolean)));
    if (uniqueLinks.length > 0) {
      return uniqueLinks;
    }
  }

  // Fallback to parsing text
  const cleanText = text.replace(/<[^>]*>/g, '').trim();
  const parts = cleanText.split(/\s*\n\s*|\s*;\s*|\s*,\s*/).filter(Boolean);
  return parts.length > 0 ? parts : [];
}

/**
 * Extract epithets from infobox
 * Handles both quoted and unquoted epithets, keeping only the main/latest readable values.
 */
function extractEpithets($: cheerio.CheerioAPI): string[] {
  const div = $(
    '[data-source="epithet"] .pi-data-value, [data-source="épithète"] .pi-data-value'
  ).first();
  if (div.length === 0) return [];

  const cleanedDiv = div.clone();
  cleanedDiv.find('sup').remove();

  const html = cleanedDiv.html();
  if (!html) return [];

  const plainText = html.replace(/<br\s*\/?\s*>/gi, '\n').replace(/<[^>]*>/g, '');

  const lines = plainText
    .split('\n')
    .map((line) => line.trim())
    .filter(Boolean);

  const epithets = lines
    .map((line) => {
      const normalized = line.replace(/\s+/g, ' ').trim();

      // Prefer explicit quoted epithet if present.
      const quotedMatch = normalized.match(/["«“](.*?)["»”]/);
      if (quotedMatch?.[1]) {
        return quotedMatch[1].trim();
      }

      // Otherwise keep only the base epithet text before extra notes/translations.
      return normalized
        .split(/[;(]/)[0]
        .replace(/["'«»“”]/g, '')
        .trim();
    })
    .filter(Boolean);

  return Array.from(new Set(epithets));
}

/**
 * Extract devil fruit from infobox
 * Returns the normalized fruit ID and the canonical page title, which is reused
 * as the page parameter for later API lookups
 */
async function extractDevilFruit($: cheerio.CheerioAPI): Promise<DevilFruitData | null> {
  const link = $('[data-source="dfname"] .pi-data-value a').first();
  if (link.length === 0) return null;

  const href = link.attr('href');
  if (!href || !href.startsWith('/wiki/')) return null;

  const cleanUrl = href.replace('/wiki/', '');

  // Query the devil fruit page via API to get the correct HTML content (in case of redirect) and extract the type from there
  const dfResponse = await fetchWithRetry(`${FANDOM_API_BASE}${cleanUrl}`);
  const dfJsonData = await dfResponse.json();
  const fruitTitle = dfJsonData.parse?.title || '';

  return {
    devilFruitId: normalizeId(fruitTitle),
    devilFruitUrl: fruitTitle
  };
}

/**
 * Extract bounty from infobox (0 when no bounty is listed)
 */
function extractBounty($: cheerio.CheerioAPI): number {
  const div = $('[data-source="bounty"] .pi-data-value');
  if (div.length === 0) return 0;

  const cleanedDiv = div.clone();
  // Drop references and old crossed-out bounty values.
  cleanedDiv.find('sup, s, del, strike').remove();

  const text = cleanedDiv.text().replace(/\s+/g, ' ').trim();
  if (!text) return 0;

  // Parse the first amount token (e.g. "3,189,000,000"), which is the active bounty.
  const amountMatch = text.match(/\d{1,3}(?:[\s,.'’]\d{3})+|\d+/);
  if (!amountMatch) return 0;

  const digits = amountMatch[0].replace(/\D/g, '');
  if (!digits) return 0;

  const value = Number(digits);
  return Number.isFinite(value) ? value : 0;
}

/**
 * Extract height from infobox
 */
function extractHeight($: cheerio.CheerioAPI): number | null {
  const div = $('[data-source="height"] .pi-data-value');
  if (div.length === 0) return null;

  let text = div.html();
  if (!text) return null;

  // Remove all sup blocks (citations)
  text = text.replace(/<sup[^>]*>.*?<\/sup>/gi, '');

  // Convert line breaks to new lines so we can reliably pick the latest value.
  const textWithNewLines = text.replace(/<br\s*\/?\s*>/gi, '\n');
  const lines = textWithNewLines
    .replace(/<[^>]*>/g, '')
    .split('\n')
    .map((line) => line.trim())
    .filter(Boolean);

  // Keep only lines that look like a height value, then pick the latest one.
  const heightLines = lines.filter((line) => /\d/.test(line) && /(cm|m)/i.test(line));
  const latestLine =
    heightLines.length > 0 ? heightLines[heightLines.length - 1] : lines[lines.length - 1];
  if (!latestLine) return null;

  // Remove descriptive suffixes like "(post-timeskip)".
  const cleanText = latestLine.replace(/\([^)]*\)/g, '').trim();
  const normalized = cleanText.toLowerCase().replace(/\s/g, '');

  // Values are stored in meters in this dataset.
  const cmMatch = normalized.match(/(\d+(?:[.,]\d+)?)cm/);
  if (cmMatch) {
    const cm = parseFloat(cmMatch[1].replace(',', '.'));
    return Number.isFinite(cm) ? cm / 100 : null;
  }

  const mMatch = normalized.match(/(\d+(?:[.,]\d+)?)m/);
  if (mMatch) {
    const meters = parseFloat(mMatch[1].replace(',', '.'));
    return Number.isFinite(meters) ? meters : null;
  }

  return null;
}
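// e.g. an infobox line of "180 cm (post-timeskip)" yields 1.8, and "1.81 m" yields 1.81.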

/**
 * Extract origin from infobox
 */
function extractOrigin($: cheerio.CheerioAPI): string | null {
  const div = $(
    '[data-source="origin"] .pi-data-value, [data-source="origine"] .pi-data-value'
  ).first();
  if (div.length === 0) return null;

  let text = div.html();
  if (!text) return null;

  // Remove all sup blocks (citations)
  text = text.replace(/<sup[^>]*>.*?<\/sup>/gi, '');

  // Extract the first value before any <br> tag
  const firstValue = text.split('<br')[0].trim();
  let cleanText = firstValue.replace(/<[^>]*>/g, '').trim();

  // Remove content with parentheses
  cleanText = cleanText.replace(/\([^)]*\)/g, '').trim();

  return cleanText || null;
}

/**
 * Extract status from infobox
 */
function extractStatus($: cheerio.CheerioAPI): string | null {
  const div = $('[data-source="status"] .pi-data-value');
  if (div.length === 0) return null;

  const statusText = div.text().trim().toLowerCase();

  if (statusText.includes('alive')) {
    return 'Alive';
  } else if (statusText.includes('deceased')) {
    return 'Dead';
  } else if (statusText.includes('unknown')) {
    return 'Unknown';
  }

  return 'Alive';
}

/**
 * Save data to JSON
 */
async function saveToJSON(characters: Character[]): Promise<void> {
  const filepath = `${OUTPUT_DIR}/characters.json`;
  fs.writeFileSync(filepath, JSON.stringify(characters, null, 2));
  console.log(`✓ Saved to ${filepath}`);
}

/**
 * Save data to CSV
 */
async function saveToCSV(characters: Character[]): Promise<void> {
  const filepath = `${OUTPUT_DIR}/characters.csv`;
  const csvWriter = createObjectCsvWriter({
    path: filepath,
    header: [
      { id: 'id', title: 'ID' },
      { id: 'name', title: 'Name' },
      { id: 'gender', title: 'Gender' },
      { id: 'age', title: 'Age' },
      { id: 'height', title: 'Height' },
      { id: 'origin', title: 'Origin' },
      { id: 'status', title: 'Status' },
      { id: 'epithets', title: 'Epithets' },
      { id: 'devilFruitId', title: 'Devil Fruit ID' },
      { id: 'affiliations', title: 'Affiliations' },
      { id: 'bounty', title: 'Bounty' },
      { id: 'hakiObservation', title: 'Haki Observation' },
      { id: 'hakiArmament', title: 'Haki Armament' },
      { id: 'hakiConqueror', title: 'Haki Conqueror' },
      { id: 'firstAppearance', title: 'First Appearance' },
      { id: 'arcId', title: 'Arc ID' },
      { id: 'pictureUrl', title: 'Image URL' },
      { id: 'url', title: 'Fandom URL' }
    ]
  });

  const records = characters
    .filter((c) => c !== null)
    .map((c) => ({
      id: c.id || '',
      name: c.name || '',
      gender: c.gender || '',
      age: c.age || '',
      height: c.height || '',
      origin: c.origin || '',
      status: c.status || '',
      epithets: Array.isArray(c.epithets) ? c.epithets.join(', ') : c.epithets || '',
      devilFruitId: c.devilFruitId || '',
      affiliations: Array.isArray(c.affiliations)
        ? c.affiliations.join(', ')
        : c.affiliations || '',
      bounty: c.bounty ?? 0,
      hakiObservation: c.hakiObservation ? 1 : 0,
      hakiArmament: c.hakiArmament ? 1 : 0,
      hakiConqueror: c.hakiConqueror ? 1 : 0,
      firstAppearance: c.firstAppearance || '',
      arcId: c.arcId || '',
      pictureUrl: c.pictureUrl || '',
      url: c.url || ''
    }));

  await csvWriter.writeRecords(records);
  console.log(`✓ Saved to ${filepath}`);
}

/**
 * Fetch devil fruit data from fandom using provided URL
 */
async function fetchDevilFruit(
  devilFruitUrl: string,
  devilFruitId: string
): Promise<DevilFruit | null> {
  try {
    console.log(`Fetching devil fruit: ${devilFruitUrl}...`);

    // Use API to fetch devil fruit page
    const apiUrl = `${FANDOM_API_BASE}${devilFruitUrl}`;
    const response = await fetchWithRetry(apiUrl);
    const jsonData = await response.json();

    // Extract HTML from API response
    const htmlContent = jsonData.parse?.text?.['*'];
    if (!htmlContent) {
      throw new Error('Unable to extract HTML content from API response');
    }

    const name = jsonData.parse?.title || devilFruitId.replace(/_/g, ' ');

    let type: string | null = null;
    // Determine type based on categories (if categories contain "Paramecia", "Zoan", "Logia" or "Smile")
    if (jsonData.parse?.categories) {
      const categories = jsonData.parse.categories.map((cat: { ['*']: string }) =>
        String(cat['*'] || '').toLowerCase()
      );

      if (categories.some((category: string) => category.includes('paramecia'))) {
        type = 'Paramecia';
      } else if (categories.some((category: string) => category.includes('zoan'))) {
        type = 'Zoan';
      } else if (categories.some((category: string) => category.includes('logia'))) {
        type = 'Logia';
      } else if (categories.some((category: string) => category.includes('smile'))) {
        type = 'Smile';
      }
    }

    return {
      id: devilFruitId,
      name,
      type,
      url: devilFruitUrl
    };
  } catch (error) {
    console.error(`Error fetching devil fruit ${devilFruitUrl}:`, (error as Error).message);
    return null;
  }
}

/**
 * Save devil fruits to JSON
 */
async function saveDevilFruitsToJSON(devilFruits: DevilFruit[]): Promise<void> {
  const filepath = `${OUTPUT_DIR}/devil-fruits.json`;
  fs.writeFileSync(filepath, JSON.stringify(devilFruits, null, 2));
  console.log(`✓ Saved to ${filepath}`);
}

/**
 * Save devil fruits to CSV
 */
async function saveDevilFruitsToCSV(devilFruits: DevilFruit[]): Promise<void> {
  const filepath = `${OUTPUT_DIR}/devil-fruits.csv`;
  const csvWriter = createObjectCsvWriter({
    path: filepath,
    header: [
      { id: 'id', title: 'ID' },
      { id: 'name', title: 'Name' },
      { id: 'type', title: 'Type' },
      { id: 'url', title: 'URL' }
    ]
  });

  const records = devilFruits
    .filter((df) => df !== null)
    .map((df) => ({
      id: df.id || '',
      name: df.name || '',
      type: df.type || '',
      url: df.url || ''
    }));

  await csvWriter.writeRecords(records);
  console.log(`✓ Saved to ${filepath}`);
}

/**
 * Main execution
 */
async function main(): Promise<void> {
  const format = process.argv[2] || 'all'; // json, csv, or all

  console.log(`\nOne Piece Scraper - Mode: ${format}\n`);

  // Step 1: Scraping Arcs
  console.log('=== Step 1: Scraping Arcs ===\n');
  const arcsList = await fetchAllArcs();

  if (arcsList.length > 0) {
    // Display arcs in table format
    arcsList.forEach((arc) => {
      console.table({
        ID: arc.id,
        Name: arc.name,
        FrenchName: arc.frName || '',
        StartChapter: arc.startChapter,
        EndChapter: arc.endChapter || 'Ongoing',
        URL: arc.url
      });
    });

    console.log(`\n✓ Found ${arcsList.length} arcs\n`);

    if (format === 'json' || format === 'all') {
      await saveArcsToJSON(arcsList);
    }
    if (format === 'csv' || format === 'all') {
      await saveArcsToCSV(arcsList);
    }
  } else {
    console.warn('No arcs found, continuing...\n');
  }

  // Step 2: Scraping Characters
  console.log('=== Step 2: Scraping Characters ===\n');
  const characters = await fetchAllCharacters(arcsList);

  if (characters.length === 0) {
    console.error('No characters found. Exiting.');
    return;
  }

  const devilFruitUrls = new Set<string>(
    characters.filter((c) => c.devilFruitUrl).map((c) => c.devilFruitUrl!)
  );
  console.log(`✓ Found ${devilFruitUrls.size} unique devil fruits\n`);

  // Step 3: Scraping Devil Fruits
  console.log('=== Step 3: Scraping Devil Fruits ===\n');

  if (devilFruitUrls.size === 0) {
    console.warn('No devil fruits found from characters, skipping...\n');
  } else {
    const devilFruits: DevilFruit[] = [];
    const devilFruitUrlArray = Array.from(devilFruitUrls);

    for (let i = 0; i < devilFruitUrlArray.length; i += FETCH_CONCURRENCY) {
      const batch = devilFruitUrlArray.slice(i, i + FETCH_CONCURRENCY);
      const batchResults = await Promise.all(
        batch.map(async (url) => {
          const data = await fetchDevilFruit(url, normalizeId(url));
          return { url, data };
        })
      );

      for (const { data } of batchResults) {
        if (data) {
          console.table({
            ID: data.id,
            Name: data.name,
            Type: data.type,
            URL: data.url
          });

          devilFruits.push(data);
        }
      }
    }

    console.log(`\n✓ Scraped ${devilFruits.length} devil fruits\n`);

    if (format === 'json' || format === 'all') {
      await saveDevilFruitsToJSON(devilFruits);
    }
    if (format === 'csv' || format === 'all') {
      await saveDevilFruitsToCSV(devilFruits);
    }

    // Update characters with normalized devil fruit IDs
    const devilFruitMap = new Map<string, string>(devilFruits.map((df) => [df.id, df.id]));
    characters.forEach((char) => {
      if (char.devilFruitUrl) {
        const normalizedId = normalizeId(char.devilFruitUrl);
        char.devilFruitId = devilFruitMap.get(normalizedId) || normalizedId;
      }
    });
  }

  // Save characters after devil fruit IDs are updated
  if (format === 'json' || format === 'all') {
    await saveToJSON(characters);
  }
  if (format === 'csv' || format === 'all') {
    await saveToCSV(characters);
  }

  console.log('\n✓ Done!\n');
}

main().catch(console.error);
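// Typical invocation (assuming this file is saved as scraper.ts and a TypeScript
// runner such as tsx or ts-node is available):
//   npx tsx scraper.ts        # write both JSON and CSV to ./scraped-data
//   npx tsx scraper.ts csv    # CSV only
//   npx tsx scraper.ts json   # JSON only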