Refactor database schema and update scraping logic for One Piece characters and arcs
- Updated database schema to include French names and adjusted field names for consistency.
- Modified scraping script to fetch and store French names for arcs and characters.
- Improved API calls to handle redirects and fetch additional data for characters.
- Enhanced data extraction methods for character attributes and devil fruits.
- Cleaned up code for better readability and maintainability.
This commit is contained in:
@@ -6,6 +6,7 @@ import { createObjectCsvWriter } from 'csv-writer';
|
||||
interface Arc {
|
||||
id: string;
|
||||
name: string;
|
||||
frName: string | null;
|
||||
startChapter: number;
|
||||
endChapter: number | null;
|
||||
url: string;
|
||||
@@ -14,30 +15,34 @@ interface Arc {
|
||||
interface Character {
|
||||
id: string;
|
||||
name: string;
|
||||
frName: string | null;
|
||||
gender: string | null;
|
||||
age: number | null;
|
||||
height: number | null;
|
||||
origin: string | null;
|
||||
frOrigin: string | null;
|
||||
devilFruitId: string | null;
|
||||
devilFruitUrl: string | null;
|
||||
affiliations: string[];
|
||||
frAffiliations: string[] | null;
|
||||
bounty: number | null;
|
||||
hakiObservation: boolean;
|
||||
hakiArmament: boolean;
|
||||
hakiConqueror: boolean;
|
||||
epithets: string[];
|
||||
frEpithets: string[] | null;
|
||||
firstAppearance: number;
|
||||
status: string | null;
|
||||
pictureUrl: string | null;
|
||||
url: string;
|
||||
arcId?: string;
|
||||
frUrl: string | null;
|
||||
arcId: string;
|
||||
}
|
||||
|
||||
interface CharacterListItem {
|
||||
name: string;
|
||||
url: string;
|
||||
pictureUrl: string | null;
|
||||
chapter: string;
|
||||
chapter: number;
|
||||
}
|
||||
|
||||
interface DevilFruitData {
|
||||
@@ -52,30 +57,13 @@ interface DevilFruit {
|
||||
url: string;
|
||||
}
|
||||
|
||||
const FANDOM_API_BASE = 'https://onepiece.fandom.com/fr/api.php?action=parse&format=json&page=';
|
||||
const FANDOM_API_BASE = 'https://onepiece.fandom.com/api.php?action=parse&redirects=true&format=json&page=';
|
||||
const FR_FANDOM_API_BASE = 'https://onepiece.fandom.com/fr/api.php?action=parse&redirects=true&format=json&page=';
|
||||
const OUTPUT_DIR = './scraped-data';
|
||||
const MAX_RETRIES = 0; // Set to 0 to disable retries, can be increased if needed
|
||||
const INITIAL_RETRY_DELAY = 1000;
|
||||
const FETCH_CONCURRENCY = 50;
|
||||
|
||||
// Store cookies across requests (simulate browser behavior)
|
||||
const cookies = new Map<string, string>();
|
||||
|
||||
function getCookieHeader(): string {
|
||||
const cookieArray = Array.from(cookies.values()).map(c => c.split(';')[0]);
|
||||
return cookieArray.length > 0 ? cookieArray.join('; ') : '';
|
||||
}
|
||||
|
||||
function saveCookies(setCookieHeader: string | string[] | null): void {
|
||||
if (setCookieHeader) {
|
||||
const cookiesList = Array.isArray(setCookieHeader) ? setCookieHeader : [setCookieHeader];
|
||||
cookiesList.forEach(cookie => {
|
||||
const [nameValue] = cookie.split(';');
|
||||
const [name] = nameValue.split('=');
|
||||
if (name) cookies.set(name, cookie);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// Create output directory
|
||||
if (!fs.existsSync(OUTPUT_DIR)) {
|
||||
@@ -94,23 +82,11 @@ async function fetchWithRetry(url: string, options: RequestInit = {}, retries: n
|
||||
'Connection': 'keep-alive',
|
||||
...((options.headers as Record<string, string>) || {})
|
||||
};
|
||||
|
||||
// Add cookies from previous requests
|
||||
const cookieHeader = getCookieHeader();
|
||||
if (cookieHeader) {
|
||||
headers['Cookie'] = cookieHeader;
|
||||
}
|
||||
|
||||
const response = await fetch(url, {
|
||||
headers,
|
||||
...options
|
||||
} as any);
|
||||
|
||||
// Save cookies from response
|
||||
const setCookie = response.headers.get('set-cookie');
|
||||
if (setCookie) {
|
||||
saveCookies(setCookie);
|
||||
}
|
||||
});
|
||||
|
||||
// Check if response is OK (status 200-299)
|
||||
if (response.ok) {
|
||||
@@ -141,6 +117,16 @@ async function fetchWithRetry(url: string, options: RequestInit = {}, retries: n
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the French link from the API response links array
|
||||
*/
|
||||
|
||||
function getFrLink(links: { lang: string; ['*']: string; url: string }[]): { url: string } | null {
|
||||
// Get french url by getting parse.langlinks where lang is "fr" and extract the name from there
|
||||
const frLink = links.find((link: { lang: string; ['*']: string; url: string }) => link.lang === 'fr');
|
||||
return frLink ? { url: frLink['url'] } : null;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Normalize string by decoding URI components, removing punctuation, and replacing spaces with underscores
|
||||
@@ -148,7 +134,7 @@ async function fetchWithRetry(url: string, options: RequestInit = {}, retries: n
|
||||
function normalizeId(str: string): string {
|
||||
return decodeURIComponent(str)
|
||||
.normalize('NFD')
|
||||
.replace(/[,:.\(\)]/g, '')
|
||||
.replace(/[,:.()]/g, '')
|
||||
.replace(/\s+/g, '_')
|
||||
.toLowerCase();
|
||||
}
|
||||
@@ -158,10 +144,10 @@ function normalizeId(str: string): string {
|
||||
*/
|
||||
async function fetchAllArcs(): Promise<Arc[]> {
|
||||
try {
|
||||
const apiUrl = `${FANDOM_API_BASE}Chapitres_et_Tomes`;
|
||||
const apiUrl = `${FANDOM_API_BASE}Chapters_and_Volumes`;
|
||||
console.log('Fetching arcs list via API...');
|
||||
const response = await fetchWithRetry(apiUrl);
|
||||
const jsonData = await response.json() as any;
|
||||
const jsonData = await response.json();
|
||||
|
||||
// Extract HTML from API response
|
||||
const htmlContent = jsonData.parse?.text?.['*'];
|
||||
@@ -172,40 +158,66 @@ async function fetchAllArcs(): Promise<Arc[]> {
|
||||
const $ = cheerio.load(htmlContent);
|
||||
const arcs: Arc[] = [];
|
||||
|
||||
// Find all arc links in the table
|
||||
$('table.wikitable td a').each((index, element) => {
|
||||
const text = $(element).text().trim();
|
||||
const href = $(element).attr('href');
|
||||
|
||||
// Check if it's an arc link (contains "Arc" and chapter info)
|
||||
if (text.includes('Arc') && text.includes('Ch.') && href) {
|
||||
// Extract arc name and chapter range
|
||||
// Example text: "Arc Ville d'Orange(Ch.8 à 21)[T.1 à 3]"
|
||||
console.log(`Processing arc link: ${text} (${href})`);
|
||||
const nameMatch = text.match(/^(.*?Arc.*?)\s*\(Ch\.(\d+)(?:\s*à\s*(?:(\d+)|(?:...)))?\)/);
|
||||
if (nameMatch) {
|
||||
let arcName = nameMatch[1].trim();
|
||||
// Remove "Arc " from the name
|
||||
arcName = arcName.replace(/^Arc\s+/i, '');
|
||||
|
||||
const startChapter = parseInt(nameMatch[2]);
|
||||
const endChapter = nameMatch[3] ? parseInt(nameMatch[3]) : null;
|
||||
const seenArcUrls = new Set<string>();
|
||||
|
||||
// Generate arc ID by normalizing the url
|
||||
let arcId = normalizeId(href.replace('/fr/wiki/', ''));
|
||||
// Remove "Arc_" from the id
|
||||
arcId = arcId.replace(/^arc_/i, '');
|
||||
// Arc rows are in table cells where first link points to a *_Arc page and text includes chapter range.
|
||||
const arcCells = $('table.wikitable td').toArray();
|
||||
for (const element of arcCells) {
|
||||
const cell = $(element);
|
||||
const firstLink = cell.find('a').first();
|
||||
const href = firstLink.attr('href') || '';
|
||||
let arcName = firstLink.text().trim();
|
||||
|
||||
arcs.push({
|
||||
id: arcId,
|
||||
name: arcName,
|
||||
startChapter,
|
||||
endChapter,
|
||||
url: href.replace('/fr/wiki/', '')
|
||||
});
|
||||
}
|
||||
if (!href.startsWith('/wiki/') || !/_Arc$/i.test(href)) {
|
||||
continue;
|
||||
}
|
||||
});
|
||||
|
||||
if (!arcName || !/\bArc\b/i.test(arcName)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
arcName = arcName.replace(/\bArc\b/i, '').trim();
|
||||
|
||||
const cleanUrl = href.replace('/wiki/', '');
|
||||
if (seenArcUrls.has(cleanUrl)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const cellText = cell.text().replace(/\s+/g, ' ').trim();
|
||||
const chapterMatch = cellText.match(/Chapters\s+(\d+)\s+to\s+(\d+|Current)/i);
|
||||
if (!chapterMatch) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const startChapter = parseInt(chapterMatch[1], 10);
|
||||
const endChapter = /current/i.test(chapterMatch[2])
|
||||
? null
|
||||
: parseInt(chapterMatch[2], 10);
|
||||
|
||||
let arcId = normalizeId(cleanUrl);
|
||||
arcId = arcId.replace(/_arc$/i, '');
|
||||
|
||||
// Query the href page via API to get the correct HTML content (in case of redirect) and extract the French name from there
|
||||
const arcResponse = await fetchWithRetry(`${FANDOM_API_BASE}${cleanUrl}`);
|
||||
const arcJsonData = await arcResponse.json();
|
||||
let frArcName: string | null = arcJsonData.parse?.langlinks.find((link: { lang: string; ['*']: string }) => link.lang === 'fr')?.['*'] || null;
|
||||
|
||||
// Remove "Arc" suffix from French name if present to keep it consistent with English names (e.g. "Arc de Luffy" becomes "Luffy")
|
||||
if (frArcName && /\bArc\b/i.test(frArcName)) {
|
||||
frArcName = frArcName.replace(/\bArc\b/i, '').trim();
|
||||
}
|
||||
|
||||
arcs.push({
|
||||
id: arcId,
|
||||
name: arcName,
|
||||
frName: frArcName,
|
||||
startChapter,
|
||||
endChapter,
|
||||
url: cleanUrl
|
||||
});
|
||||
|
||||
seenArcUrls.add(cleanUrl);
|
||||
}
|
||||
|
||||
console.log(`Found ${arcs.length} arcs.`);
|
||||
return arcs;
|
||||
@@ -234,6 +246,7 @@ async function saveArcsToCSV(arcs: Arc[]): Promise<void> {
|
||||
header: [
|
||||
{ id: 'id', title: 'ID' },
|
||||
{ id: 'name', title: 'Name' },
|
||||
{ id: 'frName', title: 'French Name' },
|
||||
{ id: 'startChapter', title: 'Start Chapter' },
|
||||
{ id: 'endChapter', title: 'End Chapter' },
|
||||
{ id: 'url', title: 'URL' }
|
||||
@@ -245,6 +258,7 @@ async function saveArcsToCSV(arcs: Arc[]): Promise<void> {
|
||||
.map((arc) => ({
|
||||
id: arc.id || '',
|
||||
name: arc.name || '',
|
||||
frName: arc.frName || '',
|
||||
startChapter: arc.startChapter || '',
|
||||
endChapter: arc.endChapter || '',
|
||||
url: arc.url || ''
|
||||
@@ -259,10 +273,10 @@ async function saveArcsToCSV(arcs: Arc[]): Promise<void> {
|
||||
*/
|
||||
async function fetchAllCharactersUrl(): Promise<CharacterListItem[]> {
|
||||
try {
|
||||
const apiUrl = `${FANDOM_API_BASE}Liste_des_Personnages_Canon`;
|
||||
const apiUrl = `${FANDOM_API_BASE}List_of_Canon_Characters`;
|
||||
console.log('Fetching character list via API...');
|
||||
const response = await fetchWithRetry(apiUrl);
|
||||
const jsonData = await response.json() as any;
|
||||
const jsonData = await response.json();
|
||||
|
||||
// Extract HTML from API response
|
||||
const htmlContent = jsonData.parse?.text?.['*'];
|
||||
@@ -272,11 +286,10 @@ async function fetchAllCharactersUrl(): Promise<CharacterListItem[]> {
|
||||
|
||||
const $ = cheerio.load(htmlContent);
|
||||
const characters: CharacterListItem[] = [];
|
||||
$('table.wikitable tbody tr').each((index, element) => {
|
||||
$('table.fandom-table tbody tr').each((index, element) => {
|
||||
if (index === 0) return; // Skip header row
|
||||
let charpictureUrl = $(element).find('td:nth-child(1) a img').attr('data-src') || $(element).find('td:nth-child(1) a img').attr('src');
|
||||
let charUrl = $(element).find('td:nth-child(2) a').attr('href');
|
||||
let charName = $(element).find('td:nth-child(2) a').text().trim();
|
||||
const charName = $(element).find('td:nth-child(2) a').text().trim();
|
||||
let charChapter = $(element).find('td:nth-child(3)').text().trim();
|
||||
|
||||
// Remove parentheses and their content from chapter info (e.g. "1 (flashback)" becomes "1")
|
||||
@@ -288,13 +301,16 @@ async function fetchAllCharactersUrl(): Promise<CharacterListItem[]> {
|
||||
return;
|
||||
}
|
||||
|
||||
if (parseInt(charChapter, 10) === 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (charUrl) {
|
||||
charUrl = charUrl.replace('/fr/wiki/', '');
|
||||
charUrl = charUrl.replace('/wiki/', '');
|
||||
characters.push({
|
||||
name: charName,
|
||||
url: charUrl,
|
||||
pictureUrl: charpictureUrl || null,
|
||||
chapter: charChapter,
|
||||
chapter: parseInt(charChapter, 10)
|
||||
});
|
||||
}
|
||||
});
|
||||
@@ -312,27 +328,17 @@ async function fetchAllCharactersUrl(): Promise<CharacterListItem[]> {
|
||||
async function fetchCharacter(
|
||||
characterUrl: string,
|
||||
characterName: string,
|
||||
characterpictureUrl: string | null,
|
||||
characterChapter: string
|
||||
characterChapter: number,
|
||||
arcsList: Arc[],
|
||||
): Promise<Character | null> {
|
||||
try {
|
||||
console.log(`Fetching: ${characterName}...`);
|
||||
|
||||
// Use API to fetch character page
|
||||
const apiUrl = `${FANDOM_API_BASE}${characterUrl}`;
|
||||
let response = await fetchWithRetry(apiUrl);
|
||||
const response = await fetchWithRetry(apiUrl);
|
||||
const jsonData = await response.json();
|
||||
|
||||
let jsonData = await response.json() as any;
|
||||
|
||||
// Use final page name from API (if parse.limks contains one element, it means the original page was a redirect, so we use the the element 0 as the final URL, otherwise we use the original URL)
|
||||
let finalCharacterUrl = characterUrl;
|
||||
if (jsonData.parse?.links?.length === 1) {
|
||||
finalCharacterUrl = jsonData.parse.links[0]['*'];
|
||||
// Query the API again with the final URL to get the correct HTML content (in case of redirect)
|
||||
response = await fetchWithRetry(`${FANDOM_API_BASE}${finalCharacterUrl}`);
|
||||
jsonData = await response.json() as any;
|
||||
}
|
||||
|
||||
const categories = jsonData.parse?.categories || [];
|
||||
|
||||
// Extract HTML from API response
|
||||
@@ -346,16 +352,16 @@ async function fetchCharacter(
|
||||
const name = characterName;
|
||||
|
||||
// Generate character ID from URL + name combination
|
||||
const finalCharacterId = normalizeId(finalCharacterUrl + '_' + name);
|
||||
const finalCharacterId = normalizeId(characterUrl + '_' + name);
|
||||
|
||||
// Extract gender from JSON categories
|
||||
let gender: string | null = null;
|
||||
for (const cat of categories) {
|
||||
const catName = cat['*'] || '';
|
||||
if (catName === 'Personnages_Masculins') {
|
||||
if (catName === 'Male_Characters') {
|
||||
gender = 'Male';
|
||||
break;
|
||||
} else if (catName === 'Personnages_Féminins') {
|
||||
} else if (catName === 'Female_Characters') {
|
||||
gender = 'Female';
|
||||
break;
|
||||
}
|
||||
@@ -381,11 +387,11 @@ async function fetchCharacter(
|
||||
let hakiConqueror = false;
|
||||
for (const cat of categories) {
|
||||
const catName = cat['*'] || '';
|
||||
if (catName === 'Utilisateurs_du_Haki_de_l\'observation') {
|
||||
if (catName === 'Observation_Haki_Users') {
|
||||
hakiObservation = true;
|
||||
} else if (catName === 'Utilisateurs_du_Haki_de_l\'armement') {
|
||||
} else if (catName === 'Armament_Haki_Users') {
|
||||
hakiArmament = true;
|
||||
} else if (catName === 'Utilisateurs_du_Haki_des_rois') {
|
||||
} else if (catName === 'Supreme_King_Haki_Users') {
|
||||
hakiConqueror = true;
|
||||
}
|
||||
}
|
||||
@@ -397,7 +403,7 @@ async function fetchCharacter(
|
||||
const height = extractHeight($);
|
||||
|
||||
// Use chapter from character list, cast to int
|
||||
let firstAppearance = parseInt(characterChapter);
|
||||
const firstAppearance = characterChapter;
|
||||
|
||||
// Extract origin
|
||||
const origin = extractOrigin($);
|
||||
@@ -405,31 +411,51 @@ async function fetchCharacter(
|
||||
// Extract status
|
||||
const status = extractStatus($);
|
||||
|
||||
// Extract image URL and clean it
|
||||
let pictureUrl = characterpictureUrl;
|
||||
if (pictureUrl && pictureUrl.includes('Image_Non_Disponible')) {
|
||||
pictureUrl = null;
|
||||
let arcId = '';
|
||||
const arc = arcsList.find(a => a.startChapter <= firstAppearance && (a.endChapter === null || a.endChapter >= firstAppearance));
|
||||
if (!arc) {
|
||||
return null;
|
||||
}
|
||||
arcId = arc.id;
|
||||
|
||||
const frLink = getFrLink(jsonData.parse?.langlinks || []);
|
||||
const frUrl = frLink ? frLink.url.replace('https://onepiece.fandom.com/fr/wiki/', '') : null;
|
||||
const frjsonData = frUrl ? await fetchWithRetry(`${FR_FANDOM_API_BASE}${frUrl}`).then(res => res.json()) : null;
|
||||
|
||||
const frName = frjsonData?.parse?.title || null;
|
||||
|
||||
const frAffiliations = frjsonData ? extractAffiliations(cheerio.load(frjsonData.parse?.text?.['*'] || '')) : null;
|
||||
|
||||
const frEpithets = frjsonData ? extractEpithets(cheerio.load(frjsonData.parse?.text?.['*'] || '')) : null;
|
||||
|
||||
const frOrigin = frjsonData ? extractOrigin(cheerio.load(frjsonData.parse?.text?.['*'] || '')) : null;
|
||||
|
||||
|
||||
return {
|
||||
id: finalCharacterId,
|
||||
name,
|
||||
frName,
|
||||
gender,
|
||||
age,
|
||||
height,
|
||||
origin,
|
||||
frOrigin,
|
||||
devilFruitId,
|
||||
devilFruitUrl,
|
||||
affiliations,
|
||||
frAffiliations,
|
||||
bounty,
|
||||
hakiObservation,
|
||||
hakiArmament,
|
||||
hakiConqueror,
|
||||
epithets,
|
||||
frEpithets,
|
||||
firstAppearance,
|
||||
arcId,
|
||||
status,
|
||||
pictureUrl,
|
||||
url: finalCharacterUrl
|
||||
pictureUrl: "Image_Non_Disponible",
|
||||
url: characterUrl,
|
||||
frUrl
|
||||
};
|
||||
} catch (error) {
|
||||
console.error(`Error fetching ${characterName}:`, (error as Error).message);
|
||||
@@ -442,7 +468,7 @@ async function fetchCharacter(
|
||||
* Extract age from infobox
|
||||
*/
|
||||
function extractAge($: cheerio.CheerioAPI): number | null {
|
||||
const div = $('[data-source="âge"] .pi-data-value');
|
||||
const div = $('[data-source="age"] .pi-data-value');
|
||||
if (div.length === 0) return null;
|
||||
|
||||
let text = div.html();
|
||||
@@ -473,11 +499,11 @@ function extractAffiliations($: cheerio.CheerioAPI): string[] {
|
||||
const cleanedDiv = div.clone();
|
||||
cleanedDiv.find('sup').remove();
|
||||
|
||||
let text = cleanedDiv.html();
|
||||
const text = cleanedDiv.html();
|
||||
if (!text) return [];
|
||||
|
||||
// Extract all link values
|
||||
const linkValues = cleanedDiv.find('a').map((i, el) => $(el).text().trim()).get();
|
||||
const linkValues = cleanedDiv.find('a').map((_i, el) => $(el).text().trim()).get();
|
||||
if (linkValues.length > 0) {
|
||||
return linkValues;
|
||||
}
|
||||
@@ -490,28 +516,46 @@ function extractAffiliations($: cheerio.CheerioAPI): string[] {
|
||||
|
||||
/**
|
||||
* Extract epithets from infobox
|
||||
* Epithets are always between double quotes
|
||||
* Handles both quoted and unquoted epithets, keeping only the main/latest readable values.
|
||||
*/
|
||||
function extractEpithets($: cheerio.CheerioAPI): string[] {
|
||||
const div = $('[data-source="épithète"] .pi-data-value');
|
||||
const div = $('[data-source="epithet"] .pi-data-value');
|
||||
if (div.length === 0) return [];
|
||||
|
||||
const cleanedDiv = div.clone();
|
||||
cleanedDiv.find('sup').remove();
|
||||
|
||||
let text = cleanedDiv.text();
|
||||
if (!text) return [];
|
||||
const html = cleanedDiv.html();
|
||||
if (!html) return [];
|
||||
|
||||
// Extract all text between double quotes (both straight and curly quotes)
|
||||
const matches = text.match(/["«"]([^"»"]+)["»"]/g);
|
||||
if (!matches) return [];
|
||||
const plainText = html
|
||||
.replace(/<br\s*\/?\s*>/gi, '\n')
|
||||
.replace(/<[^>]*>/g, '');
|
||||
|
||||
// Remove the quotes and trim
|
||||
const epithets = matches.map(match =>
|
||||
match.replace(/^["«"]|["»"]$/g, '').trim()
|
||||
).filter(Boolean);
|
||||
const lines = plainText
|
||||
.split('\n')
|
||||
.map((line) => line.trim())
|
||||
.filter(Boolean);
|
||||
|
||||
return epithets;
|
||||
const epithets = lines
|
||||
.map((line) => {
|
||||
const normalized = line.replace(/\s+/g, ' ').trim();
|
||||
|
||||
// Prefer explicit quoted epithet if present.
|
||||
const quotedMatch = normalized.match(/["«“](.*?)["»”]/);
|
||||
if (quotedMatch?.[1]) {
|
||||
return quotedMatch[1].trim();
|
||||
}
|
||||
|
||||
// Otherwise keep only the base epithet text before extra notes/translations.
|
||||
return normalized
|
||||
.split(/[;(]/)[0]
|
||||
.replace(/["'«»“”]/g, '')
|
||||
.trim();
|
||||
})
|
||||
.filter(Boolean);
|
||||
|
||||
return Array.from(new Set(epithets));
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -519,49 +563,27 @@ function extractEpithets($: cheerio.CheerioAPI): string[] {
|
||||
* Returns both normalized ID and URL
|
||||
*/
|
||||
async function extractDevilFruit($: cheerio.CheerioAPI): Promise<DevilFruitData | null> {
|
||||
const link = $('[data-source="dfnom"] .pi-data-value a').first();
|
||||
const link = $('[data-source="dfname"] .pi-data-value a').first();
|
||||
if (link.length === 0) return null;
|
||||
|
||||
const href = link.attr('href');
|
||||
if (!href || !href.startsWith('/fr/wiki/')) return null;
|
||||
if (!href || !href.startsWith('/wiki/')) return null;
|
||||
|
||||
const cleanUrl = href.replace('/fr/wiki/', '');
|
||||
|
||||
try {
|
||||
// Fetch the page via API to follow redirects
|
||||
const apiUrl = `${FANDOM_API_BASE}${decodeURIComponent(cleanUrl)}`;
|
||||
const response = await fetchWithRetry(apiUrl);
|
||||
const jsonData = await response.json() as any;
|
||||
|
||||
// Use final page name from API (if parse.links contains one element, it means the original page was a redirect, so we use the the element 0 as the final URL, otherwise we use the original URL)
|
||||
let finalPath = cleanUrl;
|
||||
if (jsonData.parse?.links?.length === 1) {
|
||||
finalPath = jsonData.parse.links[0]['*'];
|
||||
}
|
||||
const cleanUrl = href.replace('/wiki/', '');
|
||||
|
||||
|
||||
if (finalPath) {
|
||||
return {
|
||||
devilFruitId: normalizeId(finalPath),
|
||||
devilFruitUrl: finalPath
|
||||
};
|
||||
}
|
||||
} catch (error) {
|
||||
console.error(`Error fetching devil fruit page: ${(error as Error).message}`);
|
||||
}
|
||||
|
||||
// Fallback to the original href
|
||||
return {
|
||||
devilFruitId: normalizeId(cleanUrl),
|
||||
devilFruitUrl: cleanUrl
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Extract bounty from infobox
|
||||
*/
|
||||
function extractBounty($: cheerio.CheerioAPI): number | null {
|
||||
const div = $('[data-source="prime"] .pi-data-value');
|
||||
const div = $('[data-source="bounty"] .pi-data-value');
|
||||
if (div.length === 0) return 0;
|
||||
|
||||
let text = div.html();
|
||||
@@ -593,7 +615,7 @@ function extractBounty($: cheerio.CheerioAPI): number | null {
|
||||
* Extract height from infobox
|
||||
*/
|
||||
function extractHeight($: cheerio.CheerioAPI): number | null {
|
||||
const div = $('[data-source="taille"] .pi-data-value');
|
||||
const div = $('[data-source="height"] .pi-data-value');
|
||||
if (div.length === 0) return null;
|
||||
|
||||
let text = div.html();
|
||||
@@ -602,43 +624,44 @@ function extractHeight($: cheerio.CheerioAPI): number | null {
|
||||
// Remove all sup blocks (citations)
|
||||
text = text.replace(/<sup[^>]*>.*?<\/sup>/gi, '');
|
||||
|
||||
// Check if there's a <p> tag - if yes, use content from <p>
|
||||
let content;
|
||||
const pMatch = text.match(/<p[^>]*>(.*?)<\/p>/i);
|
||||
if (pMatch) {
|
||||
// Extract content from the <p> tag
|
||||
content = pMatch[1];
|
||||
} else {
|
||||
// Use the last value method (after any <br> tag)
|
||||
content = text.split('<br>').pop();
|
||||
}
|
||||
|
||||
let cleanText = (content || '').replace(/<[^>]*>/g, '').trim();
|
||||
|
||||
// Remove content with parentheses
|
||||
cleanText = cleanText.replace(/\([^)]*\)/g, '');
|
||||
|
||||
// Normalize units for meters or centimeters
|
||||
// Convert line breaks to new lines so we can reliably pick the latest value.
|
||||
const textWithNewLines = text.replace(/<br\s*\/?\s*>/gi, '\n');
|
||||
const lines = textWithNewLines
|
||||
.replace(/<[^>]*>/g, '')
|
||||
.split('\n')
|
||||
.map((line) => line.trim())
|
||||
.filter(Boolean);
|
||||
|
||||
// Keep only lines that look like a height value, then pick the latest one.
|
||||
const heightLines = lines.filter((line) => /\d/.test(line) && /(cm|m)/i.test(line));
|
||||
const latestLine = heightLines.length > 0 ? heightLines[heightLines.length - 1] : lines[lines.length - 1];
|
||||
if (!latestLine) return null;
|
||||
|
||||
// Remove descriptive suffixes like "(post-timeskip)".
|
||||
const cleanText = latestLine.replace(/\([^)]*\)/g, '').trim();
|
||||
const normalized = cleanText.toLowerCase().replace(/\s/g, '');
|
||||
if (normalized.includes('cm')) {
|
||||
const digitsOnly = normalized.replace(/\D/g, '');
|
||||
const cm = parseFloat(digitsOnly);
|
||||
return cm ? cm / 100 : null;
|
||||
|
||||
// Values are stored in meters in this dataset.
|
||||
const cmMatch = normalized.match(/(\d+(?:[.,]\d+)?)cm/);
|
||||
if (cmMatch) {
|
||||
const cm = parseFloat(cmMatch[1].replace(',', '.'));
|
||||
return Number.isFinite(cm) ? cm / 100 : null;
|
||||
}
|
||||
|
||||
if (normalized.includes('m')) {
|
||||
const parts = normalized.split('m').filter(Boolean);
|
||||
return parts.length > 0 ? parseFloat(parts.join('.')) : null;
|
||||
const mMatch = normalized.match(/(\d+(?:[.,]\d+)?)m/);
|
||||
if (mMatch) {
|
||||
const meters = parseFloat(mMatch[1].replace(',', '.'));
|
||||
return Number.isFinite(meters) ? meters : null;
|
||||
}
|
||||
|
||||
return normalized.length > 0 ? parseFloat(normalized.replace(/\D/g, '')) : null;
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract origin from infobox
|
||||
*/
|
||||
function extractOrigin($: cheerio.CheerioAPI): string | null {
|
||||
const div = $('[data-source="origine"] .pi-data-value');
|
||||
const div = $('[data-source="origin"] .pi-data-value, [data-source="origine"] .pi-data-value').first();
|
||||
if (div.length === 0) return null;
|
||||
|
||||
let text = div.html();
|
||||
@@ -661,16 +684,16 @@ function extractOrigin($: cheerio.CheerioAPI): string | null {
|
||||
* Extract status from infobox
|
||||
*/
|
||||
function extractStatus($: cheerio.CheerioAPI): string | null {
|
||||
const div = $('[data-source="statut"] .pi-data-value');
|
||||
const div = $('[data-source="status"] .pi-data-value');
|
||||
if (div.length === 0) return null;
|
||||
|
||||
const statusText = div.text().trim().toLowerCase();
|
||||
|
||||
if (statusText.includes('vivant')) {
|
||||
if (statusText.includes('Alive')) {
|
||||
return 'Alive';
|
||||
} else if (statusText.includes('décédé')) {
|
||||
} else if (statusText.includes('Dead')) {
|
||||
return 'Dead';
|
||||
} else if (statusText.includes('inconnu')) {
|
||||
} else if (statusText.includes('Unknown')) {
|
||||
return 'Unknown';
|
||||
}
|
||||
|
||||
@@ -753,7 +776,7 @@ async function fetchDevilFruit(devilFruitUrl: string, devilFruitId: string): Pro
|
||||
// Use API to fetch devil fruit page
|
||||
const apiUrl = `${FANDOM_API_BASE}${devilFruitUrl}`;
|
||||
const response = await fetchWithRetry(apiUrl);
|
||||
const jsonData = await response.json() as any;
|
||||
const jsonData = await response.json();
|
||||
|
||||
// Extract HTML from API response
|
||||
const htmlContent = jsonData.parse?.text?.['*'];
|
||||
@@ -767,7 +790,7 @@ async function fetchDevilFruit(devilFruitUrl: string, devilFruitId: string): Pro
|
||||
// Determine type based on categories (if categories contain "Paramecia", "Zoan", "Logia" or "Smile")
|
||||
if (jsonData.parse?.categories) {
|
||||
const categories = jsonData.parse.categories
|
||||
.map((cat: any) => String(cat['*'] || '').toLowerCase());
|
||||
.map((cat: { ['*']: string }) => String(cat['*'] || '').toLowerCase());
|
||||
|
||||
if (categories.some((category: string) => category.includes('paramecia'))) {
|
||||
type = 'Paramecia';
|
||||
@@ -847,6 +870,7 @@ async function main(): Promise<void> {
|
||||
console.table({
|
||||
ID: arc.id,
|
||||
Name: arc.name,
|
||||
FrenchName: arc.frName || '',
|
||||
StartChapter: arc.startChapter,
|
||||
EndChapter: arc.endChapter || 'Ongoing',
|
||||
URL: arc.url
|
||||
@@ -886,7 +910,7 @@ async function main(): Promise<void> {
|
||||
const batch = failedCharacters.slice(i, i + FETCH_CONCURRENCY);
|
||||
const batchResults = await Promise.all(
|
||||
batch.map(async (char) => {
|
||||
const data = await fetchCharacter(char.url, char.name, char.pictureUrl, char.chapter);
|
||||
const data = await fetchCharacter(char.url, char.name, char.chapter, arcsList);
|
||||
return { char, data };
|
||||
})
|
||||
);
|
||||
@@ -918,12 +942,6 @@ async function main(): Promise<void> {
|
||||
devilFruitUrls.add(data.devilFruitUrl);
|
||||
}
|
||||
|
||||
if (data.firstAppearance) {
|
||||
const arc = arcsList.find(a => a.startChapter <= data.firstAppearance && (a.endChapter === null || a.endChapter >= data.firstAppearance));
|
||||
if (arc) {
|
||||
data.arcId = arc.id;
|
||||
}
|
||||
}
|
||||
|
||||
characters.push(data);
|
||||
} else {
|
||||
|
||||
Reference in New Issue
Block a user