|
|
|
|
@@ -1,32 +1,71 @@
|
|
|
|
|
import * as cheerio from 'cheerio';
|
|
|
|
|
import fs from 'fs';
|
|
|
|
|
import https from 'https';
|
|
|
|
|
import { createObjectCsvWriter } from 'csv-writer';
|
|
|
|
|
|
|
|
|
|
// Type definitions
|
|
|
|
|
/**
 * A story arc, as returned by `fetchAllArcs` from the chapters-and-volumes wiki page.
 */
interface Arc {
  /** Identifier of the arc; copied onto `Character.arcId` when a character is matched to it. */
  id: string;
  /** Display name of the arc. */
  name: string;
  /** First chapter covered by the arc. */
  startChapter: number;
  /** Last chapter covered by the arc; `null` is treated as "no upper bound" when matching characters. */
  endChapter: number | null;
  /** Link to the arc's wiki page. NOTE(review): absolute URL or wiki path — confirm against fetchAllArcs. */
  url: string;
}
|
|
|
|
|
|
|
|
|
|
/**
 * A fully scraped character record, as produced by `fetchCharacter`.
 *
 * Fields typed `| null` are `null` when the corresponding extractor finds nothing on the page.
 */
interface Character {
  /** Normalized identifier derived from the character's wiki path. */
  id: string;
  /** Character name. */
  name: string;
  /** Derived from the page's category links (e.g. `'Male'`); `null` when no gender category matches. */
  gender: string | null;
  /** Age read from the infobox. */
  age: number | null;
  /**
   * Height read from the infobox.
   * NOTE(review): extractHeight appears to return the raw digits for "cm" values but a
   * decimal for "m" values, so the unit may not be uniform — confirm.
   */
  height: number | null;
  /** Origin read from the infobox. */
  origin: string | null;
  /** Normalized identifier of the character's devil fruit, if any. */
  devilFruitId: string | null;
  /** Wiki link of the character's devil fruit, if any. */
  devilFruitUrl: string | null;
  /** Affiliations read from the infobox; empty when none are listed. */
  affiliations: string[];
  /** Bounty as a plain number (all non-digits are stripped from the infobox text). */
  bounty: number | null;
  /** Whether the character is recorded as using Observation Haki. */
  hakiObservation: boolean;
  /** Whether the character is recorded as using Armament Haki. */
  hakiArmament: boolean;
  /** Whether the character is recorded as using Conqueror's Haki. */
  hakiConqueror: boolean;
  /** Epithets read from the infobox; empty when none are listed. */
  epithets: string[];
  /** Chapter number of the first appearance; compared against arc chapter ranges to derive `arcId`. */
  firstAppearance: number;
  /** Status read from the infobox. */
  status: string | null;
  /** Picture URL taken from the character list, if one was found. */
  pictureUrl: string | null;
  /** Wiki path of the character page (after following redirects). */
  url: string;
  /** Identifier of the arc whose chapter range contains `firstAppearance`; absent when no arc matches. */
  arcId?: string;
}
|
|
|
|
|
|
|
|
|
|
/**
 * One row of the canon character list, as returned by `fetchAllCharactersUrl`.
 */
interface CharacterListItem {
  /** Character name as shown in the list. */
  name: string;
  /** Link to the character's wiki page. */
  url: string;
  /** Image source from the row's first cell (`data-src`, falling back to `src`); `null` when absent. */
  pictureUrl: string | null;
  /** Chapter text from the list row, kept as a string. */
  chapter: string;
}
|
|
|
|
|
|
|
|
|
|
/**
 * Reference to a devil fruit found in a character's infobox, as returned by `extractDevilFruit`.
 */
interface DevilFruitData {
  /** Normalized identifier of the devil fruit. */
  devilFruitId: string;
  /** Link to the devil fruit's wiki page. */
  devilFruitUrl: string;
}
|
|
|
|
|
|
|
|
|
|
/**
 * A scraped devil fruit record, as produced by `fetchDevilFruit`.
 */
interface DevilFruit {
  /** Identifier of the devil fruit. */
  id: string;
  /** Name taken from the page title. */
  name: string;
  /** Fruit type derived from the infobox `type` entry; `null` when it cannot be determined. */
  type: string | null;
  /** Link that was used to fetch the devil fruit page. */
  url: string;
}
|
|
|
|
|
|
|
|
|
|
// Base URL of the French One Piece wiki; page names are appended to it.
const FANDOM_BASE_URL = 'https://onepiece.fandom.com/fr/wiki';
// Directory that receives the generated JSON and CSV files.
const OUTPUT_DIR = './scraped-data';
// Maximum number of retries per request.
const MAX_RETRIES = 0; // Set to 0 to disable retries, can be increased if needed
// Delay in milliseconds before the first retry; doubled on each further attempt.
const INITIAL_RETRY_DELAY = 1000;
|
|
|
|
|
|
|
|
|
|
// Shared keep-alive HTTPS agent, so consecutive requests reuse pooled connections
// the way a browser session would.
// NOTE(review): fetchWithRetry passes this to `fetch` as `agent`; if that is Node's
// built-in fetch, the option is presumably ignored (undici expects `dispatcher`) — confirm.
const httpsAgent = new https.Agent({
  keepAlive: true,
  keepAliveMsecs: 1000,
  maxFreeSockets: 10,
  maxSockets: 50,
  // Replaces the former `maxConnections` key, which is not an Agent option: it was
  // ignored at runtime and rejected by the type checker. `maxTotalSockets` is the
  // supported way to cap connections across all hosts.
  maxTotalSockets: 50,
  timeout: 30000,
});
|
|
|
|
|
|
|
|
|
|
// Store cookies across requests (simulate browser behavior).
// NOTE(review): the keys and the exact stored values are decided by saveCookies;
// getCookieHeader only relies on each value starting with its `name=value` pair.
const cookies = new Map<string, string>();

/**
 * Build the value of a `Cookie` request header from the stored cookies.
 *
 * Each stored value is cut at its first `;`, so attributes that follow the leading
 * segment (anything after the first `;`) are not sent back.
 *
 * @returns The leading segments joined with `; `, or an empty string when nothing is stored.
 */
function getCookieHeader(): string {
  // Joining an empty list already yields '', so no separate empty-map branch is needed.
  return Array.from(cookies.values(), (cookie) => cookie.split(';')[0]).join('; ');
}
|
|
|
|
|
|
|
|
|
|
function saveCookies(setCookieHeader) {
|
|
|
|
|
function saveCookies(setCookieHeader: string | string[] | null): void {
|
|
|
|
|
if (setCookieHeader) {
|
|
|
|
|
const cookiesList = Array.isArray(setCookieHeader) ? setCookieHeader : [setCookieHeader];
|
|
|
|
|
cookiesList.forEach(cookie => {
|
|
|
|
|
@@ -45,14 +84,14 @@ if (!fs.existsSync(OUTPUT_DIR)) {
|
|
|
|
|
/**
|
|
|
|
|
* Retry a fetch request with exponential backoff
|
|
|
|
|
*/
|
|
|
|
|
async function fetchWithRetry(url, options = {}, retries = 0) {
|
|
|
|
|
async function fetchWithRetry(url: string, options: RequestInit = {}, retries: number = 0): Promise<Response> {
|
|
|
|
|
try {
|
|
|
|
|
const headers = {
|
|
|
|
|
const headers: Record<string, string> = {
|
|
|
|
|
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:150.0) Firefox/150.0',
|
|
|
|
|
'Accept-Language': 'en-US,en;q=0.9',
|
|
|
|
|
'Accept-Encoding': 'gzip, deflate, br',
|
|
|
|
|
'Connection': 'keep-alive',
|
|
|
|
|
...options.headers
|
|
|
|
|
...((options.headers as Record<string, string>) || {})
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
// Add cookies from previous requests
|
|
|
|
|
@@ -63,9 +102,8 @@ async function fetchWithRetry(url, options = {}, retries = 0) {
|
|
|
|
|
|
|
|
|
|
const response = await fetch(url, {
|
|
|
|
|
headers,
|
|
|
|
|
agent: httpsAgent,
|
|
|
|
|
...options
|
|
|
|
|
});
|
|
|
|
|
} as any);
|
|
|
|
|
|
|
|
|
|
// Save cookies from response
|
|
|
|
|
const setCookie = response.headers.get('set-cookie');
|
|
|
|
|
@@ -92,7 +130,7 @@ async function fetchWithRetry(url, options = {}, retries = 0) {
|
|
|
|
|
// If it's a network error and we have retries left, retry
|
|
|
|
|
if (retries < MAX_RETRIES) {
|
|
|
|
|
const delay = INITIAL_RETRY_DELAY * Math.pow(2, retries);
|
|
|
|
|
console.log(`⚠️ Network error: ${error.message}, retrying in ${delay}ms...`);
|
|
|
|
|
console.log(`⚠️ Network error: ${(error as Error).message}, retrying in ${delay}ms...`);
|
|
|
|
|
await new Promise(resolve => setTimeout(resolve, delay));
|
|
|
|
|
return fetchWithRetry(url, options, retries + 1);
|
|
|
|
|
}
|
|
|
|
|
@@ -106,7 +144,7 @@ async function fetchWithRetry(url, options = {}, retries = 0) {
|
|
|
|
|
/**
|
|
|
|
|
* Normalize a string by decoding URI components, stripping punctuation, and replacing spaces with underscores
|
|
|
|
|
*/
|
|
|
|
|
function normalizeId(str) {
|
|
|
|
|
function normalizeId(str: string): string {
|
|
|
|
|
return decodeURIComponent(str)
|
|
|
|
|
.normalize('NFD')
|
|
|
|
|
.replace(/[,:.\(\)]/g, '')
|
|
|
|
|
@@ -117,14 +155,14 @@ function normalizeId(str) {
|
|
|
|
|
/**
|
|
|
|
|
* Fetch all arcs from One Piece fandom
|
|
|
|
|
*/
|
|
|
|
|
async function fetchAllArcs() {
|
|
|
|
|
async function fetchAllArcs(): Promise<Arc[]> {
|
|
|
|
|
try {
|
|
|
|
|
const url = `${FANDOM_BASE_URL}/Chapitres_et_Tomes`;
|
|
|
|
|
console.log('Fetching arcs list...');
|
|
|
|
|
const response = await fetchWithRetry(url);
|
|
|
|
|
const data = await response.text();
|
|
|
|
|
const $ = cheerio.load(data);
|
|
|
|
|
const arcs = [];
|
|
|
|
|
const arcs: Arc[] = [];
|
|
|
|
|
|
|
|
|
|
// Find all arc links in the table
|
|
|
|
|
$('table.wikitable td a').each((index, element) => {
|
|
|
|
|
@@ -132,7 +170,7 @@ async function fetchAllArcs() {
|
|
|
|
|
const href = $(element).attr('href');
|
|
|
|
|
|
|
|
|
|
// Check if it's an arc link (contains "Arc" and chapter info)
|
|
|
|
|
if (text.includes('Arc') && text.includes('Ch.')) {
|
|
|
|
|
if (text.includes('Arc') && text.includes('Ch.') && href) {
|
|
|
|
|
// Extract arc name and chapter range
|
|
|
|
|
// Example text: "Arc Ville d'Orange(Ch.8 à 21)[T.1 à 3]"
|
|
|
|
|
console.log(`Processing arc link: ${text} (${href})`);
|
|
|
|
|
@@ -164,7 +202,7 @@ async function fetchAllArcs() {
|
|
|
|
|
console.log(`Found ${arcs.length} arcs.`);
|
|
|
|
|
return arcs;
|
|
|
|
|
} catch (error) {
|
|
|
|
|
console.error('Error fetching arcs list:', error.message);
|
|
|
|
|
console.error('Error fetching arcs list:', (error as Error).message);
|
|
|
|
|
return [];
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
@@ -172,7 +210,7 @@ async function fetchAllArcs() {
|
|
|
|
|
/**
|
|
|
|
|
* Save arcs to JSON
|
|
|
|
|
*/
|
|
|
|
|
async function saveArcsToJSON(arcs) {
|
|
|
|
|
async function saveArcsToJSON(arcs: Arc[]): Promise<void> {
|
|
|
|
|
const filepath = `${OUTPUT_DIR}/arcs.json`;
|
|
|
|
|
fs.writeFileSync(filepath, JSON.stringify(arcs, null, 2));
|
|
|
|
|
console.log(`✓ Saved to ${filepath}`);
|
|
|
|
|
@@ -181,7 +219,7 @@ async function saveArcsToJSON(arcs) {
|
|
|
|
|
/**
|
|
|
|
|
* Save arcs to CSV
|
|
|
|
|
*/
|
|
|
|
|
async function saveArcsToCSV(arcs) {
|
|
|
|
|
async function saveArcsToCSV(arcs: Arc[]): Promise<void> {
|
|
|
|
|
const filepath = `${OUTPUT_DIR}/arcs.csv`;
|
|
|
|
|
const csvWriter = createObjectCsvWriter({
|
|
|
|
|
path: filepath,
|
|
|
|
|
@@ -211,14 +249,14 @@ async function saveArcsToCSV(arcs) {
|
|
|
|
|
/**
|
|
|
|
|
* Fetch all canon characters from One Piece fandom
|
|
|
|
|
*/
|
|
|
|
|
async function fetchAllCharactersUrl() {
|
|
|
|
|
async function fetchAllCharactersUrl(): Promise<CharacterListItem[]> {
|
|
|
|
|
try {
|
|
|
|
|
const url = `${FANDOM_BASE_URL}/Liste_des_Personnages_Canon`;
|
|
|
|
|
console.log('Fetching character list...');
|
|
|
|
|
const response = await fetchWithRetry(url);
|
|
|
|
|
const data = await response.text();
|
|
|
|
|
const $ = cheerio.load(data);
|
|
|
|
|
const characters = [];
|
|
|
|
|
const characters: CharacterListItem[] = [];
|
|
|
|
|
$('table.wikitable tbody tr').each((index, element) => {
|
|
|
|
|
if (index === 0) return; // Skip header row
|
|
|
|
|
let charpictureUrl = $(element).find('td:nth-child(1) a img').attr('data-src') || $(element).find('td:nth-child(1) a img').attr('src');
|
|
|
|
|
@@ -240,7 +278,7 @@ async function fetchAllCharactersUrl() {
|
|
|
|
|
characters.push({
|
|
|
|
|
name: charName,
|
|
|
|
|
url: charUrl,
|
|
|
|
|
pictureUrl: charpictureUrl,
|
|
|
|
|
pictureUrl: charpictureUrl || null,
|
|
|
|
|
chapter: charChapter,
|
|
|
|
|
});
|
|
|
|
|
}
|
|
|
|
|
@@ -248,7 +286,7 @@ async function fetchAllCharactersUrl() {
|
|
|
|
|
console.log(`Found ${characters.length} characters.`);
|
|
|
|
|
return characters;
|
|
|
|
|
} catch (error) {
|
|
|
|
|
console.error('Error fetching character list:', error.message);
|
|
|
|
|
console.error('Error fetching character list:', (error as Error).message);
|
|
|
|
|
return [];
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
@@ -256,7 +294,12 @@ async function fetchAllCharactersUrl() {
|
|
|
|
|
/**
|
|
|
|
|
* Fetch character data from fandom using provided URL
|
|
|
|
|
*/
|
|
|
|
|
async function fetchCharacter(characterUrl, characterName, characterpictureUrl, characterChapter) {
|
|
|
|
|
async function fetchCharacter(
|
|
|
|
|
characterUrl: string,
|
|
|
|
|
characterName: string,
|
|
|
|
|
characterpictureUrl: string | null,
|
|
|
|
|
characterChapter: string
|
|
|
|
|
): Promise<Character | null> {
|
|
|
|
|
try {
|
|
|
|
|
console.log(`Fetching: ${characterName}...`);
|
|
|
|
|
|
|
|
|
|
@@ -269,10 +312,10 @@ async function fetchCharacter(characterUrl, characterName, characterpictureUrl,
|
|
|
|
|
let finalCharacterId = normalizeId(characterUrl);
|
|
|
|
|
try {
|
|
|
|
|
const finalUrl = new URL(response.url);
|
|
|
|
|
const characterUrl = finalUrl.pathname.replace('/fr/wiki/', '');
|
|
|
|
|
if (characterUrl) {
|
|
|
|
|
finalCharacterUrl = characterUrl;
|
|
|
|
|
finalCharacterId = normalizeId(characterUrl);
|
|
|
|
|
const characterUrlPath = finalUrl.pathname.replace('/fr/wiki/', '');
|
|
|
|
|
if (characterUrlPath) {
|
|
|
|
|
finalCharacterUrl = characterUrlPath;
|
|
|
|
|
finalCharacterId = normalizeId(characterUrlPath);
|
|
|
|
|
}
|
|
|
|
|
} catch {
|
|
|
|
|
// If HTTP is not ok or redirected URL, throw an error to be caught in the outer block
|
|
|
|
|
@@ -292,7 +335,7 @@ async function fetchCharacter(characterUrl, characterName, characterpictureUrl,
|
|
|
|
|
finalCharacterId = normalizeId(finalCharacterUrl + '_' + name);
|
|
|
|
|
|
|
|
|
|
// Extract gender from the specific categories link
|
|
|
|
|
let gender = null;
|
|
|
|
|
let gender: string | null = null;
|
|
|
|
|
if ($('.page-header__categories a[title="Catégorie:Personnages Masculins"]').length > 0) {
|
|
|
|
|
gender = 'Male';
|
|
|
|
|
} else if ($('.page-header__categories a[title="Catégorie:Personnages Féminins"]').length > 0) {
|
|
|
|
|
@@ -360,7 +403,7 @@ async function fetchCharacter(characterUrl, characterName, characterpictureUrl,
|
|
|
|
|
url: finalCharacterUrl
|
|
|
|
|
};
|
|
|
|
|
} catch (error) {
|
|
|
|
|
console.error(`Error fetching ${characterName}:`, error.message);
|
|
|
|
|
console.error(`Error fetching ${characterName}:`, (error as Error).message);
|
|
|
|
|
return null;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
@@ -369,7 +412,7 @@ async function fetchCharacter(characterUrl, characterName, characterpictureUrl,
|
|
|
|
|
/**
|
|
|
|
|
* Extract age from infobox
|
|
|
|
|
*/
|
|
|
|
|
function extractAge($) {
|
|
|
|
|
function extractAge($: cheerio.CheerioAPI): number | null {
|
|
|
|
|
const div = $('[data-source="âge"] .pi-data-value');
|
|
|
|
|
if (div.length === 0) return null;
|
|
|
|
|
|
|
|
|
|
@@ -394,7 +437,7 @@ function extractAge($) {
|
|
|
|
|
/**
|
|
|
|
|
* Extract affiliations from infobox
|
|
|
|
|
*/
|
|
|
|
|
function extractAffiliations($) {
|
|
|
|
|
function extractAffiliations($: cheerio.CheerioAPI): string[] {
|
|
|
|
|
const div = $('[data-source="affiliation"] .pi-data-value');
|
|
|
|
|
if (div.length === 0) return [];
|
|
|
|
|
|
|
|
|
|
@@ -420,7 +463,7 @@ function extractAffiliations($) {
|
|
|
|
|
* Extract epithets from infobox
|
|
|
|
|
* Epithets are always between double quotes
|
|
|
|
|
*/
|
|
|
|
|
function extractEpithets($) {
|
|
|
|
|
function extractEpithets($: cheerio.CheerioAPI): string[] {
|
|
|
|
|
const div = $('[data-source="épithète"] .pi-data-value');
|
|
|
|
|
if (div.length === 0) return [];
|
|
|
|
|
|
|
|
|
|
@@ -446,7 +489,7 @@ function extractEpithets($) {
|
|
|
|
|
* Extract devil fruit from infobox
|
|
|
|
|
* Returns both normalized ID and URL
|
|
|
|
|
*/
|
|
|
|
|
async function extractDevilFruit($) {
|
|
|
|
|
async function extractDevilFruit($: cheerio.CheerioAPI): Promise<DevilFruitData | null> {
|
|
|
|
|
const link = $('[data-source="dfnom"] .pi-data-value a').first();
|
|
|
|
|
if (link.length === 0) return null;
|
|
|
|
|
|
|
|
|
|
@@ -473,7 +516,7 @@ async function extractDevilFruit($) {
|
|
|
|
|
};
|
|
|
|
|
}
|
|
|
|
|
} catch (error) {
|
|
|
|
|
console.error(`Error fetching devil fruit page: ${error.message}`);
|
|
|
|
|
console.error(`Error fetching devil fruit page: ${(error as Error).message}`);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Fallback to the original href
|
|
|
|
|
@@ -486,7 +529,7 @@ async function extractDevilFruit($) {
|
|
|
|
|
/**
|
|
|
|
|
* Extract bounty from infobox
|
|
|
|
|
*/
|
|
|
|
|
function extractBounty($) {
|
|
|
|
|
function extractBounty($: cheerio.CheerioAPI): number | null {
|
|
|
|
|
const div = $('[data-source="prime"] .pi-data-value');
|
|
|
|
|
if (div.length === 0) return 0;
|
|
|
|
|
|
|
|
|
|
@@ -511,14 +554,14 @@ function extractBounty($) {
|
|
|
|
|
|
|
|
|
|
// Remove all non-digits
|
|
|
|
|
cleanText = cleanText.replace(/\D/g, '');
|
|
|
|
|
|
|
|
|
|
return cleanText || 0;
|
|
|
|
|
|
|
|
|
|
return cleanText ? parseInt(cleanText) : 0;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Extract height from infobox
|
|
|
|
|
*/
|
|
|
|
|
function extractHeight($) {
|
|
|
|
|
function extractHeight($: cheerio.CheerioAPI): number | null {
|
|
|
|
|
const div = $('[data-source="taille"] .pi-data-value');
|
|
|
|
|
if (div.length === 0) return null;
|
|
|
|
|
|
|
|
|
|
@@ -539,7 +582,7 @@ function extractHeight($) {
|
|
|
|
|
content = text.split('<br>').pop();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
let cleanText = content.replace(/<[^>]*>/g, '').trim();
|
|
|
|
|
let cleanText = (content || '').replace(/<[^>]*>/g, '').trim();
|
|
|
|
|
|
|
|
|
|
// Remove content with parentheses
|
|
|
|
|
cleanText = cleanText.replace(/\([^)]*\)/g, '');
|
|
|
|
|
@@ -548,21 +591,21 @@ function extractHeight($) {
|
|
|
|
|
const normalized = cleanText.toLowerCase().replace(/\s/g, '');
|
|
|
|
|
if (normalized.includes('cm')) {
|
|
|
|
|
const digitsOnly = normalized.replace(/\D/g, '');
|
|
|
|
|
return digitsOnly || null;
|
|
|
|
|
return parseFloat(digitsOnly) || null;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (normalized.includes('m')) {
|
|
|
|
|
const parts = normalized.split('m').filter(Boolean);
|
|
|
|
|
return parts.length > 0 ? parts.join('.') : null;
|
|
|
|
|
return parts.length > 0 ? parseFloat(parts.join('.')) : null;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return normalized.replace(/\D/g, '') || null;
|
|
|
|
|
return normalized.length > 0 ? parseFloat(normalized.replace(/\D/g, '')) : null;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Extract origin from infobox
|
|
|
|
|
*/
|
|
|
|
|
function extractOrigin($) {
|
|
|
|
|
function extractOrigin($: cheerio.CheerioAPI): string | null {
|
|
|
|
|
const div = $('[data-source="origine"] .pi-data-value');
|
|
|
|
|
if (div.length === 0) return null;
|
|
|
|
|
|
|
|
|
|
@@ -585,7 +628,7 @@ function extractOrigin($) {
|
|
|
|
|
/**
|
|
|
|
|
* Extract status from infobox
|
|
|
|
|
*/
|
|
|
|
|
function extractStatus($) {
|
|
|
|
|
function extractStatus($: cheerio.CheerioAPI): string | null {
|
|
|
|
|
const div = $('[data-source="statut"] .pi-data-value');
|
|
|
|
|
if (div.length === 0) return null;
|
|
|
|
|
|
|
|
|
|
@@ -604,7 +647,7 @@ function extractStatus($) {
|
|
|
|
|
/**
|
|
|
|
|
* Save data to JSON
|
|
|
|
|
*/
|
|
|
|
|
async function saveToJSON(characters) {
|
|
|
|
|
async function saveToJSON(characters: Character[]): Promise<void> {
|
|
|
|
|
const filepath = `${OUTPUT_DIR}/characters.json`;
|
|
|
|
|
fs.writeFileSync(filepath, JSON.stringify(characters, null, 2));
|
|
|
|
|
console.log(`✓ Saved to ${filepath}`);
|
|
|
|
|
@@ -613,7 +656,7 @@ async function saveToJSON(characters) {
|
|
|
|
|
/**
|
|
|
|
|
* Save data to CSV
|
|
|
|
|
*/
|
|
|
|
|
async function saveToCSV(characters) {
|
|
|
|
|
async function saveToCSV(characters: Character[]): Promise<void> {
|
|
|
|
|
const filepath = `${OUTPUT_DIR}/characters.csv`;
|
|
|
|
|
const csvWriter = createObjectCsvWriter({
|
|
|
|
|
path: filepath,
|
|
|
|
|
@@ -669,7 +712,7 @@ async function saveToCSV(characters) {
|
|
|
|
|
/**
|
|
|
|
|
* Fetch devil fruit data from fandom using provided URL
|
|
|
|
|
*/
|
|
|
|
|
async function fetchDevilFruit(devilFruitUrl, devilFruitId) {
|
|
|
|
|
async function fetchDevilFruit(devilFruitUrl: string, devilFruitId: string): Promise<DevilFruit | null> {
|
|
|
|
|
try {
|
|
|
|
|
console.log(`Fetching devil fruit: ${devilFruitId}...`);
|
|
|
|
|
|
|
|
|
|
@@ -680,7 +723,7 @@ async function fetchDevilFruit(devilFruitUrl, devilFruitId) {
|
|
|
|
|
const name = $('span.mw-page-title-main').text().trim();
|
|
|
|
|
|
|
|
|
|
// Extract type from label in infobox
|
|
|
|
|
let type = null;
|
|
|
|
|
let type: string | null = null;
|
|
|
|
|
const typeDiv = $('[data-source="type"] .pi-data-value');
|
|
|
|
|
if (typeDiv.length > 0) {
|
|
|
|
|
const typeText = typeDiv.text().trim().toLowerCase();
|
|
|
|
|
@@ -700,7 +743,7 @@ async function fetchDevilFruit(devilFruitUrl, devilFruitId) {
|
|
|
|
|
url: devilFruitUrl
|
|
|
|
|
};
|
|
|
|
|
} catch (error) {
|
|
|
|
|
console.error(`Error fetching devil fruit ${devilFruitUrl}:`, error.message);
|
|
|
|
|
console.error(`Error fetching devil fruit ${devilFruitUrl}:`, (error as Error).message);
|
|
|
|
|
return null;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
@@ -708,7 +751,7 @@ async function fetchDevilFruit(devilFruitUrl, devilFruitId) {
|
|
|
|
|
/**
|
|
|
|
|
* Save devil fruits to JSON
|
|
|
|
|
*/
|
|
|
|
|
async function saveDevilFruitsToJSON(devilFruits) {
|
|
|
|
|
async function saveDevilFruitsToJSON(devilFruits: DevilFruit[]): Promise<void> {
|
|
|
|
|
const filepath = `${OUTPUT_DIR}/devil-fruits.json`;
|
|
|
|
|
fs.writeFileSync(filepath, JSON.stringify(devilFruits, null, 2));
|
|
|
|
|
console.log(`✓ Saved to ${filepath}`);
|
|
|
|
|
@@ -717,7 +760,7 @@ async function saveDevilFruitsToJSON(devilFruits) {
|
|
|
|
|
/**
|
|
|
|
|
* Save devil fruits to CSV
|
|
|
|
|
*/
|
|
|
|
|
async function saveDevilFruitsToCSV(devilFruits) {
|
|
|
|
|
async function saveDevilFruitsToCSV(devilFruits: DevilFruit[]): Promise<void> {
|
|
|
|
|
const filepath = `${OUTPUT_DIR}/devil-fruits.csv`;
|
|
|
|
|
const csvWriter = createObjectCsvWriter({
|
|
|
|
|
path: filepath,
|
|
|
|
|
@@ -745,7 +788,7 @@ async function saveDevilFruitsToCSV(devilFruits) {
|
|
|
|
|
/**
|
|
|
|
|
* Main execution
|
|
|
|
|
*/
|
|
|
|
|
async function main() {
|
|
|
|
|
async function main(): Promise<void> {
|
|
|
|
|
const format = process.argv[2] || 'all'; // json, csv, or all
|
|
|
|
|
|
|
|
|
|
console.log(`\nOne Piece Scraper - Mode: ${format}\n`);
|
|
|
|
|
@@ -787,12 +830,12 @@ async function main() {
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
const characters = [];
|
|
|
|
|
const devilFruitUrls = new Set();
|
|
|
|
|
let failedCharacters = [...characterList];
|
|
|
|
|
const characters: Character[] = [];
|
|
|
|
|
const devilFruitUrls = new Set<string>();
|
|
|
|
|
let failedCharacters: CharacterListItem[] = [...characterList];
|
|
|
|
|
|
|
|
|
|
while (failedCharacters.length > 0) {
|
|
|
|
|
const nextFailedCharacters = [];
|
|
|
|
|
const nextFailedCharacters: CharacterListItem[] = [];
|
|
|
|
|
console.log(`\nFetching ${failedCharacters.length} characters...`);
|
|
|
|
|
|
|
|
|
|
for (let i = 0; i < failedCharacters.length; i++) {
|
|
|
|
|
@@ -828,7 +871,7 @@ async function main() {
|
|
|
|
|
|
|
|
|
|
// Add arc IDs to character data
|
|
|
|
|
if (data.firstAppearance) {
|
|
|
|
|
const arc = arcsList.find(a => a.startChapter <= parseInt(data.firstAppearance) && (a.endChapter === null || a.endChapter >= parseInt(data.firstAppearance)));
|
|
|
|
|
const arc = arcsList.find(a => a.startChapter <= data.firstAppearance && (a.endChapter === null || a.endChapter >= data.firstAppearance));
|
|
|
|
|
if (arc) {
|
|
|
|
|
data.arcId = arc.id;
|
|
|
|
|
}
|
|
|
|
|
@@ -857,7 +900,7 @@ async function main() {
|
|
|
|
|
if (devilFruitUrls.size === 0) {
|
|
|
|
|
console.warn('No devil fruits found from characters, skipping...\n');
|
|
|
|
|
} else {
|
|
|
|
|
const devilFruits = [];
|
|
|
|
|
const devilFruits: DevilFruit[] = [];
|
|
|
|
|
const devilFruitUrlArray = Array.from(devilFruitUrls);
|
|
|
|
|
|
|
|
|
|
for (let i = 0; i < devilFruitUrlArray.length; i++) {
|
|
|
|
|
@@ -886,7 +929,7 @@ async function main() {
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Update characters with normalized devil fruit IDs
|
|
|
|
|
const devilFruitMap = new Map(devilFruits.map(df => [df.id, df.id]));
|
|
|
|
|
const devilFruitMap = new Map<string, string>(devilFruits.map(df => [df.id, df.id]));
|
|
|
|
|
characters.forEach(char => {
|
|
|
|
|
if (char.devilFruitUrl) {
|
|
|
|
|
const normalizedId = normalizeId(char.devilFruitUrl);
|