refactor: update package.json and scripts for One Piece scraper
- Changed the scrape script to use tsx for TypeScript execution. - Added new TypeScript script for scraping One Piece data. - Refactored package.json to include dependencies for the new scraper. - Removed unused dependencies and organized devDependencies. feat: implement One Piece data scraping functionality - Added functionality to scrape arcs, characters, and devil fruits from One Piece fandom. - Implemented data extraction methods for character attributes and devil fruit details. - Added JSON and CSV export capabilities for scraped data. fix: update auth configuration to handle missing secret - Modified the auth configuration to use a default secret if BETTER_AUTH_SECRET is not set. fix: improve database client initialization - Updated database client creation to use a local database file if DATABASE_URL is not set. chore: switch Svelte adapter to node - Changed Svelte adapter from auto to node for better server-side rendering support.
This commit is contained in:
952
scripts/scrape-onepiece.ts
Normal file
952
scripts/scrape-onepiece.ts
Normal file
@@ -0,0 +1,952 @@
|
||||
import * as cheerio from 'cheerio';
|
||||
import fs from 'fs';
|
||||
import { createObjectCsvWriter } from 'csv-writer';
|
||||
|
||||
// Type definitions

/** A story arc of the manga, parsed from the "Chapitres et Tomes" wiki page. */
interface Arc {
  id: string; // normalized slug derived from the wiki URL (see normalizeId)
  name: string; // arc name with the leading "Arc " prefix removed
  startChapter: number;
  endChapter: number | null; // null when the arc is still ongoing
  url: string; // wiki path relative to FANDOM_BASE_URL
}

/** A fully scraped character record. */
interface Character {
  id: string; // normalized from the canonical URL + display name
  name: string;
  gender: string | null; // 'Male' | 'Female' | null, derived from page categories
  age: number | null;
  height: number | null; // NOTE(review): units vary with source markup — see extractHeight
  origin: string | null;
  devilFruitId: string | null;
  devilFruitUrl: string | null; // wiki path of the fruit page, if any
  affiliations: string[];
  bounty: number | null; // extractBounty currently yields 0 (not null) when absent
  hakiObservation: boolean;
  hakiArmament: boolean;
  hakiConqueror: boolean;
  epithets: string[];
  firstAppearance: number; // first chapter number, taken from the character list page
  status: string | null; // 'Alive' | 'Dead' | null
  pictureUrl: string | null;
  url: string; // wiki path relative to FANDOM_BASE_URL
  arcId?: string; // filled in later by matching firstAppearance against the arc list
}

/** Lightweight entry parsed from the canon character list page. */
interface CharacterListItem {
  name: string;
  url: string; // wiki path relative to FANDOM_BASE_URL
  pictureUrl: string | null;
  chapter: string; // digits-only first-appearance chapter
}

/** Devil fruit reference extracted from a character infobox. */
interface DevilFruitData {
  devilFruitId: string;
  devilFruitUrl: string;
}

/** A scraped devil fruit record. */
interface DevilFruit {
  id: string;
  name: string;
  type: string | null; // 'Zoan' | 'Paramecia' | 'Logia' | null
  url: string;
}
|
||||
|
||||
// Base URL of the French One Piece fandom wiki; all scraped pages live under it.
const FANDOM_BASE_URL = 'https://onepiece.fandom.com/fr/wiki';
// Directory where JSON/CSV exports are written (created at startup if missing).
const OUTPUT_DIR = './scraped-data';
const MAX_RETRIES = 0; // Set to 0 to disable retries, can be increased if needed
const INITIAL_RETRY_DELAY = 1000; // ms; doubled on each retry (exponential backoff)
|
||||
|
||||
// Store cookies across requests (simulate browser behavior)
|
||||
const cookies = new Map<string, string>();
|
||||
|
||||
function getCookieHeader(): string {
|
||||
const cookieArray = Array.from(cookies.values()).map(c => c.split(';')[0]);
|
||||
return cookieArray.length > 0 ? cookieArray.join('; ') : '';
|
||||
}
|
||||
|
||||
function saveCookies(setCookieHeader: string | string[] | null): void {
|
||||
if (setCookieHeader) {
|
||||
const cookiesList = Array.isArray(setCookieHeader) ? setCookieHeader : [setCookieHeader];
|
||||
cookiesList.forEach(cookie => {
|
||||
const [nameValue] = cookie.split(';');
|
||||
const [name] = nameValue.split('=');
|
||||
if (name) cookies.set(name, cookie);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// Create output directory up-front so every save function below can write
// unconditionally.
if (!fs.existsSync(OUTPUT_DIR)) {
  fs.mkdirSync(OUTPUT_DIR, { recursive: true });
}
|
||||
|
||||
/**
|
||||
* Retry a fetch request with exponential backoff
|
||||
*/
|
||||
async function fetchWithRetry(url: string, options: RequestInit = {}, retries: number = 0): Promise<Response> {
|
||||
try {
|
||||
const headers: Record<string, string> = {
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:150.0) Firefox/150.0',
|
||||
'Accept-Language': 'en-US,en;q=0.9',
|
||||
'Accept-Encoding': 'gzip, deflate, br',
|
||||
'Connection': 'keep-alive',
|
||||
...((options.headers as Record<string, string>) || {})
|
||||
};
|
||||
|
||||
// Add cookies from previous requests
|
||||
const cookieHeader = getCookieHeader();
|
||||
if (cookieHeader) {
|
||||
headers['Cookie'] = cookieHeader;
|
||||
}
|
||||
|
||||
const response = await fetch(url, {
|
||||
headers,
|
||||
...options
|
||||
} as any);
|
||||
|
||||
// Save cookies from response
|
||||
const setCookie = response.headers.get('set-cookie');
|
||||
if (setCookie) {
|
||||
saveCookies(setCookie);
|
||||
}
|
||||
|
||||
// Check if response is OK (status 200-299)
|
||||
if (response.ok) {
|
||||
return response;
|
||||
}
|
||||
|
||||
// If not OK and we have retries left, retry
|
||||
if (retries < MAX_RETRIES) {
|
||||
const delay = INITIAL_RETRY_DELAY * Math.pow(2, retries);
|
||||
console.log(`⚠️ HTTP ${response.status} for ${url}, retrying in ${delay}ms...`);
|
||||
await new Promise(resolve => setTimeout(resolve, delay));
|
||||
return fetchWithRetry(url, options, retries + 1);
|
||||
}
|
||||
|
||||
// If we've exhausted retries, throw error
|
||||
throw new Error(`HTTP ${response.status}: ${response.statusText}`);
|
||||
} catch (error) {
|
||||
// If it's a network error and we have retries left, retry
|
||||
if (retries < MAX_RETRIES) {
|
||||
const delay = INITIAL_RETRY_DELAY * Math.pow(2, retries);
|
||||
console.log(`⚠️ Network error: ${(error as Error).message}, retrying in ${delay}ms...`);
|
||||
await new Promise(resolve => setTimeout(resolve, delay));
|
||||
return fetchWithRetry(url, options, retries + 1);
|
||||
}
|
||||
|
||||
// If we've exhausted retries, throw error
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Normalize string by decoding URI components, punctuation, and replacing spaces with underscores
|
||||
*/
|
||||
function normalizeId(str: string): string {
|
||||
return decodeURIComponent(str)
|
||||
.normalize('NFD')
|
||||
.replace(/[,:.\(\)]/g, '')
|
||||
.replace(/\s+/g, '_')
|
||||
.toLowerCase();
|
||||
}
|
||||
|
||||
/**
 * Fetch all arcs from the "Chapitres et Tomes" page of the One Piece fandom wiki.
 *
 * Scans every link in the chapter tables, keeps those whose text looks like
 * "Arc <name>(Ch.<start> à <end>)", and builds one Arc record per match.
 * Returns an empty array on any fetch/parse error (logged to stderr).
 */
async function fetchAllArcs(): Promise<Arc[]> {
  try {
    const url = `${FANDOM_BASE_URL}/Chapitres_et_Tomes`;
    console.log('Fetching arcs list...');
    const response = await fetchWithRetry(url);
    const data = await response.text();
    const $ = cheerio.load(data);
    const arcs: Arc[] = [];

    // Find all arc links in the table
    $('table.wikitable td a').each((index, element) => {
      const text = $(element).text().trim();
      const href = $(element).attr('href');

      // Check if it's an arc link (contains "Arc" and chapter info)
      if (text.includes('Arc') && text.includes('Ch.') && href) {
        // Extract arc name and chapter range
        // Example text: "Arc Ville d'Orange(Ch.8 à 21)[T.1 à 3]"
        console.log(`Processing arc link: ${text} (${href})`);
        // Group 1 = arc name, group 2 = start chapter, group 3 = end chapter
        // (left undefined for ongoing arcs).
        // NOTE(review): the `(?:...)` alternative is a literal any-three-chars
        // match (presumably intended for "à ..." open-ended ranges), not a
        // regex ellipsis token — confirm this is intended.
        const nameMatch = text.match(/^(.*?Arc.*?)\s*\(Ch\.(\d+)(?:\s*à\s*(?:(\d+)|(?:...)))?\)/);
        if (nameMatch) {
          let arcName = nameMatch[1].trim();
          // Remove "Arc " from the name
          arcName = arcName.replace(/^Arc\s+/i, '');

          const startChapter = parseInt(nameMatch[2]);
          const endChapter = nameMatch[3] ? parseInt(nameMatch[3]) : null;

          // Generate arc ID by normalizing the url
          let arcId = normalizeId(href.replace('/fr/wiki/', ''));
          // Remove "Arc_" from the id
          arcId = arcId.replace(/^arc_/i, '');

          arcs.push({
            id: arcId,
            name: arcName,
            startChapter,
            endChapter,
            url: href.replace('/fr/wiki/', '')
          });
        }
      }
    });

    console.log(`Found ${arcs.length} arcs.`);
    return arcs;
  } catch (error) {
    console.error('Error fetching arcs list:', (error as Error).message);
    return [];
  }
}
|
||||
|
||||
/**
|
||||
* Save arcs to JSON
|
||||
*/
|
||||
async function saveArcsToJSON(arcs: Arc[]): Promise<void> {
|
||||
const filepath = `${OUTPUT_DIR}/arcs.json`;
|
||||
fs.writeFileSync(filepath, JSON.stringify(arcs, null, 2));
|
||||
console.log(`✓ Saved to ${filepath}`);
|
||||
}
|
||||
|
||||
/**
 * Export the arc list as <OUTPUT_DIR>/arcs.csv via csv-writer.
 * Null endChapter values (ongoing arcs) are written as empty cells.
 */
async function saveArcsToCSV(arcs: Arc[]): Promise<void> {
  const filepath = `${OUTPUT_DIR}/arcs.csv`;
  const csvWriter = createObjectCsvWriter({
    path: filepath,
    header: [
      { id: 'id', title: 'ID' },
      { id: 'name', title: 'Name' },
      { id: 'startChapter', title: 'Start Chapter' },
      { id: 'endChapter', title: 'End Chapter' },
      { id: 'url', title: 'URL' }
    ],
  });

  // Coalesce null/undefined fields to empty strings so cells never read "null".
  const records = arcs
    .filter((arc) => arc !== null)
    .map((arc) => ({
      id: arc.id || '',
      name: arc.name || '',
      startChapter: arc.startChapter || '',
      endChapter: arc.endChapter || '',
      url: arc.url || ''
    }));

  await csvWriter.writeRecords(records);
  console.log(`✓ Saved to ${filepath}`);
}
|
||||
|
||||
/**
 * Fetch the list of canon characters from the One Piece fandom wiki.
 *
 * Parses every row of the list table into a CharacterListItem holding the
 * character's name, wiki path, thumbnail URL and first-appearance chapter.
 * Rows without a usable chapter number (characters merely mentioned, with no
 * proper page) are skipped. Returns [] on any fetch/parse error (logged).
 */
async function fetchAllCharactersUrl(): Promise<CharacterListItem[]> {
  try {
    const url = `${FANDOM_BASE_URL}/Liste_des_Personnages_Canon`;
    console.log('Fetching character list...');
    const response = await fetchWithRetry(url);
    const data = await response.text();
    const $ = cheerio.load(data);
    const characters: CharacterListItem[] = [];
    $('table.wikitable tbody tr').each((index, element) => {
      if (index === 0) return; // Skip header row
      // Lazy-loaded images keep the real URL in data-src; fall back to src.
      let charpictureUrl = $(element).find('td:nth-child(1) a img').attr('data-src') || $(element).find('td:nth-child(1) a img').attr('src');
      let charUrl = $(element).find('td:nth-child(2) a').attr('href');
      let charName = $(element).find('td:nth-child(2) a').text().trim();
      let charChapter = $(element).find('td:nth-child(3)').text().trim();

      // Remove parentheses and their content from chapter info (e.g. "1 (flashback)" becomes "1")
      charChapter = charChapter.replace(/\([^)]*\)/g, '');
      // Then keep digits only.
      charChapter = charChapter.replace(/\D/g, '');

      // If charChapter is empty, skip the character as it means they don't have a proper page and are just mentioned in the list
      if (!charChapter) {
        return;
      }

      if (charUrl) {
        // Store the wiki path relative to FANDOM_BASE_URL.
        charUrl = charUrl.replace('/fr/wiki/', '');
        characters.push({
          name: charName,
          url: charUrl,
          pictureUrl: charpictureUrl || null,
          chapter: charChapter,
        });
      }
    });
    console.log(`Found ${characters.length} characters.`);
    return characters;
  } catch (error) {
    console.error('Error fetching character list:', (error as Error).message);
    return [];
  }
}
|
||||
|
||||
/**
 * Fetch and parse one character page.
 *
 * Follows redirects to the canonical page, then pulls every attribute the
 * Character record needs from the infobox and page categories via the
 * extract* helpers below.
 *
 * @param characterUrl - Wiki path (relative to FANDOM_BASE_URL) from the list page.
 * @param characterName - Display name from the list page (fallback when the page has no title).
 * @param characterpictureUrl - Thumbnail URL from the list page; may be null.
 * @param characterChapter - Digits-only first-appearance chapter from the list page.
 * @returns The parsed Character, or null when fetching/parsing failed (logged).
 */
async function fetchCharacter(
  characterUrl: string,
  characterName: string,
  characterpictureUrl: string | null,
  characterChapter: string
): Promise<Character | null> {
  try {
    console.log(`Fetching: ${characterName}...`);

    const response = await fetchWithRetry(`${FANDOM_BASE_URL}/${characterUrl}`, {
      redirect: 'follow'
    });

    // Use final URL after redirects (canonical character page)
    let finalCharacterUrl = characterUrl;
    let finalCharacterId = normalizeId(characterUrl);
    try {
      const finalUrl = new URL(response.url);
      const characterUrlPath = finalUrl.pathname.replace('/fr/wiki/', '');
      if (characterUrlPath) {
        finalCharacterUrl = characterUrlPath;
        finalCharacterId = normalizeId(characterUrlPath);
      }
    } catch {
      // NOTE(review): this branch only runs when `new URL(response.url)` throws,
      // yet it re-checks response.ok — and fetchWithRetry already throws on
      // non-ok responses, so this looks like dead/misplaced error handling.
      // Confirm before restructuring.
      // If HTTP is not ok or redirected URL, throw an error to be caught in the outer block
      if (!response.ok) {
        throw new Error(`HTTP ${response.status}: ${response.statusText}`);
      }
    }

    const data = await response.text();

    const $ = cheerio.load(data);

    // Extract character name; fall back to the list-page name with underscores undone.
    const name = $('h1.mw-page-title-main').text().trim() || characterName.replace(/_/g, ' ');

    // Generate character ID from URL + name combination
    finalCharacterId = normalizeId(finalCharacterUrl + '_' + name);

    // Gender comes from the page's category links, not the infobox.
    let gender: string | null = null;
    if ($('.page-header__categories a[title="Catégorie:Personnages Masculins"]').length > 0) {
      gender = 'Male';
    } else if ($('.page-header__categories a[title="Catégorie:Personnages Féminins"]').length > 0) {
      gender = 'Female';
    }

    // Extract age
    const age = extractAge($);

    // Extract affiliations
    const affiliations = extractAffiliations($);

    // Extract epithets
    const epithets = extractEpithets($);

    // Extract devil fruit (follows redirects, hence the await)
    const devilFruitData = await extractDevilFruit($);
    const devilFruitId = devilFruitData?.devilFruitId || null;
    const devilFruitUrl = devilFruitData?.devilFruitUrl || null;

    // Haki mastery is flagged through category membership, one per haki type.
    const hakiObservation = $('.page-header__categories a[title="Catégorie:Utilisateurs du Haki de l\'observation"]').length > 0;
    const hakiArmament = $('.page-header__categories a[title="Catégorie:Utilisateurs du Haki de l\'armement"]').length > 0;
    const hakiConqueror = $('.page-header__categories a[title="Catégorie:Utilisateurs du Haki des rois"]').length > 0;

    // Extract bounty
    const bounty = extractBounty($);

    // Extract height
    const height = extractHeight($);

    // Use chapter from character list, cast to int
    let firstAppearance = parseInt(characterChapter);

    // Extract origin
    const origin = extractOrigin($);

    // Extract status
    const status = extractStatus($);

    // Extract image URL and clean it: the wiki uses a shared placeholder image
    // when no portrait exists, which counts as "no picture".
    let pictureUrl = characterpictureUrl;
    if (pictureUrl && pictureUrl.includes('Image_Non_Disponible')) {
      pictureUrl = null;
    }

    return {
      id: finalCharacterId,
      name,
      gender,
      age,
      height,
      origin,
      devilFruitId,
      devilFruitUrl,
      affiliations,
      bounty,
      hakiObservation,
      hakiArmament,
      hakiConqueror,
      epithets,
      firstAppearance,
      status,
      pictureUrl,
      url: finalCharacterUrl
    };
  } catch (error) {
    console.error(`Error fetching ${characterName}:`, (error as Error).message);
    return null;
  }
}
|
||||
|
||||
|
||||
/**
|
||||
* Extract age from infobox
|
||||
*/
|
||||
function extractAge($: cheerio.CheerioAPI): number | null {
|
||||
const div = $('[data-source="âge"] .pi-data-value');
|
||||
if (div.length === 0) return null;
|
||||
|
||||
let text = div.html();
|
||||
if (!text) return null;
|
||||
|
||||
// Remove all sup blocks (citations)
|
||||
text = text.replace(/<sup[^>]*>.*?<\/sup>/gi, '');
|
||||
|
||||
// Get the last element and extract only digits
|
||||
const parts = text.split('<br');
|
||||
const lastPart = parts[parts.length - 1];
|
||||
let cleanText = lastPart.replace(/<[^>]*>/g, '').trim();
|
||||
|
||||
// Remove content with parentheses
|
||||
cleanText = cleanText.replace(/\([^)]*\)/g, '');
|
||||
|
||||
const digitsOnly = cleanText.replace(/\D/g, '');
|
||||
return parseInt(digitsOnly) || null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract affiliations from infobox
|
||||
*/
|
||||
function extractAffiliations($: cheerio.CheerioAPI): string[] {
|
||||
const div = $('[data-source="affiliation"] .pi-data-value');
|
||||
if (div.length === 0) return [];
|
||||
|
||||
const cleanedDiv = div.clone();
|
||||
cleanedDiv.find('sup').remove();
|
||||
|
||||
let text = cleanedDiv.html();
|
||||
if (!text) return [];
|
||||
|
||||
// Extract all link values
|
||||
const linkValues = cleanedDiv.find('a').map((i, el) => $(el).text().trim()).get();
|
||||
if (linkValues.length > 0) {
|
||||
return linkValues;
|
||||
}
|
||||
|
||||
// Fallback to parsing text
|
||||
const cleanText = text.replace(/<[^>]*>/g, '').trim();
|
||||
const parts = cleanText.split(/\s*\n\s*|\s*;\s*|\s*,\s*/).filter(Boolean);
|
||||
return parts.length > 0 ? parts : [];
|
||||
}
|
||||
|
||||
/**
 * Extract epithets from infobox
 * Epithets are always between double quotes (straight, curly, or guillemets),
 * so each quoted span becomes one epithet. Citation <sup> blocks are removed
 * first. Returns [] when the row is absent or nothing is quoted.
 */
function extractEpithets($: cheerio.CheerioAPI): string[] {
  const div = $('[data-source="épithète"] .pi-data-value');
  if (div.length === 0) return [];

  // Work on a clone so removing citation markers doesn't mutate the document.
  const cleanedDiv = div.clone();
  cleanedDiv.find('sup').remove();

  let text = cleanedDiv.text();
  if (!text) return [];

  // Extract all text between double quotes (both straight and curly quotes)
  const matches = text.match(/["«"]([^"»"]+)["»"]/g);
  if (!matches) return [];

  // Remove the quotes and trim
  const epithets = matches.map(match =>
    match.replace(/^["«"]|["»"]$/g, '').trim()
  ).filter(Boolean);

  return epithets;
}
|
||||
|
||||
/**
 * Extract the character's devil fruit from the infobox "dfnom" row.
 *
 * Follows the linked fruit page's redirects so the returned URL/ID refer to
 * the canonical page; falls back to the raw link target when that request
 * fails. Returns both the normalized ID and the wiki path, or null when the
 * row has no usable wiki link.
 */
async function extractDevilFruit($: cheerio.CheerioAPI): Promise<DevilFruitData | null> {
  const link = $('[data-source="dfnom"] .pi-data-value a').first();
  if (link.length === 0) return null;

  const href = link.attr('href');
  // Ignore external or malformed links — only same-wiki paths qualify.
  if (!href || !href.startsWith('/fr/wiki/')) return null;

  const cleanUrl = href.replace('/fr/wiki/', '');

  try {
    // Fetch the page to follow redirects
    const response = await fetchWithRetry(`${FANDOM_BASE_URL}/${cleanUrl}`, {
      redirect: 'follow' // Explicitly follow redirects
    });

    // Use the final URL after redirects
    const finalUrl = new URL(response.url);
    const pathname = finalUrl.pathname;
    const finalPath = pathname.replace('/fr/wiki/', '');

    if (finalPath) {
      return {
        devilFruitId: normalizeId(finalPath),
        devilFruitUrl: finalPath
      };
    }
  } catch (error) {
    console.error(`Error fetching devil fruit page: ${(error as Error).message}`);
  }

  // Fallback to the original href
  return {
    devilFruitId: normalizeId(cleanUrl),
    devilFruitUrl: cleanUrl
  };
}
|
||||
|
||||
/**
|
||||
* Extract bounty from infobox
|
||||
*/
|
||||
function extractBounty($: cheerio.CheerioAPI): number | null {
|
||||
const div = $('[data-source="prime"] .pi-data-value');
|
||||
if (div.length === 0) return 0;
|
||||
|
||||
let text = div.html();
|
||||
if (!text) return 0;
|
||||
|
||||
// Remove all sup blocks (citations)
|
||||
text = text.replace(/<sup[^>]*>.*?<\/sup>/gi, '');
|
||||
|
||||
// Extract the first value before any <br> tag
|
||||
const firstValue = text.split('<br')[0].trim();
|
||||
let cleanText = firstValue.replace(/<[^>]*>/g, '').trim();
|
||||
|
||||
// Check if cleanText contains digits
|
||||
if (!/\d/.test(cleanText)) {
|
||||
// If no digits, try second value after <br>
|
||||
const secondValue = text.split('<br>')[1];
|
||||
if (secondValue) {
|
||||
cleanText = secondValue.replace(/<[^>]*>/g, '').trim();
|
||||
}
|
||||
}
|
||||
|
||||
// Remove all non-digits
|
||||
cleanText = cleanText.replace(/\D/g, '');
|
||||
|
||||
return cleanText ? parseInt(cleanText) : 0;
|
||||
}
|
||||
|
||||
/**
 * Extract the character's height from the infobox "taille" row.
 *
 * Prefers the content of a <p> tag when present, otherwise the value after
 * the last <br>. Citation <sup> blocks, tags and parenthesised notes are
 * stripped before parsing.
 *
 * NOTE(review): units are inconsistent — a "cm" value parses to centimeters
 * (e.g. 172) while an "m" value parses to meters (e.g. "1m72" -> 1.72), so
 * Character.height mixes scales. Confirm intent before relying on the value.
 */
function extractHeight($: cheerio.CheerioAPI): number | null {
  const div = $('[data-source="taille"] .pi-data-value');
  if (div.length === 0) return null;

  let text = div.html();
  if (!text) return null;

  // Remove all sup blocks (citations)
  text = text.replace(/<sup[^>]*>.*?<\/sup>/gi, '');

  // Check if there's a <p> tag - if yes, use content from <p>
  let content;
  const pMatch = text.match(/<p[^>]*>(.*?)<\/p>/i);
  if (pMatch) {
    // Extract content from the <p> tag
    content = pMatch[1];
  } else {
    // Use the last value method (after any <br> tag)
    // NOTE(review): splits on the literal '<br>' only; a self-closing '<br/>'
    // is not treated as a separator here (unlike extractBounty/extractOrigin).
    content = text.split('<br>').pop();
  }

  let cleanText = (content || '').replace(/<[^>]*>/g, '').trim();

  // Remove content with parentheses
  cleanText = cleanText.replace(/\([^)]*\)/g, '');

  // Normalize units for meters or centimeters
  const normalized = cleanText.toLowerCase().replace(/\s/g, '');
  if (normalized.includes('cm')) {
    const digitsOnly = normalized.replace(/\D/g, '');
    return parseFloat(digitsOnly) || null;
  }

  if (normalized.includes('m')) {
    // e.g. "1m72" -> ['1', '72'] -> parseFloat('1.72')
    const parts = normalized.split('m').filter(Boolean);
    return parts.length > 0 ? parseFloat(parts.join('.')) : null;
  }

  return normalized.length > 0 ? parseFloat(normalized.replace(/\D/g, '')) : null;
}
|
||||
|
||||
/**
|
||||
* Extract origin from infobox
|
||||
*/
|
||||
function extractOrigin($: cheerio.CheerioAPI): string | null {
|
||||
const div = $('[data-source="origine"] .pi-data-value');
|
||||
if (div.length === 0) return null;
|
||||
|
||||
let text = div.html();
|
||||
if (!text) return null;
|
||||
|
||||
// Remove all sup blocks (citations)
|
||||
text = text.replace(/<sup[^>]*>.*?<\/sup>/gi, '');
|
||||
|
||||
// Extract the first value before any <br> tag
|
||||
const firstValue = text.split('<br')[0].trim();
|
||||
let cleanText = firstValue.replace(/<[^>]*>/g, '').trim();
|
||||
|
||||
// Remove content with parentheses
|
||||
cleanText = cleanText.replace(/\([^)]*\)/g, '').trim();
|
||||
|
||||
return cleanText || null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract status from infobox
|
||||
*/
|
||||
function extractStatus($: cheerio.CheerioAPI): string | null {
|
||||
const div = $('[data-source="statut"] .pi-data-value');
|
||||
if (div.length === 0) return null;
|
||||
|
||||
const statusText = div.text().trim().toLowerCase();
|
||||
|
||||
if (statusText.includes('vivant')) {
|
||||
return 'Alive';
|
||||
} else if (statusText.includes('décédé')) {
|
||||
return 'Dead';
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Save data to JSON
|
||||
*/
|
||||
async function saveToJSON(characters: Character[]): Promise<void> {
|
||||
const filepath = `${OUTPUT_DIR}/characters.json`;
|
||||
fs.writeFileSync(filepath, JSON.stringify(characters, null, 2));
|
||||
console.log(`✓ Saved to ${filepath}`);
|
||||
}
|
||||
|
||||
/**
 * Export all characters as <OUTPUT_DIR>/characters.csv via csv-writer.
 * Array fields are joined with ', ', booleans become 1/0, null/undefined
 * fields become empty cells, and a missing bounty defaults to 0.
 */
async function saveToCSV(characters: Character[]): Promise<void> {
  const filepath = `${OUTPUT_DIR}/characters.csv`;
  const csvWriter = createObjectCsvWriter({
    path: filepath,
    header: [
      { id: 'id', title: 'ID' },
      { id: 'name', title: 'Name' },
      { id: 'gender', title: 'Gender' },
      { id: 'age', title: 'Age' },
      { id: 'height', title: 'Height' },
      { id: 'origin', title: 'Origin' },
      { id: 'status', title: 'Status' },
      { id: 'epithets', title: 'Epithets' },
      { id: 'devilFruitId', title: 'Devil Fruit ID' },
      { id: 'affiliations', title: 'Affiliations' },
      { id: 'bounty', title: 'Bounty' },
      { id: 'hakiObservation', title: 'Haki Observation' },
      { id: 'hakiArmament', title: 'Haki Armament' },
      { id: 'hakiConqueror', title: 'Haki Conqueror' },
      { id: 'firstAppearance', title: 'First Appearance' },
      { id: 'arcId', title: 'Arc ID' },
      { id: 'pictureUrl', title: 'Image URL' },
      { id: 'url', title: 'Fandom URL' }
    ],
  });

  // Flatten each Character into CSV-friendly scalar cells.
  const records = characters
    .filter((c) => c !== null)
    .map((c) => ({
      id: c.id || '',
      name: c.name || '',
      gender: c.gender || '',
      age: c.age || '',
      height: c.height || '',
      origin: c.origin || '',
      status: c.status || '',
      epithets: Array.isArray(c.epithets) ? c.epithets.join(', ') : (c.epithets || ''),
      devilFruitId: c.devilFruitId || '',
      affiliations: Array.isArray(c.affiliations) ? c.affiliations.join(', ') : (c.affiliations || ''),
      bounty: c.bounty ?? 0,
      hakiObservation: c.hakiObservation ? 1 : 0,
      hakiArmament: c.hakiArmament ? 1 : 0,
      hakiConqueror: c.hakiConqueror ? 1 : 0,
      firstAppearance: c.firstAppearance || '',
      arcId: c.arcId || '',
      pictureUrl: c.pictureUrl || '',
      url: c.url || ''
    }));

  await csvWriter.writeRecords(records);
  console.log(`✓ Saved to ${filepath}`);
}
|
||||
|
||||
/**
 * Fetch and parse one devil fruit page.
 *
 * @param devilFruitUrl - Wiki path relative to FANDOM_BASE_URL.
 * @param devilFruitId - Pre-normalized identifier to store on the record.
 * @returns The parsed DevilFruit, or null when the request/parse failed (logged).
 */
async function fetchDevilFruit(devilFruitUrl: string, devilFruitId: string): Promise<DevilFruit | null> {
  try {
    console.log(`Fetching devil fruit: ${devilFruitId}...`);

    const response = await fetchWithRetry(`${FANDOM_BASE_URL}/${devilFruitUrl}`);
    const data = await response.text();
    const $ = cheerio.load(data);

    // NOTE(review): character pages read 'h1.mw-page-title-main' while this
    // uses a span selector — confirm fruit pages really title with a span.
    const name = $('span.mw-page-title-main').text().trim();

    // Map the infobox "type" row onto the three canonical fruit classes.
    let type: string | null = null;
    const typeDiv = $('[data-source="type"] .pi-data-value');
    if (typeDiv.length > 0) {
      const typeText = typeDiv.text().trim().toLowerCase();
      if (typeText.includes('zoan')) {
        type = 'Zoan';
      } else if (typeText.includes('paramecia')) {
        type = 'Paramecia';
      } else if (typeText.includes('logia')) {
        type = 'Logia';
      }
    }

    return {
      id: devilFruitId,
      name,
      type,
      url: devilFruitUrl
    };
  } catch (error) {
    console.error(`Error fetching devil fruit ${devilFruitUrl}:`, (error as Error).message);
    return null;
  }
}
|
||||
|
||||
/**
|
||||
* Save devil fruits to JSON
|
||||
*/
|
||||
async function saveDevilFruitsToJSON(devilFruits: DevilFruit[]): Promise<void> {
|
||||
const filepath = `${OUTPUT_DIR}/devil-fruits.json`;
|
||||
fs.writeFileSync(filepath, JSON.stringify(devilFruits, null, 2));
|
||||
console.log(`✓ Saved to ${filepath}`);
|
||||
}
|
||||
|
||||
/**
 * Export the devil fruit list as <OUTPUT_DIR>/devil-fruits.csv via csv-writer.
 * Null types are written as empty cells.
 */
async function saveDevilFruitsToCSV(devilFruits: DevilFruit[]): Promise<void> {
  const filepath = `${OUTPUT_DIR}/devil-fruits.csv`;
  const csvWriter = createObjectCsvWriter({
    path: filepath,
    header: [
      { id: 'id', title: 'ID' },
      { id: 'name', title: 'Name' },
      { id: 'type', title: 'Type' },
      { id: 'url', title: 'URL' }
    ],
  });

  // Coalesce null/undefined fields to empty strings so cells never read "null".
  const records = devilFruits
    .filter((df) => df !== null)
    .map((df) => ({
      id: df.id || '',
      name: df.name || '',
      type: df.type || '',
      url: df.url || ''
    }));

  await csvWriter.writeRecords(records);
  console.log(`✓ Saved to ${filepath}`);
}
|
||||
|
||||
/**
|
||||
* Main execution
|
||||
*/
|
||||
async function main(): Promise<void> {
|
||||
const format = process.argv[2] || 'all'; // json, csv, or all
|
||||
|
||||
console.log(`\nOne Piece Scraper - Mode: ${format}\n`);
|
||||
|
||||
// Step 1: Scraping Arcs
|
||||
console.log('=== Step 1: Scraping Arcs ===\n');
|
||||
const arcsList = await fetchAllArcs();
|
||||
|
||||
if (arcsList.length > 0) {
|
||||
// Display arcs in table format
|
||||
arcsList.forEach((arc) => {
|
||||
console.table({
|
||||
ID: arc.id,
|
||||
Name: arc.name,
|
||||
StartChapter: arc.startChapter,
|
||||
EndChapter: arc.endChapter || 'Ongoing',
|
||||
URL: arc.url
|
||||
});
|
||||
});
|
||||
|
||||
console.log(`\n✓ Found ${arcsList.length} arcs\n`);
|
||||
|
||||
if (format === 'json' || format === 'all') {
|
||||
await saveArcsToJSON(arcsList);
|
||||
}
|
||||
if (format === 'csv' || format === 'all') {
|
||||
await saveArcsToCSV(arcsList);
|
||||
}
|
||||
} else {
|
||||
console.warn('No arcs found, continuing...\n');
|
||||
}
|
||||
|
||||
// Step 2: Scraping Characters
|
||||
console.log('=== Step 1: Scraping Characters ===\n');
|
||||
const characterList = await fetchAllCharactersUrl();
|
||||
|
||||
if (characterList.length === 0) {
|
||||
console.error('No characters found. Exiting.');
|
||||
return;
|
||||
}
|
||||
|
||||
const characters: Character[] = [];
|
||||
const devilFruitUrls = new Set<string>();
|
||||
let failedCharacters: CharacterListItem[] = [...characterList];
|
||||
|
||||
while (failedCharacters.length > 0) {
|
||||
const nextFailedCharacters: CharacterListItem[] = [];
|
||||
console.log(`\nFetching ${failedCharacters.length} characters...`);
|
||||
|
||||
for (let i = 0; i < failedCharacters.length; i++) {
|
||||
const char = failedCharacters[i];
|
||||
const data = await fetchCharacter(char.url, char.name, char.pictureUrl, char.chapter);
|
||||
|
||||
if (data) {
|
||||
console.table({
|
||||
ID: data.id,
|
||||
Name: data.name,
|
||||
Gender: data.gender,
|
||||
Age: data.age,
|
||||
Status: data.status,
|
||||
Epithets: data.epithets.join(', '),
|
||||
Affiliations: data.affiliations.join(', '),
|
||||
DevilFruitId: data.devilFruitId,
|
||||
DevilFruitUrl: data.devilFruitUrl,
|
||||
HakiObservation: data.hakiObservation ? 'Yes' : 'No',
|
||||
HakiArmament: data.hakiArmament ? 'Yes' : 'No',
|
||||
HakiConqueror: data.hakiConqueror ? 'Yes' : 'No',
|
||||
Height: data.height,
|
||||
Bounty: data.bounty,
|
||||
Origin: data.origin,
|
||||
FirstAppearance: data.firstAppearance,
|
||||
pictureUrl: data.pictureUrl,
|
||||
FandomURL: data.url
|
||||
});
|
||||
|
||||
// Collect devil fruit URLs
|
||||
if (data.devilFruitUrl) {
|
||||
devilFruitUrls.add(data.devilFruitUrl);
|
||||
}
|
||||
|
||||
// Add arc IDs to character data
|
||||
if (data.firstAppearance) {
|
||||
const arc = arcsList.find(a => a.startChapter <= data.firstAppearance && (a.endChapter === null || a.endChapter >= data.firstAppearance));
|
||||
if (arc) {
|
||||
data.arcId = arc.id;
|
||||
}
|
||||
}
|
||||
|
||||
characters.push(data);
|
||||
} else {
|
||||
// Add to retry list and wait before next character
|
||||
nextFailedCharacters.push(char);
|
||||
await new Promise(resolve => setTimeout(resolve, 1000));
|
||||
}
|
||||
}
|
||||
|
||||
failedCharacters = nextFailedCharacters;
|
||||
if (failedCharacters.length > 0) {
|
||||
console.log(`⚠️ ${failedCharacters.length} characters failed. Retrying...`);
|
||||
}
|
||||
}
|
||||
|
||||
console.log(`\n✓ Scraped ${characters.length} characters\n`);
|
||||
console.log(`✓ Found ${devilFruitUrls.size} unique devil fruits\n`);
|
||||
|
||||
// Step 3: Scraping Devil Fruits
|
||||
console.log('=== Step 2: Scraping Devil Fruits ===\n');
|
||||
|
||||
if (devilFruitUrls.size === 0) {
|
||||
console.warn('No devil fruits found from characters, skipping...\n');
|
||||
} else {
|
||||
const devilFruits: DevilFruit[] = [];
|
||||
const devilFruitUrlArray = Array.from(devilFruitUrls);
|
||||
|
||||
for (let i = 0; i < devilFruitUrlArray.length; i++) {
|
||||
const url = devilFruitUrlArray[i];
|
||||
const data = await fetchDevilFruit(url, normalizeId(url));
|
||||
|
||||
if (data) {
|
||||
console.table({
|
||||
ID: data.id,
|
||||
Name: data.name,
|
||||
Type: data.type,
|
||||
URL: data.url
|
||||
});
|
||||
|
||||
devilFruits.push(data);
|
||||
}
|
||||
}
|
||||
|
||||
console.log(`\n✓ Scraped ${devilFruits.length} devil fruits\n`);
|
||||
|
||||
if (format === 'json' || format === 'all') {
|
||||
await saveDevilFruitsToJSON(devilFruits);
|
||||
}
|
||||
if (format === 'csv' || format === 'all') {
|
||||
await saveDevilFruitsToCSV(devilFruits);
|
||||
}
|
||||
|
||||
// Update characters with normalized devil fruit IDs
|
||||
const devilFruitMap = new Map<string, string>(devilFruits.map(df => [df.id, df.id]));
|
||||
characters.forEach(char => {
|
||||
if (char.devilFruitUrl) {
|
||||
const normalizedId = normalizeId(char.devilFruitUrl);
|
||||
char.devilFruitId = devilFruitMap.get(normalizedId) || normalizedId;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
// Save characters after devil fruit IDs are updated
|
||||
if (format === 'json' || format === 'all') {
|
||||
await saveToJSON(characters);
|
||||
}
|
||||
if (format === 'csv' || format === 'all') {
|
||||
await saveToCSV(characters);
|
||||
}
|
||||
|
||||
console.log('\n✓ Done!\n');
|
||||
}
|
||||
|
||||
// Entry point: run the scraper; any unhandled rejection is logged to stderr.
main().catch(console.error);
|
||||
Reference in New Issue
Block a user