refactor: update package.json and scripts for One Piece scraper

- Changed the scrape script to use tsx for TypeScript execution.
- Added new TypeScript script for scraping One Piece data.
- Refactored package.json to include dependencies for the new scraper.
- Removed unused dependencies and organized devDependencies.

feat: implement One Piece data scraping functionality

- Added functionality to scrape arcs, characters, and devil fruits from One Piece fandom.
- Implemented data extraction methods for character attributes and devil fruit details.
- Added JSON and CSV export capabilities for scraped data.

fix: update auth configuration to handle missing secret

- Modified the auth configuration to use a default secret if BETTER_AUTH_SECRET is not set.

fix: improve database client initialization

- Updated database client creation to use a local database file if DATABASE_URL is not set.

chore: switch Svelte adapter to node

- Changed Svelte adapter from auto to node for better server-side rendering support.
This commit is contained in:
2026-03-01 15:17:17 +01:00
parent b8b3f8bddc
commit 56bd6f5545
10 changed files with 1976 additions and 666 deletions

View File

@@ -1,32 +1,71 @@
import * as cheerio from 'cheerio';
import fs from 'fs';
import https from 'https';
import { createObjectCsvWriter } from 'csv-writer';
// Type definitions
/**
 * A story arc scraped from the wiki's chapter/volume listing page
 * (see fetchAllArcs).
 */
interface Arc {
// Normalized arc identifier.
id: string;
// Human-readable arc name, e.g. "Arc Ville d'Orange".
name: string;
// First chapter of the arc.
startChapter: number;
// Last chapter of the arc, or null for an open-ended arc
// (treated as "no upper bound" when matching characters to arcs).
endChapter: number | null;
// Wiki URL (path) of the arc page.
url: string;
}
/**
 * A character scraped from an individual wiki page. Most fields come from
 * the page's portable infobox and are null when the infobox lacks them.
 */
interface Character {
// Normalized id derived from the character's wiki URL (via normalizeId).
id: string;
// Character name from the page.
name: string;
// 'Male' or 'Female' when the page carries the matching category link; otherwise null.
gender: string | null;
// Age parsed from the "âge" infobox field.
age: number | null;
// Height parsed from the "taille" infobox field; unit depends on the source
// text (cm digits vs. metre values) — see extractHeight; verify before comparing values.
height: number | null;
// Place of origin from the "origine" infobox field.
origin: string | null;
// Normalized id and wiki URL of the character's devil fruit, if any.
devilFruitId: string | null;
devilFruitUrl: string | null;
// Affiliations from the "affiliation" infobox field.
affiliations: string[];
// Bounty as digits parsed from the "prime" infobox field.
bounty: number | null;
// Haki ability flags (extraction not visible in this chunk — confirm against scraper).
hakiObservation: boolean;
hakiArmament: boolean;
hakiConqueror: boolean;
// Epithets (quoted nicknames) from the "épithète" infobox field.
epithets: string[];
// Chapter of first appearance; used to match the character to an arc.
firstAppearance: number;
// Status from the "statut" infobox field.
status: string | null;
// Thumbnail/portrait URL, if found.
pictureUrl: string | null;
// Wiki URL (path) of the character page.
url: string;
// Id of the arc whose chapter range contains firstAppearance, when one matches.
arcId?: string;
}
/**
 * One row from the canon character list table, before the character's
 * own page has been fetched (see fetchAllCharactersUrl).
 */
interface CharacterListItem {
// Character name from the list table.
name: string;
// Link to the character's wiki page.
url: string;
// Thumbnail URL from the list table, if present.
pictureUrl: string | null;
// Raw chapter text from the list (first appearance), kept as a string.
chapter: string;
}
/**
 * Minimal devil-fruit reference extracted from a character's infobox
 * (see extractDevilFruit): normalized id plus the fruit's wiki URL.
 */
interface DevilFruitData {
devilFruitId: string;
devilFruitUrl: string;
}
/**
 * A devil fruit scraped from its own wiki page (see fetchDevilFruit).
 */
interface DevilFruit {
// Normalized fruit identifier.
id: string;
// Fruit name taken from the page title.
name: string;
// Fruit type from the "type" infobox field, null when absent.
type: string | null;
// Wiki URL (path) of the fruit page.
url: string;
}
// Base URL of the French One Piece fandom wiki; all scraped pages live under /fr/wiki.
const FANDOM_BASE_URL = 'https://onepiece.fandom.com/fr/wiki';
// Directory where JSON/CSV output files are written.
const OUTPUT_DIR = './scraped-data';
const MAX_RETRIES = 0; // Set to 0 to disable retries, can be increased if needed
// Base delay (ms) for exponential backoff between retry attempts.
const INITIAL_RETRY_DELAY = 1000;
// Reuse sockets across requests like a normal browser: keepAlive maintains a
// connection pool so repeated requests to the wiki avoid re-handshaking TLS.
// NOTE(review): the original options included `maxConnections`, which is not a
// valid https.AgentOptions key (the per-host cap is `maxSockets`); it was an
// excess property ignored at runtime and has been removed.
const httpsAgent = new https.Agent({
  keepAlive: true,
  keepAliveMsecs: 1000,
  maxFreeSockets: 10,
  maxSockets: 50,
  timeout: 30000
});
// Store cookies across requests (simulate browser behavior)
const cookies = new Map();
const cookies = new Map<string, string>();
function getCookieHeader() {
/**
 * Build the Cookie request-header value from every stored cookie.
 * Only the leading name=value part of each stored Set-Cookie string is
 * kept (attributes after the first ';' are dropped); pairs are joined
 * with '; ', and an empty string is returned when no cookies are stored.
 */
function getCookieHeader(): string {
  const pairs: string[] = [];
  for (const stored of cookies.values()) {
    pairs.push(stored.split(';')[0]);
  }
  return pairs.length > 0 ? pairs.join('; ') : '';
}
function saveCookies(setCookieHeader) {
function saveCookies(setCookieHeader: string | string[] | null): void {
if (setCookieHeader) {
const cookiesList = Array.isArray(setCookieHeader) ? setCookieHeader : [setCookieHeader];
cookiesList.forEach(cookie => {
@@ -45,14 +84,14 @@ if (!fs.existsSync(OUTPUT_DIR)) {
/**
* Retry a fetch request with exponential backoff
*/
async function fetchWithRetry(url, options = {}, retries = 0) {
async function fetchWithRetry(url: string, options: RequestInit = {}, retries: number = 0): Promise<Response> {
try {
const headers = {
const headers: Record<string, string> = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:150.0) Firefox/150.0',
'Accept-Language': 'en-US,en;q=0.9',
'Accept-Encoding': 'gzip, deflate, br',
'Connection': 'keep-alive',
...options.headers
...((options.headers as Record<string, string>) || {})
};
// Add cookies from previous requests
@@ -63,9 +102,8 @@ async function fetchWithRetry(url, options = {}, retries = 0) {
const response = await fetch(url, {
headers,
agent: httpsAgent,
...options
});
} as any);
// Save cookies from response
const setCookie = response.headers.get('set-cookie');
@@ -92,7 +130,7 @@ async function fetchWithRetry(url, options = {}, retries = 0) {
// If it's a network error and we have retries left, retry
if (retries < MAX_RETRIES) {
const delay = INITIAL_RETRY_DELAY * Math.pow(2, retries);
console.log(`⚠️ Network error: ${error.message}, retrying in ${delay}ms...`);
console.log(`⚠️ Network error: ${(error as Error).message}, retrying in ${delay}ms...`);
await new Promise(resolve => setTimeout(resolve, delay));
return fetchWithRetry(url, options, retries + 1);
}
@@ -106,7 +144,7 @@ async function fetchWithRetry(url, options = {}, retries = 0) {
/**
 * Normalize a string: decode URI components, strip punctuation, and replace spaces with underscores
*/
function normalizeId(str) {
function normalizeId(str: string): string {
return decodeURIComponent(str)
.normalize('NFD')
.replace(/[,:.\(\)]/g, '')
@@ -117,14 +155,14 @@ function normalizeId(str) {
/**
* Fetch all arcs from One Piece fandom
*/
async function fetchAllArcs() {
async function fetchAllArcs(): Promise<Arc[]> {
try {
const url = `${FANDOM_BASE_URL}/Chapitres_et_Tomes`;
console.log('Fetching arcs list...');
const response = await fetchWithRetry(url);
const data = await response.text();
const $ = cheerio.load(data);
const arcs = [];
const arcs: Arc[] = [];
// Find all arc links in the table
$('table.wikitable td a').each((index, element) => {
@@ -132,7 +170,7 @@ async function fetchAllArcs() {
const href = $(element).attr('href');
// Check if it's an arc link (contains "Arc" and chapter info)
if (text.includes('Arc') && text.includes('Ch.')) {
if (text.includes('Arc') && text.includes('Ch.') && href) {
// Extract arc name and chapter range
// Example text: "Arc Ville d'Orange(Ch.8 à 21)[T.1 à 3]"
console.log(`Processing arc link: ${text} (${href})`);
@@ -164,7 +202,7 @@ async function fetchAllArcs() {
console.log(`Found ${arcs.length} arcs.`);
return arcs;
} catch (error) {
console.error('Error fetching arcs list:', error.message);
console.error('Error fetching arcs list:', (error as Error).message);
return [];
}
}
@@ -172,7 +210,7 @@ async function fetchAllArcs() {
/**
* Save arcs to JSON
*/
async function saveArcsToJSON(arcs) {
async function saveArcsToJSON(arcs: Arc[]): Promise<void> {
const filepath = `${OUTPUT_DIR}/arcs.json`;
fs.writeFileSync(filepath, JSON.stringify(arcs, null, 2));
console.log(`✓ Saved to ${filepath}`);
@@ -181,7 +219,7 @@ async function saveArcsToJSON(arcs) {
/**
* Save arcs to CSV
*/
async function saveArcsToCSV(arcs) {
async function saveArcsToCSV(arcs: Arc[]): Promise<void> {
const filepath = `${OUTPUT_DIR}/arcs.csv`;
const csvWriter = createObjectCsvWriter({
path: filepath,
@@ -211,14 +249,14 @@ async function saveArcsToCSV(arcs) {
/**
 * Fetch all canon characters from One Piece fandom
*/
async function fetchAllCharactersUrl() {
async function fetchAllCharactersUrl(): Promise<CharacterListItem[]> {
try {
const url = `${FANDOM_BASE_URL}/Liste_des_Personnages_Canon`;
console.log('Fetching character list...');
const response = await fetchWithRetry(url);
const data = await response.text();
const $ = cheerio.load(data);
const characters = [];
const characters: CharacterListItem[] = [];
$('table.wikitable tbody tr').each((index, element) => {
if (index === 0) return; // Skip header row
let charpictureUrl = $(element).find('td:nth-child(1) a img').attr('data-src') || $(element).find('td:nth-child(1) a img').attr('src');
@@ -240,7 +278,7 @@ async function fetchAllCharactersUrl() {
characters.push({
name: charName,
url: charUrl,
pictureUrl: charpictureUrl,
pictureUrl: charpictureUrl || null,
chapter: charChapter,
});
}
@@ -248,7 +286,7 @@ async function fetchAllCharactersUrl() {
console.log(`Found ${characters.length} characters.`);
return characters;
} catch (error) {
console.error('Error fetching character list:', error.message);
console.error('Error fetching character list:', (error as Error).message);
return [];
}
}
@@ -256,7 +294,12 @@ async function fetchAllCharactersUrl() {
/**
* Fetch character data from fandom using provided URL
*/
async function fetchCharacter(characterUrl, characterName, characterpictureUrl, characterChapter) {
async function fetchCharacter(
characterUrl: string,
characterName: string,
characterpictureUrl: string | null,
characterChapter: string
): Promise<Character | null> {
try {
console.log(`Fetching: ${characterName}...`);
@@ -269,10 +312,10 @@ async function fetchCharacter(characterUrl, characterName, characterpictureUrl,
let finalCharacterId = normalizeId(characterUrl);
try {
const finalUrl = new URL(response.url);
const characterUrl = finalUrl.pathname.replace('/fr/wiki/', '');
if (characterUrl) {
finalCharacterUrl = characterUrl;
finalCharacterId = normalizeId(characterUrl);
const characterUrlPath = finalUrl.pathname.replace('/fr/wiki/', '');
if (characterUrlPath) {
finalCharacterUrl = characterUrlPath;
finalCharacterId = normalizeId(characterUrlPath);
}
} catch {
// If HTTP is not ok or redirected URL, throw an error to be caught in the outer block
@@ -292,7 +335,7 @@ async function fetchCharacter(characterUrl, characterName, characterpictureUrl,
finalCharacterId = normalizeId(finalCharacterUrl + '_' + name);
// Extract gender from the specific categories link
let gender = null;
let gender: string | null = null;
if ($('.page-header__categories a[title="Catégorie:Personnages Masculins"]').length > 0) {
gender = 'Male';
} else if ($('.page-header__categories a[title="Catégorie:Personnages Féminins"]').length > 0) {
@@ -360,7 +403,7 @@ async function fetchCharacter(characterUrl, characterName, characterpictureUrl,
url: finalCharacterUrl
};
} catch (error) {
console.error(`Error fetching ${characterName}:`, error.message);
console.error(`Error fetching ${characterName}:`, (error as Error).message);
return null;
}
}
@@ -369,7 +412,7 @@ async function fetchCharacter(characterUrl, characterName, characterpictureUrl,
/**
* Extract age from infobox
*/
function extractAge($) {
function extractAge($: cheerio.CheerioAPI): number | null {
const div = $('[data-source="âge"] .pi-data-value');
if (div.length === 0) return null;
@@ -394,7 +437,7 @@ function extractAge($) {
/**
* Extract affiliations from infobox
*/
function extractAffiliations($) {
function extractAffiliations($: cheerio.CheerioAPI): string[] {
const div = $('[data-source="affiliation"] .pi-data-value');
if (div.length === 0) return [];
@@ -420,7 +463,7 @@ function extractAffiliations($) {
* Extract epithets from infobox
* Epithets are always between double quotes
*/
function extractEpithets($) {
function extractEpithets($: cheerio.CheerioAPI): string[] {
const div = $('[data-source="épithète"] .pi-data-value');
if (div.length === 0) return [];
@@ -446,7 +489,7 @@ function extractEpithets($) {
* Extract devil fruit from infobox
* Returns both normalized ID and URL
*/
async function extractDevilFruit($) {
async function extractDevilFruit($: cheerio.CheerioAPI): Promise<DevilFruitData | null> {
const link = $('[data-source="dfnom"] .pi-data-value a').first();
if (link.length === 0) return null;
@@ -473,7 +516,7 @@ async function extractDevilFruit($) {
};
}
} catch (error) {
console.error(`Error fetching devil fruit page: ${error.message}`);
console.error(`Error fetching devil fruit page: ${(error as Error).message}`);
}
// Fallback to the original href
@@ -486,7 +529,7 @@ async function extractDevilFruit($) {
/**
* Extract bounty from infobox
*/
function extractBounty($) {
function extractBounty($: cheerio.CheerioAPI): number | null {
const div = $('[data-source="prime"] .pi-data-value');
if (div.length === 0) return 0;
@@ -511,14 +554,14 @@ function extractBounty($) {
// Remove all non-digits
cleanText = cleanText.replace(/\D/g, '');
return cleanText || 0;
return cleanText ? parseInt(cleanText) : 0;
}
/**
* Extract height from infobox
*/
function extractHeight($) {
function extractHeight($: cheerio.CheerioAPI): number | null {
const div = $('[data-source="taille"] .pi-data-value');
if (div.length === 0) return null;
@@ -539,7 +582,7 @@ function extractHeight($) {
content = text.split('<br>').pop();
}
let cleanText = content.replace(/<[^>]*>/g, '').trim();
let cleanText = (content || '').replace(/<[^>]*>/g, '').trim();
// Remove content with parentheses
cleanText = cleanText.replace(/\([^)]*\)/g, '');
@@ -548,21 +591,21 @@ function extractHeight($) {
const normalized = cleanText.toLowerCase().replace(/\s/g, '');
if (normalized.includes('cm')) {
const digitsOnly = normalized.replace(/\D/g, '');
return digitsOnly || null;
return parseFloat(digitsOnly) || null;
}
if (normalized.includes('m')) {
const parts = normalized.split('m').filter(Boolean);
return parts.length > 0 ? parts.join('.') : null;
return parts.length > 0 ? parseFloat(parts.join('.')) : null;
}
return normalized.replace(/\D/g, '') || null;
return normalized.length > 0 ? parseFloat(normalized.replace(/\D/g, '')) : null;
}
/**
* Extract origin from infobox
*/
function extractOrigin($) {
function extractOrigin($: cheerio.CheerioAPI): string | null {
const div = $('[data-source="origine"] .pi-data-value');
if (div.length === 0) return null;
@@ -585,7 +628,7 @@ function extractOrigin($) {
/**
* Extract status from infobox
*/
function extractStatus($) {
function extractStatus($: cheerio.CheerioAPI): string | null {
const div = $('[data-source="statut"] .pi-data-value');
if (div.length === 0) return null;
@@ -604,7 +647,7 @@ function extractStatus($) {
/**
* Save data to JSON
*/
async function saveToJSON(characters) {
async function saveToJSON(characters: Character[]): Promise<void> {
const filepath = `${OUTPUT_DIR}/characters.json`;
fs.writeFileSync(filepath, JSON.stringify(characters, null, 2));
console.log(`✓ Saved to ${filepath}`);
@@ -613,7 +656,7 @@ async function saveToJSON(characters) {
/**
* Save data to CSV
*/
async function saveToCSV(characters) {
async function saveToCSV(characters: Character[]): Promise<void> {
const filepath = `${OUTPUT_DIR}/characters.csv`;
const csvWriter = createObjectCsvWriter({
path: filepath,
@@ -669,7 +712,7 @@ async function saveToCSV(characters) {
/**
* Fetch devil fruit data from fandom using provided URL
*/
async function fetchDevilFruit(devilFruitUrl, devilFruitId) {
async function fetchDevilFruit(devilFruitUrl: string, devilFruitId: string): Promise<DevilFruit | null> {
try {
console.log(`Fetching devil fruit: ${devilFruitId}...`);
@@ -680,7 +723,7 @@ async function fetchDevilFruit(devilFruitUrl, devilFruitId) {
const name = $('span.mw-page-title-main').text().trim();
// Extract type from label in infobox
let type = null;
let type: string | null = null;
const typeDiv = $('[data-source="type"] .pi-data-value');
if (typeDiv.length > 0) {
const typeText = typeDiv.text().trim().toLowerCase();
@@ -700,7 +743,7 @@ async function fetchDevilFruit(devilFruitUrl, devilFruitId) {
url: devilFruitUrl
};
} catch (error) {
console.error(`Error fetching devil fruit ${devilFruitUrl}:`, error.message);
console.error(`Error fetching devil fruit ${devilFruitUrl}:`, (error as Error).message);
return null;
}
}
@@ -708,7 +751,7 @@ async function fetchDevilFruit(devilFruitUrl, devilFruitId) {
/**
* Save devil fruits to JSON
*/
async function saveDevilFruitsToJSON(devilFruits) {
async function saveDevilFruitsToJSON(devilFruits: DevilFruit[]): Promise<void> {
const filepath = `${OUTPUT_DIR}/devil-fruits.json`;
fs.writeFileSync(filepath, JSON.stringify(devilFruits, null, 2));
console.log(`✓ Saved to ${filepath}`);
@@ -717,7 +760,7 @@ async function saveDevilFruitsToJSON(devilFruits) {
/**
* Save devil fruits to CSV
*/
async function saveDevilFruitsToCSV(devilFruits) {
async function saveDevilFruitsToCSV(devilFruits: DevilFruit[]): Promise<void> {
const filepath = `${OUTPUT_DIR}/devil-fruits.csv`;
const csvWriter = createObjectCsvWriter({
path: filepath,
@@ -745,7 +788,7 @@ async function saveDevilFruitsToCSV(devilFruits) {
/**
* Main execution
*/
async function main() {
async function main(): Promise<void> {
const format = process.argv[2] || 'all'; // json, csv, or all
console.log(`\nOne Piece Scraper - Mode: ${format}\n`);
@@ -787,12 +830,12 @@ async function main() {
return;
}
const characters = [];
const devilFruitUrls = new Set();
let failedCharacters = [...characterList];
const characters: Character[] = [];
const devilFruitUrls = new Set<string>();
let failedCharacters: CharacterListItem[] = [...characterList];
while (failedCharacters.length > 0) {
const nextFailedCharacters = [];
const nextFailedCharacters: CharacterListItem[] = [];
console.log(`\nFetching ${failedCharacters.length} characters...`);
for (let i = 0; i < failedCharacters.length; i++) {
@@ -828,7 +871,7 @@ async function main() {
// Add arc IDs to character data
if (data.firstAppearance) {
const arc = arcsList.find(a => a.startChapter <= parseInt(data.firstAppearance) && (a.endChapter === null || a.endChapter >= parseInt(data.firstAppearance)));
const arc = arcsList.find(a => a.startChapter <= data.firstAppearance && (a.endChapter === null || a.endChapter >= data.firstAppearance));
if (arc) {
data.arcId = arc.id;
}
@@ -857,7 +900,7 @@ async function main() {
if (devilFruitUrls.size === 0) {
console.warn('No devil fruits found from characters, skipping...\n');
} else {
const devilFruits = [];
const devilFruits: DevilFruit[] = [];
const devilFruitUrlArray = Array.from(devilFruitUrls);
for (let i = 0; i < devilFruitUrlArray.length; i++) {
@@ -886,7 +929,7 @@ async function main() {
}
// Update characters with normalized devil fruit IDs
const devilFruitMap = new Map(devilFruits.map(df => [df.id, df.id]));
const devilFruitMap = new Map<string, string>(devilFruits.map(df => [df.id, df.id]));
characters.forEach(char => {
if (char.devilFruitUrl) {
const normalizedId = normalizeId(char.devilFruitUrl);