Refactor code structure for improved readability and maintainability

This commit is contained in:
2026-03-14 15:34:30 +01:00
parent a041a8caf5
commit 66afda5101
2 changed files with 802 additions and 784 deletions

View File

@@ -4,6 +4,8 @@ import { sql, eq } from 'drizzle-orm';
import fs from 'fs';
import { arc, character, devilFruit, characterScrapeValidation, type DevilFruitType } from '../src/lib/server/db/schema';
type Status = 'Alive' | 'Dead' | 'Unknown';
type ArcRecord = {
id: string;
name: string;
@@ -22,9 +24,11 @@ type DevilFruitRecord = {
type CharacterRecord = {
id: string;
name: string;
frName?: string | null;
gender?: string | null;
age?: number | null;
affiliations?: string[] | string | null;
frAffiliations?: string[] | string | null;
devilFruitId?: string | null;
hakiObservation?: boolean;
hakiArmament?: boolean;
@@ -32,12 +36,15 @@ type CharacterRecord = {
bounty?: number | null;
height?: number | null;
origin?: string | null;
frOrigin?: string | null;
firstAppearance?: number;
pictureUrl?: string | null;
epithets?: string[] | string | null;
status?: string | null;
frEpithets?: string[] | string | null;
status?: Status | null;
arcId?: string | null;
url?: string | null;
frUrl?: string | null;
};
const DATABASE_URL = process.env.DATABASE_URL || 'file:local.db';
@@ -86,7 +93,7 @@ function toJsonArray(value: string[] | string | null | undefined): string[] | nu
function toDevilFruitType(value: DevilFruitType | string | null | undefined): DevilFruitType | null {
if (!value) return null;
if (value === 'Paramecia' || value === 'Zoan' || value === 'Logia' || value === 'Unknown') {
if (value === 'Paramecia' || value === 'Zoan' || value === 'Logia' || value === 'Smile' || value === 'Unknown') {
return value;
}
return 'Unknown';
@@ -115,59 +122,25 @@ function transformCharacterData(item: CharacterRecord) {
gender: toNullable(item.gender),
age: toNullable(item.age),
affiliations: toJsonArray(item.affiliations),
frAffiliations: toJsonArray(item.frAffiliations),
devilFruitId: toNullable(item.devilFruitId),
hakiObservation: !!item.hakiObservation,
hakiArmament: !!item.hakiArmament,
hakiConqueror: !!item.hakiConqueror,
bounty: item.bounty ?? 0,
height: toNumber(item.height as any),
height: toNumber(item.height as string | number | null),
origin: toNullable(item.origin),
frOrigin: toNullable(item.frOrigin),
firstAppearance: item.firstAppearance ?? 0,
pictureUrl: toNullable(item.pictureUrl),
epithets: toJsonArray(item.epithets),
frEpithets: toJsonArray(item.frEpithets),
status: toNullable(item.status),
arcId: toNullable(item.arcId),
url: toNullable(item.url)
};
}
/**
 * Decide whether a character record built from JSON differs from its database row.
 *
 * A missing database row always counts as a change. Otherwise every key of
 * `jsonData` is compared in rendered (string) form and each mismatch is printed
 * to the console for debugging. The returned verdict is independent of that
 * debug pass: it only looks at the fixed list of tracked fields below.
 *
 * @param jsonData Record derived from the JSON input; its `id` appears in the debug output.
 * @param dbData Row currently stored for that record, or a falsy value when there is none.
 * @returns `true` when `dbData` is falsy or at least one tracked field differs.
 */
function hasChanged(jsonData: any, dbData: any): boolean {
  if (!dbData) {
    return true;
  }

  // Objects (including arrays and null) are rendered as JSON, everything else via String().
  const render = (value: any): string =>
    typeof value === 'object' ? JSON.stringify(value) : String(value);

  // Debug pass: report every key whose rendered value is not identical.
  for (const field in jsonData) {
    const fromJson = render(jsonData[field]);
    const fromDb = render(dbData[field]);
    if (fromJson === fromDb) {
      continue;
    }
    console.log(`\nField "${field}" changed for character ID ${jsonData.id}:`);
    console.log(` JSON: ${fromJson}`);
    console.log(` DB: ${fromDb}`);
  }

  // Fields that take part in the verdict, in evaluation order.
  const trackedFields = [
    'name',
    'gender',
    'age',
    'affiliations',
    'devilFruitId',
    'hakiObservation',
    'hakiArmament',
    'hakiConqueror',
    'bounty',
    'height',
    'origin',
    'firstAppearance',
    'pictureUrl',
    'epithets',
    'status',
    'arcId',
    'url'
  ];
  // These two are compared through their JSON serialisation instead of by value.
  const serialisedFields = new Set(['affiliations', 'epithets']);

  return trackedFields.some((field) => {
    const left = jsonData[field];
    const right = dbData[field];
    if (serialisedFields.has(field)) {
      return JSON.stringify(left) != JSON.stringify(right);
    }
    // Loose inequality: e.g. null/undefined, or 1/true, are treated as equal here.
    return left != right;
  });
}
async function isCharacterTableEmpty(): Promise<boolean> {
const result = await db.select({ count: sql<number>`COUNT(*)` }).from(character);
return result[0]?.count === 0;

View File

@@ -57,14 +57,15 @@ interface DevilFruit {
url: string;
}
// Base URLs of the English and French wiki `parse` endpoints; callers append the page name.
const FANDOM_API_BASE =
  'https://onepiece.fandom.com/api.php?action=parse&redirects=true&format=json&page=';
const FR_FANDOM_API_BASE =
  'https://onepiece.fandom.com/fr/api.php?action=parse&redirects=true&format=json&page=';
// Directory for scraper output; it is created right below when missing.
const OUTPUT_DIR = './scraped-data';
const MAX_RETRIES = 0; // Set to 0 to disable retries, can be increased if needed
// Delay in milliseconds before the first retry; it is doubled for each further attempt.
const INITIAL_RETRY_DELAY = 1000;
// NOTE(review): presumably the number of requests issued in parallel; usage is not visible here, confirm.
const FETCH_CONCURRENCY = 50;
// Create output directory
if (!fs.existsSync(OUTPUT_DIR)) {
fs.mkdirSync(OUTPUT_DIR, { recursive: true });
@@ -73,13 +74,17 @@ if (!fs.existsSync(OUTPUT_DIR)) {
/**
* Retry a fetch request with exponential backoff
*/
async function fetchWithRetry(url: string, options: RequestInit = {}, retries: number = 0): Promise<Response> {
async function fetchWithRetry(
url: string,
options: RequestInit = {},
retries: number = 0
): Promise<Response> {
try {
const headers: Record<string, string> = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:150.0) Firefox/150.0',
'Accept-Language': 'en-US,en;q=0.9',
'Accept-Encoding': 'gzip, deflate, br',
'Connection': 'keep-alive',
Connection: 'keep-alive',
...((options.headers as Record<string, string>) || {})
};
@@ -97,7 +102,7 @@ async function fetchWithRetry(url: string, options: RequestInit = {}, retries: n
if (retries < MAX_RETRIES) {
const delay = INITIAL_RETRY_DELAY * Math.pow(2, retries);
console.log(`⚠️ HTTP ${response.status} for ${url}, retrying in ${delay}ms...`);
await new Promise(resolve => setTimeout(resolve, delay));
await new Promise((resolve) => setTimeout(resolve, delay));
return fetchWithRetry(url, options, retries + 1);
}
@@ -108,7 +113,7 @@ async function fetchWithRetry(url: string, options: RequestInit = {}, retries: n
if (retries < MAX_RETRIES) {
const delay = INITIAL_RETRY_DELAY * Math.pow(2, retries);
console.log(`⚠️ Network error: ${(error as Error).message}, retrying in ${delay}ms...`);
await new Promise(resolve => setTimeout(resolve, delay));
await new Promise((resolve) => setTimeout(resolve, delay));
return fetchWithRetry(url, options, retries + 1);
}
@@ -123,11 +128,12 @@ async function fetchWithRetry(url: string, options: RequestInit = {}, retries: n
/**
 * Pick the French entry out of a list of language links (the `parse.langlinks`
 * array of a wiki API response).
 *
 * @param links Language-link entries; `lang` holds the language code and `url` the link target.
 * @returns An object carrying the `url` of the first entry whose `lang` is `'fr'`,
 *          or `null` when the list contains no French entry.
 */
function getFrLink(links: { lang: string; ['*']: string; url: string }[]): { url: string } | null {
  // The element type is inferred from `links`, so the callback needs no annotation.
  const frLink = links.find((link) => link.lang === 'fr');
  return frLink ? { url: frLink.url } : null;
}
/**
* Normalize string by decoding URI components, punctuation, and replacing spaces with underscores
*/
@@ -190,9 +196,7 @@ async function fetchAllArcs(): Promise<Arc[]> {
}
const startChapter = parseInt(chapterMatch[1], 10);
const endChapter = /current/i.test(chapterMatch[2])
? null
: parseInt(chapterMatch[2], 10);
const endChapter = /current/i.test(chapterMatch[2]) ? null : parseInt(chapterMatch[2], 10);
let arcId = normalizeId(cleanUrl);
arcId = arcId.replace(/_arc$/i, '');
@@ -200,7 +204,10 @@ async function fetchAllArcs(): Promise<Arc[]> {
// Query the href page via API to get the correct HTML content (in case of redirect) and extract the French name from there
const arcResponse = await fetchWithRetry(`${FANDOM_API_BASE}${cleanUrl}`);
const arcJsonData = await arcResponse.json();
let frArcName: string | null = arcJsonData.parse?.langlinks.find((link: { lang: string; ['*']: string }) => link.lang === 'fr')?.['*'] || null;
let frArcName: string | null =
arcJsonData.parse?.langlinks.find(
(link: { lang: string; ['*']: string }) => link.lang === 'fr'
)?.['*'] || null;
// Remove "Arc" suffix from French name if present to keep it consistent with English names (e.g. "Arc de Luffy" becomes "Luffy")
if (frArcName && /\bArc\b/i.test(frArcName)) {
@@ -250,7 +257,7 @@ async function saveArcsToCSV(arcs: Arc[]): Promise<void> {
{ id: 'startChapter', title: 'Start Chapter' },
{ id: 'endChapter', title: 'End Chapter' },
{ id: 'url', title: 'URL' }
],
]
});
const records = arcs
@@ -329,7 +336,7 @@ async function fetchCharacter(
characterUrl: string,
characterName: string,
characterChapter: number,
arcsList: Arc[],
arcsList: Arc[]
): Promise<Character | null> {
try {
console.log(`Fetching: ${characterName}...`);
@@ -371,7 +378,7 @@ async function fetchCharacter(
const age = extractAge($);
// Extract affiliations
const affiliations = extractAffiliations($);
const affiliations = await extractAffiliations($, 'en');
// Extract epithets
const epithets = extractEpithets($);
@@ -412,7 +419,11 @@ async function fetchCharacter(
const status = extractStatus($);
let arcId = '';
const arc = arcsList.find(a => a.startChapter <= firstAppearance && (a.endChapter === null || a.endChapter >= firstAppearance));
const arc = arcsList.find(
(a) =>
a.startChapter <= firstAppearance &&
(a.endChapter === null || a.endChapter >= firstAppearance)
);
if (!arc) {
return null;
}
@@ -420,16 +431,27 @@ async function fetchCharacter(
const frLink = getFrLink(jsonData.parse?.langlinks || []);
const frUrl = frLink ? frLink.url.replace('https://onepiece.fandom.com/fr/wiki/', '') : null;
const frjsonData = frUrl ? await fetchWithRetry(`${FR_FANDOM_API_BASE}${frUrl}`).then(res => res.json()) : null;
const frjsonData = frUrl
? await fetchWithRetry(`${FR_FANDOM_API_BASE}${frUrl}`).then((res) => res.json())
: null;
const frName = frjsonData?.parse?.title || null;
let frName = frjsonData?.parse?.title || null;
const frAffiliations = frjsonData ? extractAffiliations(cheerio.load(frjsonData.parse?.text?.['*'] || '')) : null;
const frAffiliations = frjsonData
? await extractAffiliations(cheerio.load(frjsonData.parse?.text?.['*'] || ''), 'fr')
: null;
const frEpithets = frjsonData ? extractEpithets(cheerio.load(frjsonData.parse?.text?.['*'] || '')) : null;
const frEpithets = frjsonData
? extractEpithets(cheerio.load(frjsonData.parse?.text?.['*'] || ''))
: null;
const frOrigin = frjsonData ? extractOrigin(cheerio.load(frjsonData.parse?.text?.['*'] || '')) : null;
const frOrigin = frjsonData
? extractOrigin(cheerio.load(frjsonData.parse?.text?.['*'] || ''))
: null;
if (name !== jsonData.parse?.title) {
frName = name;
}
return {
id: finalCharacterId,
@@ -453,7 +475,7 @@ async function fetchCharacter(
firstAppearance,
arcId,
status,
pictureUrl: "Image_Non_Disponible",
pictureUrl: 'Image_Non_Disponible',
url: characterUrl,
frUrl
};
@@ -463,7 +485,6 @@ async function fetchCharacter(
}
}
/**
* Extract age from infobox
*/
@@ -492,7 +513,7 @@ function extractAge($: cheerio.CheerioAPI): number | null {
/**
* Extract affiliations from infobox
*/
function extractAffiliations($: cheerio.CheerioAPI): string[] {
async function extractAffiliations($: cheerio.CheerioAPI, lang: string): Promise<string[]> {
const div = $('[data-source="affiliation"] .pi-data-value');
if (div.length === 0) return [];
@@ -502,10 +523,31 @@ function extractAffiliations($: cheerio.CheerioAPI): string[] {
const text = cleanedDiv.html();
if (!text) return [];
// Extract all link values
const linkValues = cleanedDiv.find('a').map((_i, el) => $(el).text().trim()).get();
if (linkValues.length > 0) {
return linkValues;
// Resolve affiliations from linked page titles.
const links = cleanedDiv.find('a').toArray();
if (links.length > 0) {
const linkValues = await Promise.all(
links.map(async (el) => {
const href = $(el).attr('href') || '';
const resolvedTitle = await fetchWithRetry(
`${lang === 'fr' ? FR_FANDOM_API_BASE : FANDOM_API_BASE}${href.replace('/fr/wiki/', '').replace('/wiki/', '')}`
)
.then((res) => res.json())
.then((json) => json.parse?.title)
.catch(() => null);
if (resolvedTitle) {
return resolvedTitle;
}
return $(el).text().trim();
})
);
const uniqueLinks = Array.from(new Set(linkValues.filter(Boolean)));
if (uniqueLinks.length > 0) {
return uniqueLinks;
}
}
// Fallback to parsing text
@@ -528,9 +570,7 @@ function extractEpithets($: cheerio.CheerioAPI): string[] {
const html = cleanedDiv.html();
if (!html) return [];
const plainText = html
.replace(/<br\s*\/?\s*>/gi, '\n')
.replace(/<[^>]*>/g, '');
const plainText = html.replace(/<br\s*\/?\s*>/gi, '\n').replace(/<[^>]*>/g, '');
const lines = plainText
.split('\n')
@@ -571,14 +611,12 @@ async function extractDevilFruit($: cheerio.CheerioAPI): Promise<DevilFruitData
const cleanUrl = href.replace('/wiki/', '');
return {
devilFruitId: normalizeId(cleanUrl),
devilFruitUrl: cleanUrl
};
}
/**
* Extract bounty from infobox
*/
@@ -634,7 +672,8 @@ function extractHeight($: cheerio.CheerioAPI): number | null {
// Keep only lines that look like a height value, then pick the latest one.
const heightLines = lines.filter((line) => /\d/.test(line) && /(cm|m)/i.test(line));
const latestLine = heightLines.length > 0 ? heightLines[heightLines.length - 1] : lines[lines.length - 1];
const latestLine =
heightLines.length > 0 ? heightLines[heightLines.length - 1] : lines[lines.length - 1];
if (!latestLine) return null;
// Remove descriptive suffixes like "(post-timeskip)".
@@ -661,7 +700,9 @@ function extractHeight($: cheerio.CheerioAPI): number | null {
* Extract origin from infobox
*/
function extractOrigin($: cheerio.CheerioAPI): string | null {
const div = $('[data-source="origin"] .pi-data-value, [data-source="origine"] .pi-data-value').first();
const div = $(
'[data-source="origin"] .pi-data-value, [data-source="origine"] .pi-data-value'
).first();
if (div.length === 0) return null;
let text = div.html();
@@ -700,7 +741,6 @@ function extractStatus($: cheerio.CheerioAPI): string | null {
return 'Alive';
}
/**
* Save data to JSON
*/
@@ -736,7 +776,7 @@ async function saveToCSV(characters: Character[]): Promise<void> {
{ id: 'arcId', title: 'Arc ID' },
{ id: 'pictureUrl', title: 'Image URL' },
{ id: 'url', title: 'Fandom URL' }
],
]
});
const records = characters
@@ -749,9 +789,11 @@ async function saveToCSV(characters: Character[]): Promise<void> {
height: c.height || '',
origin: c.origin || '',
status: c.status || '',
epithets: Array.isArray(c.epithets) ? c.epithets.join(', ') : (c.epithets || ''),
epithets: Array.isArray(c.epithets) ? c.epithets.join(', ') : c.epithets || '',
devilFruitId: c.devilFruitId || '',
affiliations: Array.isArray(c.affiliations) ? c.affiliations.join(', ') : (c.affiliations || ''),
affiliations: Array.isArray(c.affiliations)
? c.affiliations.join(', ')
: c.affiliations || '',
bounty: c.bounty ?? 0,
hakiObservation: c.hakiObservation ? 1 : 0,
hakiArmament: c.hakiArmament ? 1 : 0,
@@ -769,7 +811,10 @@ async function saveToCSV(characters: Character[]): Promise<void> {
/**
* Fetch devil fruit data from fandom using provided URL
*/
async function fetchDevilFruit(devilFruitUrl: string, devilFruitId: string): Promise<DevilFruit | null> {
async function fetchDevilFruit(
devilFruitUrl: string,
devilFruitId: string
): Promise<DevilFruit | null> {
try {
console.log(`Fetching devil fruit: ${devilFruitUrl}...`);
@@ -789,8 +834,9 @@ async function fetchDevilFruit(devilFruitUrl: string, devilFruitId: string): Pro
let type: string | null = null;
// Determine type based on categories (if categories contain "Paramecia", "Zoan", "Logia" or "Smile")
if (jsonData.parse?.categories) {
const categories = jsonData.parse.categories
.map((cat: { ['*']: string }) => String(cat['*'] || '').toLowerCase());
const categories = jsonData.parse.categories.map((cat: { ['*']: string }) =>
String(cat['*'] || '').toLowerCase()
);
if (categories.some((category: string) => category.includes('paramecia'))) {
type = 'Paramecia';
@@ -836,7 +882,7 @@ async function saveDevilFruitsToCSV(devilFruits: DevilFruit[]): Promise<void> {
{ id: 'name', title: 'Name' },
{ id: 'type', title: 'Type' },
{ id: 'url', title: 'URL' }
],
]
});
const records = devilFruits
@@ -942,7 +988,6 @@ async function main(): Promise<void> {
devilFruitUrls.add(data.devilFruitUrl);
}
characters.push(data);
} else {
nextFailedCharacters.push(char);
@@ -1001,8 +1046,8 @@ async function main(): Promise<void> {
}
// Update characters with normalized devil fruit IDs
const devilFruitMap = new Map<string, string>(devilFruits.map(df => [df.id, df.id]));
characters.forEach(char => {
const devilFruitMap = new Map<string, string>(devilFruits.map((df) => [df.id, df.id]));
characters.forEach((char) => {
if (char.devilFruitUrl) {
const normalizedId = normalizeId(char.devilFruitUrl);
char.devilFruitId = devilFruitMap.get(normalizedId) || normalizedId;