Refactor code structure for improved readability and maintainability
This commit is contained in:
@@ -4,6 +4,8 @@ import { sql, eq } from 'drizzle-orm';
|
||||
import fs from 'fs';
|
||||
import { arc, character, devilFruit, characterScrapeValidation, type DevilFruitType } from '../src/lib/server/db/schema';
|
||||
|
||||
// Allowed values for a character's `status` field.
type Status = 'Alive' | 'Dead' | 'Unknown';
|
||||
|
||||
type ArcRecord = {
|
||||
id: string;
|
||||
name: string;
|
||||
@@ -22,9 +24,11 @@ type DevilFruitRecord = {
|
||||
type CharacterRecord = {
|
||||
id: string;
|
||||
name: string;
|
||||
frName?: string | null;
|
||||
gender?: string | null;
|
||||
age?: number | null;
|
||||
affiliations?: string[] | string | null;
|
||||
frAffiliations?: string[] | string | null;
|
||||
devilFruitId?: string | null;
|
||||
hakiObservation?: boolean;
|
||||
hakiArmament?: boolean;
|
||||
@@ -32,12 +36,15 @@ type CharacterRecord = {
|
||||
bounty?: number | null;
|
||||
height?: number | null;
|
||||
origin?: string | null;
|
||||
frOrigin?: string | null;
|
||||
firstAppearance?: number;
|
||||
pictureUrl?: string | null;
|
||||
epithets?: string[] | string | null;
|
||||
status?: string | null;
|
||||
frEpithets?: string[] | string | null;
|
||||
status?: Status | null;
|
||||
arcId?: string | null;
|
||||
url?: string | null;
|
||||
frUrl?: string | null;
|
||||
};
|
||||
|
||||
// Database connection string, taken from the DATABASE_URL environment variable.
// Falls back to 'file:local.db' when the variable is unset or empty.
const DATABASE_URL = process.env.DATABASE_URL || 'file:local.db';
|
||||
@@ -86,7 +93,7 @@ function toJsonArray(value: string[] | string | null | undefined): string[] | nu
|
||||
|
||||
function toDevilFruitType(value: DevilFruitType | string | null | undefined): DevilFruitType | null {
|
||||
if (!value) return null;
|
||||
if (value === 'Paramecia' || value === 'Zoan' || value === 'Logia' || value === 'Unknown') {
|
||||
if (value === 'Paramecia' || value === 'Zoan' || value === 'Logia' || value === 'Smile' || value === 'Unknown') {
|
||||
return value;
|
||||
}
|
||||
return 'Unknown';
|
||||
@@ -115,59 +122,25 @@ function transformCharacterData(item: CharacterRecord) {
|
||||
gender: toNullable(item.gender),
|
||||
age: toNullable(item.age),
|
||||
affiliations: toJsonArray(item.affiliations),
|
||||
frAffiliations: toJsonArray(item.frAffiliations),
|
||||
devilFruitId: toNullable(item.devilFruitId),
|
||||
hakiObservation: !!item.hakiObservation,
|
||||
hakiArmament: !!item.hakiArmament,
|
||||
hakiConqueror: !!item.hakiConqueror,
|
||||
bounty: item.bounty ?? 0,
|
||||
height: toNumber(item.height as any),
|
||||
height: toNumber(item.height as string | number | null),
|
||||
origin: toNullable(item.origin),
|
||||
frOrigin: toNullable(item.frOrigin),
|
||||
firstAppearance: item.firstAppearance ?? 0,
|
||||
pictureUrl: toNullable(item.pictureUrl),
|
||||
epithets: toJsonArray(item.epithets),
|
||||
frEpithets: toJsonArray(item.frEpithets),
|
||||
status: toNullable(item.status),
|
||||
arcId: toNullable(item.arcId),
|
||||
url: toNullable(item.url)
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * Report whether a character record built from JSON differs from its database row.
 *
 * Only the fields listed in `comparedFields` take part in the decision. Every compared
 * field that differs is also printed to the console for debugging, so the log always
 * agrees with the returned value.
 *
 * @param jsonData - Character record from the JSON data; `jsonData.id` is used in the log output.
 * @param dbData - Matching database row, or a falsy value when no row exists.
 * @returns `true` when there is no database row or at least one compared field differs.
 */
function hasChanged(jsonData: any, dbData: any): boolean {
  // No stored row: the record counts as changed.
  if (!dbData) return true;

  // Each entry is [field name, compare as list]. List fields are compared through their
  // JSON serialisation; all other fields use loose inequality, so `null` equals
  // `undefined` and `true` equals `1`.
  // NOTE(review): frAffiliations, frOrigin and frEpithets are not compared here — confirm
  // whether changes to them should count.
  const comparedFields: ReadonlyArray<readonly [string, boolean]> = [
    ['name', false],
    ['gender', false],
    ['age', false],
    ['affiliations', true],
    ['devilFruitId', false],
    ['hakiObservation', false],
    ['hakiArmament', false],
    ['hakiConqueror', false],
    ['bounty', false],
    ['height', false],
    ['origin', false],
    ['firstAppearance', false],
    ['pictureUrl', false],
    ['epithets', true],
    ['status', false],
    ['arcId', false],
    ['url', false]
  ];

  // Render a value for the debug log: objects (including arrays and null) as JSON.
  const display = (value: unknown): string =>
    typeof value === 'object' ? JSON.stringify(value) : String(value);

  let changed = false;
  for (const [key, isList] of comparedFields) {
    const jsonValue = jsonData[key];
    const dbValue = dbData[key];
    const differs = isList
      ? JSON.stringify(jsonValue) !== JSON.stringify(dbValue)
      : jsonValue != dbValue;
    if (!differs) continue;

    changed = true;
    console.log(`\nField "${key}" changed for character ID ${jsonData.id}:`);
    console.log(` JSON: ${display(jsonValue)}`);
    console.log(` DB: ${display(dbValue)}`);
  }

  return changed;
}
|
||||
|
||||
async function isCharacterTableEmpty(): Promise<boolean> {
|
||||
const result = await db.select({ count: sql<number>`COUNT(*)` }).from(character);
|
||||
return result[0]?.count === 0;
|
||||
|
||||
@@ -57,14 +57,15 @@ interface DevilFruit {
|
||||
url: string;
|
||||
}
|
||||
|
||||
// Base URL of the English wiki's `action=parse` API; the page title is appended to it.
const FANDOM_API_BASE =
  'https://onepiece.fandom.com/api.php?action=parse&redirects=true&format=json&page=';
// Same endpoint on the French wiki (`/fr/`).
const FR_FANDOM_API_BASE =
  'https://onepiece.fandom.com/fr/api.php?action=parse&redirects=true&format=json&page=';
// Directory that receives the scraped output; it is created at startup when missing.
const OUTPUT_DIR = './scraped-data';
const MAX_RETRIES = 0; // Set to 0 to disable retries, can be increased if needed
// Delay in milliseconds before the first retry; it doubles with each further retry.
const INITIAL_RETRY_DELAY = 1000;
// NOTE(review): presumably the number of fetches allowed in flight at once — usage is not visible here.
const FETCH_CONCURRENCY = 50;
|
||||
|
||||
|
||||
// Create output directory
|
||||
if (!fs.existsSync(OUTPUT_DIR)) {
|
||||
fs.mkdirSync(OUTPUT_DIR, { recursive: true });
|
||||
@@ -73,13 +74,17 @@ if (!fs.existsSync(OUTPUT_DIR)) {
|
||||
/**
|
||||
* Retry a fetch request with exponential backoff
|
||||
*/
|
||||
async function fetchWithRetry(url: string, options: RequestInit = {}, retries: number = 0): Promise<Response> {
|
||||
async function fetchWithRetry(
|
||||
url: string,
|
||||
options: RequestInit = {},
|
||||
retries: number = 0
|
||||
): Promise<Response> {
|
||||
try {
|
||||
const headers: Record<string, string> = {
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:150.0) Firefox/150.0',
|
||||
'Accept-Language': 'en-US,en;q=0.9',
|
||||
'Accept-Encoding': 'gzip, deflate, br',
|
||||
'Connection': 'keep-alive',
|
||||
Connection: 'keep-alive',
|
||||
...((options.headers as Record<string, string>) || {})
|
||||
};
|
||||
|
||||
@@ -97,7 +102,7 @@ async function fetchWithRetry(url: string, options: RequestInit = {}, retries: n
|
||||
if (retries < MAX_RETRIES) {
|
||||
const delay = INITIAL_RETRY_DELAY * Math.pow(2, retries);
|
||||
console.log(`⚠️ HTTP ${response.status} for ${url}, retrying in ${delay}ms...`);
|
||||
await new Promise(resolve => setTimeout(resolve, delay));
|
||||
await new Promise((resolve) => setTimeout(resolve, delay));
|
||||
return fetchWithRetry(url, options, retries + 1);
|
||||
}
|
||||
|
||||
@@ -108,7 +113,7 @@ async function fetchWithRetry(url: string, options: RequestInit = {}, retries: n
|
||||
if (retries < MAX_RETRIES) {
|
||||
const delay = INITIAL_RETRY_DELAY * Math.pow(2, retries);
|
||||
console.log(`⚠️ Network error: ${(error as Error).message}, retrying in ${delay}ms...`);
|
||||
await new Promise(resolve => setTimeout(resolve, delay));
|
||||
await new Promise((resolve) => setTimeout(resolve, delay));
|
||||
return fetchWithRetry(url, options, retries + 1);
|
||||
}
|
||||
|
||||
@@ -123,11 +128,12 @@ async function fetchWithRetry(url: string, options: RequestInit = {}, retries: n
|
||||
|
||||
/**
 * Pick the French entry out of a page's `parse.langlinks` list.
 *
 * @param links - Language links of a page; each entry carries its language code in `lang`
 *   and the target address in `url`.
 * @returns An object holding the `url` of the first entry whose `lang` is `'fr'`,
 *   or `null` when the list has no French entry.
 */
function getFrLink(links: { lang: string; ['*']: string; url: string }[]): { url: string } | null {
  const frLink = links.find((link) => link.lang === 'fr');
  return frLink ? { url: frLink.url } : null;
}
|
||||
|
||||
|
||||
/**
|
||||
* Normalize string by decoding URI components, punctuation, and replacing spaces with underscores
|
||||
*/
|
||||
@@ -190,9 +196,7 @@ async function fetchAllArcs(): Promise<Arc[]> {
|
||||
}
|
||||
|
||||
const startChapter = parseInt(chapterMatch[1], 10);
|
||||
const endChapter = /current/i.test(chapterMatch[2])
|
||||
? null
|
||||
: parseInt(chapterMatch[2], 10);
|
||||
const endChapter = /current/i.test(chapterMatch[2]) ? null : parseInt(chapterMatch[2], 10);
|
||||
|
||||
let arcId = normalizeId(cleanUrl);
|
||||
arcId = arcId.replace(/_arc$/i, '');
|
||||
@@ -200,7 +204,10 @@ async function fetchAllArcs(): Promise<Arc[]> {
|
||||
// Query the href page via API to get the correct HTML content (in case of redirect) and extract the French name from there
|
||||
const arcResponse = await fetchWithRetry(`${FANDOM_API_BASE}${cleanUrl}`);
|
||||
const arcJsonData = await arcResponse.json();
|
||||
let frArcName: string | null = arcJsonData.parse?.langlinks.find((link: { lang: string; ['*']: string }) => link.lang === 'fr')?.['*'] || null;
|
||||
let frArcName: string | null =
|
||||
arcJsonData.parse?.langlinks.find(
|
||||
(link: { lang: string; ['*']: string }) => link.lang === 'fr'
|
||||
)?.['*'] || null;
|
||||
|
||||
// Remove "Arc" suffix from French name if present to keep it consistent with English names (e.g. "Arc de Luffy" becomes "Luffy")
|
||||
if (frArcName && /\bArc\b/i.test(frArcName)) {
|
||||
@@ -250,7 +257,7 @@ async function saveArcsToCSV(arcs: Arc[]): Promise<void> {
|
||||
{ id: 'startChapter', title: 'Start Chapter' },
|
||||
{ id: 'endChapter', title: 'End Chapter' },
|
||||
{ id: 'url', title: 'URL' }
|
||||
],
|
||||
]
|
||||
});
|
||||
|
||||
const records = arcs
|
||||
@@ -329,7 +336,7 @@ async function fetchCharacter(
|
||||
characterUrl: string,
|
||||
characterName: string,
|
||||
characterChapter: number,
|
||||
arcsList: Arc[],
|
||||
arcsList: Arc[]
|
||||
): Promise<Character | null> {
|
||||
try {
|
||||
console.log(`Fetching: ${characterName}...`);
|
||||
@@ -371,7 +378,7 @@ async function fetchCharacter(
|
||||
const age = extractAge($);
|
||||
|
||||
// Extract affiliations
|
||||
const affiliations = extractAffiliations($);
|
||||
const affiliations = await extractAffiliations($, 'en');
|
||||
|
||||
// Extract epithets
|
||||
const epithets = extractEpithets($);
|
||||
@@ -412,7 +419,11 @@ async function fetchCharacter(
|
||||
const status = extractStatus($);
|
||||
|
||||
let arcId = '';
|
||||
const arc = arcsList.find(a => a.startChapter <= firstAppearance && (a.endChapter === null || a.endChapter >= firstAppearance));
|
||||
const arc = arcsList.find(
|
||||
(a) =>
|
||||
a.startChapter <= firstAppearance &&
|
||||
(a.endChapter === null || a.endChapter >= firstAppearance)
|
||||
);
|
||||
if (!arc) {
|
||||
return null;
|
||||
}
|
||||
@@ -420,16 +431,27 @@ async function fetchCharacter(
|
||||
|
||||
const frLink = getFrLink(jsonData.parse?.langlinks || []);
|
||||
const frUrl = frLink ? frLink.url.replace('https://onepiece.fandom.com/fr/wiki/', '') : null;
|
||||
const frjsonData = frUrl ? await fetchWithRetry(`${FR_FANDOM_API_BASE}${frUrl}`).then(res => res.json()) : null;
|
||||
const frjsonData = frUrl
|
||||
? await fetchWithRetry(`${FR_FANDOM_API_BASE}${frUrl}`).then((res) => res.json())
|
||||
: null;
|
||||
|
||||
const frName = frjsonData?.parse?.title || null;
|
||||
let frName = frjsonData?.parse?.title || null;
|
||||
|
||||
const frAffiliations = frjsonData ? extractAffiliations(cheerio.load(frjsonData.parse?.text?.['*'] || '')) : null;
|
||||
const frAffiliations = frjsonData
|
||||
? await extractAffiliations(cheerio.load(frjsonData.parse?.text?.['*'] || ''), 'fr')
|
||||
: null;
|
||||
|
||||
const frEpithets = frjsonData ? extractEpithets(cheerio.load(frjsonData.parse?.text?.['*'] || '')) : null;
|
||||
const frEpithets = frjsonData
|
||||
? extractEpithets(cheerio.load(frjsonData.parse?.text?.['*'] || ''))
|
||||
: null;
|
||||
|
||||
const frOrigin = frjsonData ? extractOrigin(cheerio.load(frjsonData.parse?.text?.['*'] || '')) : null;
|
||||
const frOrigin = frjsonData
|
||||
? extractOrigin(cheerio.load(frjsonData.parse?.text?.['*'] || ''))
|
||||
: null;
|
||||
|
||||
if (name !== jsonData.parse?.title) {
|
||||
frName = name;
|
||||
}
|
||||
|
||||
return {
|
||||
id: finalCharacterId,
|
||||
@@ -453,7 +475,7 @@ async function fetchCharacter(
|
||||
firstAppearance,
|
||||
arcId,
|
||||
status,
|
||||
pictureUrl: "Image_Non_Disponible",
|
||||
pictureUrl: 'Image_Non_Disponible',
|
||||
url: characterUrl,
|
||||
frUrl
|
||||
};
|
||||
@@ -463,7 +485,6 @@ async function fetchCharacter(
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Extract age from infobox
|
||||
*/
|
||||
@@ -492,7 +513,7 @@ function extractAge($: cheerio.CheerioAPI): number | null {
|
||||
/**
|
||||
* Extract affiliations from infobox
|
||||
*/
|
||||
function extractAffiliations($: cheerio.CheerioAPI): string[] {
|
||||
async function extractAffiliations($: cheerio.CheerioAPI, lang: string): Promise<string[]> {
|
||||
const div = $('[data-source="affiliation"] .pi-data-value');
|
||||
if (div.length === 0) return [];
|
||||
|
||||
@@ -502,10 +523,31 @@ function extractAffiliations($: cheerio.CheerioAPI): string[] {
|
||||
const text = cleanedDiv.html();
|
||||
if (!text) return [];
|
||||
|
||||
// Extract all link values
|
||||
const linkValues = cleanedDiv.find('a').map((_i, el) => $(el).text().trim()).get();
|
||||
if (linkValues.length > 0) {
|
||||
return linkValues;
|
||||
// Resolve affiliations from linked page titles.
|
||||
const links = cleanedDiv.find('a').toArray();
|
||||
if (links.length > 0) {
|
||||
const linkValues = await Promise.all(
|
||||
links.map(async (el) => {
|
||||
const href = $(el).attr('href') || '';
|
||||
const resolvedTitle = await fetchWithRetry(
|
||||
`${lang === 'fr' ? FR_FANDOM_API_BASE : FANDOM_API_BASE}${href.replace('/fr/wiki/', '').replace('/wiki/', '')}`
|
||||
)
|
||||
.then((res) => res.json())
|
||||
.then((json) => json.parse?.title)
|
||||
.catch(() => null);
|
||||
|
||||
if (resolvedTitle) {
|
||||
return resolvedTitle;
|
||||
}
|
||||
|
||||
return $(el).text().trim();
|
||||
})
|
||||
);
|
||||
|
||||
const uniqueLinks = Array.from(new Set(linkValues.filter(Boolean)));
|
||||
if (uniqueLinks.length > 0) {
|
||||
return uniqueLinks;
|
||||
}
|
||||
}
|
||||
|
||||
// Fallback to parsing text
|
||||
@@ -528,9 +570,7 @@ function extractEpithets($: cheerio.CheerioAPI): string[] {
|
||||
const html = cleanedDiv.html();
|
||||
if (!html) return [];
|
||||
|
||||
const plainText = html
|
||||
.replace(/<br\s*\/?\s*>/gi, '\n')
|
||||
.replace(/<[^>]*>/g, '');
|
||||
const plainText = html.replace(/<br\s*\/?\s*>/gi, '\n').replace(/<[^>]*>/g, '');
|
||||
|
||||
const lines = plainText
|
||||
.split('\n')
|
||||
@@ -571,14 +611,12 @@ async function extractDevilFruit($: cheerio.CheerioAPI): Promise<DevilFruitData
|
||||
|
||||
const cleanUrl = href.replace('/wiki/', '');
|
||||
|
||||
|
||||
return {
|
||||
devilFruitId: normalizeId(cleanUrl),
|
||||
devilFruitUrl: cleanUrl
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Extract bounty from infobox
|
||||
*/
|
||||
@@ -634,7 +672,8 @@ function extractHeight($: cheerio.CheerioAPI): number | null {
|
||||
|
||||
// Keep only lines that look like a height value, then pick the latest one.
|
||||
const heightLines = lines.filter((line) => /\d/.test(line) && /(cm|m)/i.test(line));
|
||||
const latestLine = heightLines.length > 0 ? heightLines[heightLines.length - 1] : lines[lines.length - 1];
|
||||
const latestLine =
|
||||
heightLines.length > 0 ? heightLines[heightLines.length - 1] : lines[lines.length - 1];
|
||||
if (!latestLine) return null;
|
||||
|
||||
// Remove descriptive suffixes like "(post-timeskip)".
|
||||
@@ -661,7 +700,9 @@ function extractHeight($: cheerio.CheerioAPI): number | null {
|
||||
* Extract origin from infobox
|
||||
*/
|
||||
function extractOrigin($: cheerio.CheerioAPI): string | null {
|
||||
const div = $('[data-source="origin"] .pi-data-value, [data-source="origine"] .pi-data-value').first();
|
||||
const div = $(
|
||||
'[data-source="origin"] .pi-data-value, [data-source="origine"] .pi-data-value'
|
||||
).first();
|
||||
if (div.length === 0) return null;
|
||||
|
||||
let text = div.html();
|
||||
@@ -700,7 +741,6 @@ function extractStatus($: cheerio.CheerioAPI): string | null {
|
||||
return 'Alive';
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Save data to JSON
|
||||
*/
|
||||
@@ -736,7 +776,7 @@ async function saveToCSV(characters: Character[]): Promise<void> {
|
||||
{ id: 'arcId', title: 'Arc ID' },
|
||||
{ id: 'pictureUrl', title: 'Image URL' },
|
||||
{ id: 'url', title: 'Fandom URL' }
|
||||
],
|
||||
]
|
||||
});
|
||||
|
||||
const records = characters
|
||||
@@ -749,9 +789,11 @@ async function saveToCSV(characters: Character[]): Promise<void> {
|
||||
height: c.height || '',
|
||||
origin: c.origin || '',
|
||||
status: c.status || '',
|
||||
epithets: Array.isArray(c.epithets) ? c.epithets.join(', ') : (c.epithets || ''),
|
||||
epithets: Array.isArray(c.epithets) ? c.epithets.join(', ') : c.epithets || '',
|
||||
devilFruitId: c.devilFruitId || '',
|
||||
affiliations: Array.isArray(c.affiliations) ? c.affiliations.join(', ') : (c.affiliations || ''),
|
||||
affiliations: Array.isArray(c.affiliations)
|
||||
? c.affiliations.join(', ')
|
||||
: c.affiliations || '',
|
||||
bounty: c.bounty ?? 0,
|
||||
hakiObservation: c.hakiObservation ? 1 : 0,
|
||||
hakiArmament: c.hakiArmament ? 1 : 0,
|
||||
@@ -769,7 +811,10 @@ async function saveToCSV(characters: Character[]): Promise<void> {
|
||||
/**
|
||||
* Fetch devil fruit data from fandom using provided URL
|
||||
*/
|
||||
async function fetchDevilFruit(devilFruitUrl: string, devilFruitId: string): Promise<DevilFruit | null> {
|
||||
async function fetchDevilFruit(
|
||||
devilFruitUrl: string,
|
||||
devilFruitId: string
|
||||
): Promise<DevilFruit | null> {
|
||||
try {
|
||||
console.log(`Fetching devil fruit: ${devilFruitUrl}...`);
|
||||
|
||||
@@ -789,8 +834,9 @@ async function fetchDevilFruit(devilFruitUrl: string, devilFruitId: string): Pro
|
||||
let type: string | null = null;
|
||||
// Determine type based on categories (if categories contain "Paramecia", "Zoan", "Logia" or "Smile")
|
||||
if (jsonData.parse?.categories) {
|
||||
const categories = jsonData.parse.categories
|
||||
.map((cat: { ['*']: string }) => String(cat['*'] || '').toLowerCase());
|
||||
const categories = jsonData.parse.categories.map((cat: { ['*']: string }) =>
|
||||
String(cat['*'] || '').toLowerCase()
|
||||
);
|
||||
|
||||
if (categories.some((category: string) => category.includes('paramecia'))) {
|
||||
type = 'Paramecia';
|
||||
@@ -836,7 +882,7 @@ async function saveDevilFruitsToCSV(devilFruits: DevilFruit[]): Promise<void> {
|
||||
{ id: 'name', title: 'Name' },
|
||||
{ id: 'type', title: 'Type' },
|
||||
{ id: 'url', title: 'URL' }
|
||||
],
|
||||
]
|
||||
});
|
||||
|
||||
const records = devilFruits
|
||||
@@ -942,7 +988,6 @@ async function main(): Promise<void> {
|
||||
devilFruitUrls.add(data.devilFruitUrl);
|
||||
}
|
||||
|
||||
|
||||
characters.push(data);
|
||||
} else {
|
||||
nextFailedCharacters.push(char);
|
||||
@@ -1001,8 +1046,8 @@ async function main(): Promise<void> {
|
||||
}
|
||||
|
||||
// Update characters with normalized devil fruit IDs
|
||||
const devilFruitMap = new Map<string, string>(devilFruits.map(df => [df.id, df.id]));
|
||||
characters.forEach(char => {
|
||||
const devilFruitMap = new Map<string, string>(devilFruits.map((df) => [df.id, df.id]));
|
||||
characters.forEach((char) => {
|
||||
if (char.devilFruitUrl) {
|
||||
const normalizedId = normalizeId(char.devilFruitUrl);
|
||||
char.devilFruitId = devilFruitMap.get(normalizedId) || normalizedId;
|
||||
|
||||
Reference in New Issue
Block a user