Refactor code structure for improved readability and maintainability
This commit is contained in:
@@ -4,6 +4,8 @@ import { sql, eq } from 'drizzle-orm';
|
|||||||
import fs from 'fs';
|
import fs from 'fs';
|
||||||
import { arc, character, devilFruit, characterScrapeValidation, type DevilFruitType } from '../src/lib/server/db/schema';
|
import { arc, character, devilFruit, characterScrapeValidation, type DevilFruitType } from '../src/lib/server/db/schema';
|
||||||
|
|
||||||
|
type Status = 'Alive' | 'Dead' | 'Unknown';
|
||||||
|
|
||||||
type ArcRecord = {
|
type ArcRecord = {
|
||||||
id: string;
|
id: string;
|
||||||
name: string;
|
name: string;
|
||||||
@@ -22,9 +24,11 @@ type DevilFruitRecord = {
|
|||||||
type CharacterRecord = {
|
type CharacterRecord = {
|
||||||
id: string;
|
id: string;
|
||||||
name: string;
|
name: string;
|
||||||
|
frName?: string | null;
|
||||||
gender?: string | null;
|
gender?: string | null;
|
||||||
age?: number | null;
|
age?: number | null;
|
||||||
affiliations?: string[] | string | null;
|
affiliations?: string[] | string | null;
|
||||||
|
frAffiliations?: string[] | string | null;
|
||||||
devilFruitId?: string | null;
|
devilFruitId?: string | null;
|
||||||
hakiObservation?: boolean;
|
hakiObservation?: boolean;
|
||||||
hakiArmament?: boolean;
|
hakiArmament?: boolean;
|
||||||
@@ -32,12 +36,15 @@ type CharacterRecord = {
|
|||||||
bounty?: number | null;
|
bounty?: number | null;
|
||||||
height?: number | null;
|
height?: number | null;
|
||||||
origin?: string | null;
|
origin?: string | null;
|
||||||
|
frOrigin?: string | null;
|
||||||
firstAppearance?: number;
|
firstAppearance?: number;
|
||||||
pictureUrl?: string | null;
|
pictureUrl?: string | null;
|
||||||
epithets?: string[] | string | null;
|
epithets?: string[] | string | null;
|
||||||
status?: string | null;
|
frEpithets?: string[] | string | null;
|
||||||
|
status?: Status | null;
|
||||||
arcId?: string | null;
|
arcId?: string | null;
|
||||||
url?: string | null;
|
url?: string | null;
|
||||||
|
frUrl?: string | null;
|
||||||
};
|
};
|
||||||
|
|
||||||
const DATABASE_URL = process.env.DATABASE_URL || 'file:local.db';
|
const DATABASE_URL = process.env.DATABASE_URL || 'file:local.db';
|
||||||
@@ -86,7 +93,7 @@ function toJsonArray(value: string[] | string | null | undefined): string[] | nu
|
|||||||
|
|
||||||
function toDevilFruitType(value: DevilFruitType | string | null | undefined): DevilFruitType | null {
|
function toDevilFruitType(value: DevilFruitType | string | null | undefined): DevilFruitType | null {
|
||||||
if (!value) return null;
|
if (!value) return null;
|
||||||
if (value === 'Paramecia' || value === 'Zoan' || value === 'Logia' || value === 'Unknown') {
|
if (value === 'Paramecia' || value === 'Zoan' || value === 'Logia' || value === 'Smile' || value === 'Unknown') {
|
||||||
return value;
|
return value;
|
||||||
}
|
}
|
||||||
return 'Unknown';
|
return 'Unknown';
|
||||||
@@ -115,59 +122,25 @@ function transformCharacterData(item: CharacterRecord) {
|
|||||||
gender: toNullable(item.gender),
|
gender: toNullable(item.gender),
|
||||||
age: toNullable(item.age),
|
age: toNullable(item.age),
|
||||||
affiliations: toJsonArray(item.affiliations),
|
affiliations: toJsonArray(item.affiliations),
|
||||||
|
frAffiliations: toJsonArray(item.frAffiliations),
|
||||||
devilFruitId: toNullable(item.devilFruitId),
|
devilFruitId: toNullable(item.devilFruitId),
|
||||||
hakiObservation: !!item.hakiObservation,
|
hakiObservation: !!item.hakiObservation,
|
||||||
hakiArmament: !!item.hakiArmament,
|
hakiArmament: !!item.hakiArmament,
|
||||||
hakiConqueror: !!item.hakiConqueror,
|
hakiConqueror: !!item.hakiConqueror,
|
||||||
bounty: item.bounty ?? 0,
|
bounty: item.bounty ?? 0,
|
||||||
height: toNumber(item.height as any),
|
height: toNumber(item.height as string | number | null),
|
||||||
origin: toNullable(item.origin),
|
origin: toNullable(item.origin),
|
||||||
|
frOrigin: toNullable(item.frOrigin),
|
||||||
firstAppearance: item.firstAppearance ?? 0,
|
firstAppearance: item.firstAppearance ?? 0,
|
||||||
pictureUrl: toNullable(item.pictureUrl),
|
pictureUrl: toNullable(item.pictureUrl),
|
||||||
epithets: toJsonArray(item.epithets),
|
epithets: toJsonArray(item.epithets),
|
||||||
|
frEpithets: toJsonArray(item.frEpithets),
|
||||||
status: toNullable(item.status),
|
status: toNullable(item.status),
|
||||||
arcId: toNullable(item.arcId),
|
arcId: toNullable(item.arcId),
|
||||||
url: toNullable(item.url)
|
url: toNullable(item.url)
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
function hasChanged(jsonData: any, dbData: any): boolean {
|
|
||||||
if (!dbData) return true;
|
|
||||||
|
|
||||||
// Print any differences for debugging
|
|
||||||
for (const key in jsonData) {
|
|
||||||
const jsonValue = jsonData[key];
|
|
||||||
const dbValue = dbData[key];
|
|
||||||
const jsonString = typeof jsonValue === 'object' ? JSON.stringify(jsonValue) : String(jsonValue);
|
|
||||||
const dbString = typeof dbValue === 'object' ? JSON.stringify(dbValue) : String(dbValue);
|
|
||||||
if (jsonString !== dbString) {
|
|
||||||
console.log(`\nField "${key}" changed for character ID ${jsonData.id}:`);
|
|
||||||
console.log(` JSON: ${jsonString}`);
|
|
||||||
console.log(` DB: ${dbString}`);
|
|
||||||
} }
|
|
||||||
|
|
||||||
// Compare each field
|
|
||||||
return (
|
|
||||||
jsonData.name != dbData.name ||
|
|
||||||
jsonData.gender != dbData.gender ||
|
|
||||||
jsonData.age != dbData.age ||
|
|
||||||
JSON.stringify(jsonData.affiliations) != JSON.stringify(dbData.affiliations) ||
|
|
||||||
jsonData.devilFruitId != dbData.devilFruitId ||
|
|
||||||
jsonData.hakiObservation != dbData.hakiObservation ||
|
|
||||||
jsonData.hakiArmament != dbData.hakiArmament ||
|
|
||||||
jsonData.hakiConqueror != dbData.hakiConqueror ||
|
|
||||||
jsonData.bounty != dbData.bounty ||
|
|
||||||
jsonData.height != dbData.height ||
|
|
||||||
jsonData.origin != dbData.origin ||
|
|
||||||
jsonData.firstAppearance != dbData.firstAppearance ||
|
|
||||||
jsonData.pictureUrl != dbData.pictureUrl ||
|
|
||||||
JSON.stringify(jsonData.epithets) != JSON.stringify(dbData.epithets) ||
|
|
||||||
jsonData.status != dbData.status ||
|
|
||||||
jsonData.arcId != dbData.arcId ||
|
|
||||||
jsonData.url != dbData.url
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
async function isCharacterTableEmpty(): Promise<boolean> {
|
async function isCharacterTableEmpty(): Promise<boolean> {
|
||||||
const result = await db.select({ count: sql<number>`COUNT(*)` }).from(character);
|
const result = await db.select({ count: sql<number>`COUNT(*)` }).from(character);
|
||||||
return result[0]?.count === 0;
|
return result[0]?.count === 0;
|
||||||
|
|||||||
@@ -57,14 +57,15 @@ interface DevilFruit {
|
|||||||
url: string;
|
url: string;
|
||||||
}
|
}
|
||||||
|
|
||||||
const FANDOM_API_BASE = 'https://onepiece.fandom.com/api.php?action=parse&redirects=true&format=json&page=';
|
const FANDOM_API_BASE =
|
||||||
const FR_FANDOM_API_BASE = 'https://onepiece.fandom.com/fr/api.php?action=parse&redirects=true&format=json&page=';
|
'https://onepiece.fandom.com/api.php?action=parse&redirects=true&format=json&page=';
|
||||||
|
const FR_FANDOM_API_BASE =
|
||||||
|
'https://onepiece.fandom.com/fr/api.php?action=parse&redirects=true&format=json&page=';
|
||||||
const OUTPUT_DIR = './scraped-data';
|
const OUTPUT_DIR = './scraped-data';
|
||||||
const MAX_RETRIES = 0; // Set to 0 to disable retries, can be increased if needed
|
const MAX_RETRIES = 0; // Set to 0 to disable retries, can be increased if needed
|
||||||
const INITIAL_RETRY_DELAY = 1000;
|
const INITIAL_RETRY_DELAY = 1000;
|
||||||
const FETCH_CONCURRENCY = 50;
|
const FETCH_CONCURRENCY = 50;
|
||||||
|
|
||||||
|
|
||||||
// Create output directory
|
// Create output directory
|
||||||
if (!fs.existsSync(OUTPUT_DIR)) {
|
if (!fs.existsSync(OUTPUT_DIR)) {
|
||||||
fs.mkdirSync(OUTPUT_DIR, { recursive: true });
|
fs.mkdirSync(OUTPUT_DIR, { recursive: true });
|
||||||
@@ -73,13 +74,17 @@ if (!fs.existsSync(OUTPUT_DIR)) {
|
|||||||
/**
|
/**
|
||||||
* Retry a fetch request with exponential backoff
|
* Retry a fetch request with exponential backoff
|
||||||
*/
|
*/
|
||||||
async function fetchWithRetry(url: string, options: RequestInit = {}, retries: number = 0): Promise<Response> {
|
async function fetchWithRetry(
|
||||||
|
url: string,
|
||||||
|
options: RequestInit = {},
|
||||||
|
retries: number = 0
|
||||||
|
): Promise<Response> {
|
||||||
try {
|
try {
|
||||||
const headers: Record<string, string> = {
|
const headers: Record<string, string> = {
|
||||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:150.0) Firefox/150.0',
|
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:150.0) Firefox/150.0',
|
||||||
'Accept-Language': 'en-US,en;q=0.9',
|
'Accept-Language': 'en-US,en;q=0.9',
|
||||||
'Accept-Encoding': 'gzip, deflate, br',
|
'Accept-Encoding': 'gzip, deflate, br',
|
||||||
'Connection': 'keep-alive',
|
Connection: 'keep-alive',
|
||||||
...((options.headers as Record<string, string>) || {})
|
...((options.headers as Record<string, string>) || {})
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -97,7 +102,7 @@ async function fetchWithRetry(url: string, options: RequestInit = {}, retries: n
|
|||||||
if (retries < MAX_RETRIES) {
|
if (retries < MAX_RETRIES) {
|
||||||
const delay = INITIAL_RETRY_DELAY * Math.pow(2, retries);
|
const delay = INITIAL_RETRY_DELAY * Math.pow(2, retries);
|
||||||
console.log(`⚠️ HTTP ${response.status} for ${url}, retrying in ${delay}ms...`);
|
console.log(`⚠️ HTTP ${response.status} for ${url}, retrying in ${delay}ms...`);
|
||||||
await new Promise(resolve => setTimeout(resolve, delay));
|
await new Promise((resolve) => setTimeout(resolve, delay));
|
||||||
return fetchWithRetry(url, options, retries + 1);
|
return fetchWithRetry(url, options, retries + 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -108,7 +113,7 @@ async function fetchWithRetry(url: string, options: RequestInit = {}, retries: n
|
|||||||
if (retries < MAX_RETRIES) {
|
if (retries < MAX_RETRIES) {
|
||||||
const delay = INITIAL_RETRY_DELAY * Math.pow(2, retries);
|
const delay = INITIAL_RETRY_DELAY * Math.pow(2, retries);
|
||||||
console.log(`⚠️ Network error: ${(error as Error).message}, retrying in ${delay}ms...`);
|
console.log(`⚠️ Network error: ${(error as Error).message}, retrying in ${delay}ms...`);
|
||||||
await new Promise(resolve => setTimeout(resolve, delay));
|
await new Promise((resolve) => setTimeout(resolve, delay));
|
||||||
return fetchWithRetry(url, options, retries + 1);
|
return fetchWithRetry(url, options, retries + 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -123,11 +128,12 @@ async function fetchWithRetry(url: string, options: RequestInit = {}, retries: n
|
|||||||
|
|
||||||
function getFrLink(links: { lang: string; ['*']: string; url: string }[]): { url: string } | null {
|
function getFrLink(links: { lang: string; ['*']: string; url: string }[]): { url: string } | null {
|
||||||
// Get french url by getting parse.langlinks where lang is "fr" and extract the name from there
|
// Get french url by getting parse.langlinks where lang is "fr" and extract the name from there
|
||||||
const frLink = links.find((link: { lang: string; ['*']: string; url: string }) => link.lang === 'fr');
|
const frLink = links.find(
|
||||||
|
(link: { lang: string; ['*']: string; url: string }) => link.lang === 'fr'
|
||||||
|
);
|
||||||
return frLink ? { url: frLink['url'] } : null;
|
return frLink ? { url: frLink['url'] } : null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Normalize string by decoding URI components, punctuation, and replacing spaces with underscores
|
* Normalize string by decoding URI components, punctuation, and replacing spaces with underscores
|
||||||
*/
|
*/
|
||||||
@@ -190,9 +196,7 @@ async function fetchAllArcs(): Promise<Arc[]> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
const startChapter = parseInt(chapterMatch[1], 10);
|
const startChapter = parseInt(chapterMatch[1], 10);
|
||||||
const endChapter = /current/i.test(chapterMatch[2])
|
const endChapter = /current/i.test(chapterMatch[2]) ? null : parseInt(chapterMatch[2], 10);
|
||||||
? null
|
|
||||||
: parseInt(chapterMatch[2], 10);
|
|
||||||
|
|
||||||
let arcId = normalizeId(cleanUrl);
|
let arcId = normalizeId(cleanUrl);
|
||||||
arcId = arcId.replace(/_arc$/i, '');
|
arcId = arcId.replace(/_arc$/i, '');
|
||||||
@@ -200,7 +204,10 @@ async function fetchAllArcs(): Promise<Arc[]> {
|
|||||||
// Query the href page via API to get the correct HTML content (in case of redirect) and extract the French name from there
|
// Query the href page via API to get the correct HTML content (in case of redirect) and extract the French name from there
|
||||||
const arcResponse = await fetchWithRetry(`${FANDOM_API_BASE}${cleanUrl}`);
|
const arcResponse = await fetchWithRetry(`${FANDOM_API_BASE}${cleanUrl}`);
|
||||||
const arcJsonData = await arcResponse.json();
|
const arcJsonData = await arcResponse.json();
|
||||||
let frArcName: string | null = arcJsonData.parse?.langlinks.find((link: { lang: string; ['*']: string }) => link.lang === 'fr')?.['*'] || null;
|
let frArcName: string | null =
|
||||||
|
arcJsonData.parse?.langlinks.find(
|
||||||
|
(link: { lang: string; ['*']: string }) => link.lang === 'fr'
|
||||||
|
)?.['*'] || null;
|
||||||
|
|
||||||
// Remove "Arc" suffix from French name if present to keep it consistent with English names (e.g. "Arc de Luffy" becomes "Luffy")
|
// Remove "Arc" suffix from French name if present to keep it consistent with English names (e.g. "Arc de Luffy" becomes "Luffy")
|
||||||
if (frArcName && /\bArc\b/i.test(frArcName)) {
|
if (frArcName && /\bArc\b/i.test(frArcName)) {
|
||||||
@@ -250,7 +257,7 @@ async function saveArcsToCSV(arcs: Arc[]): Promise<void> {
|
|||||||
{ id: 'startChapter', title: 'Start Chapter' },
|
{ id: 'startChapter', title: 'Start Chapter' },
|
||||||
{ id: 'endChapter', title: 'End Chapter' },
|
{ id: 'endChapter', title: 'End Chapter' },
|
||||||
{ id: 'url', title: 'URL' }
|
{ id: 'url', title: 'URL' }
|
||||||
],
|
]
|
||||||
});
|
});
|
||||||
|
|
||||||
const records = arcs
|
const records = arcs
|
||||||
@@ -329,7 +336,7 @@ async function fetchCharacter(
|
|||||||
characterUrl: string,
|
characterUrl: string,
|
||||||
characterName: string,
|
characterName: string,
|
||||||
characterChapter: number,
|
characterChapter: number,
|
||||||
arcsList: Arc[],
|
arcsList: Arc[]
|
||||||
): Promise<Character | null> {
|
): Promise<Character | null> {
|
||||||
try {
|
try {
|
||||||
console.log(`Fetching: ${characterName}...`);
|
console.log(`Fetching: ${characterName}...`);
|
||||||
@@ -371,7 +378,7 @@ async function fetchCharacter(
|
|||||||
const age = extractAge($);
|
const age = extractAge($);
|
||||||
|
|
||||||
// Extract affiliations
|
// Extract affiliations
|
||||||
const affiliations = extractAffiliations($);
|
const affiliations = await extractAffiliations($, 'en');
|
||||||
|
|
||||||
// Extract epithets
|
// Extract epithets
|
||||||
const epithets = extractEpithets($);
|
const epithets = extractEpithets($);
|
||||||
@@ -412,7 +419,11 @@ async function fetchCharacter(
|
|||||||
const status = extractStatus($);
|
const status = extractStatus($);
|
||||||
|
|
||||||
let arcId = '';
|
let arcId = '';
|
||||||
const arc = arcsList.find(a => a.startChapter <= firstAppearance && (a.endChapter === null || a.endChapter >= firstAppearance));
|
const arc = arcsList.find(
|
||||||
|
(a) =>
|
||||||
|
a.startChapter <= firstAppearance &&
|
||||||
|
(a.endChapter === null || a.endChapter >= firstAppearance)
|
||||||
|
);
|
||||||
if (!arc) {
|
if (!arc) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
@@ -420,16 +431,27 @@ async function fetchCharacter(
|
|||||||
|
|
||||||
const frLink = getFrLink(jsonData.parse?.langlinks || []);
|
const frLink = getFrLink(jsonData.parse?.langlinks || []);
|
||||||
const frUrl = frLink ? frLink.url.replace('https://onepiece.fandom.com/fr/wiki/', '') : null;
|
const frUrl = frLink ? frLink.url.replace('https://onepiece.fandom.com/fr/wiki/', '') : null;
|
||||||
const frjsonData = frUrl ? await fetchWithRetry(`${FR_FANDOM_API_BASE}${frUrl}`).then(res => res.json()) : null;
|
const frjsonData = frUrl
|
||||||
|
? await fetchWithRetry(`${FR_FANDOM_API_BASE}${frUrl}`).then((res) => res.json())
|
||||||
|
: null;
|
||||||
|
|
||||||
const frName = frjsonData?.parse?.title || null;
|
let frName = frjsonData?.parse?.title || null;
|
||||||
|
|
||||||
const frAffiliations = frjsonData ? extractAffiliations(cheerio.load(frjsonData.parse?.text?.['*'] || '')) : null;
|
const frAffiliations = frjsonData
|
||||||
|
? await extractAffiliations(cheerio.load(frjsonData.parse?.text?.['*'] || ''), 'fr')
|
||||||
|
: null;
|
||||||
|
|
||||||
const frEpithets = frjsonData ? extractEpithets(cheerio.load(frjsonData.parse?.text?.['*'] || '')) : null;
|
const frEpithets = frjsonData
|
||||||
|
? extractEpithets(cheerio.load(frjsonData.parse?.text?.['*'] || ''))
|
||||||
|
: null;
|
||||||
|
|
||||||
const frOrigin = frjsonData ? extractOrigin(cheerio.load(frjsonData.parse?.text?.['*'] || '')) : null;
|
const frOrigin = frjsonData
|
||||||
|
? extractOrigin(cheerio.load(frjsonData.parse?.text?.['*'] || ''))
|
||||||
|
: null;
|
||||||
|
|
||||||
|
if (name !== jsonData.parse?.title) {
|
||||||
|
frName = name;
|
||||||
|
}
|
||||||
|
|
||||||
return {
|
return {
|
||||||
id: finalCharacterId,
|
id: finalCharacterId,
|
||||||
@@ -453,7 +475,7 @@ async function fetchCharacter(
|
|||||||
firstAppearance,
|
firstAppearance,
|
||||||
arcId,
|
arcId,
|
||||||
status,
|
status,
|
||||||
pictureUrl: "Image_Non_Disponible",
|
pictureUrl: 'Image_Non_Disponible',
|
||||||
url: characterUrl,
|
url: characterUrl,
|
||||||
frUrl
|
frUrl
|
||||||
};
|
};
|
||||||
@@ -463,7 +485,6 @@ async function fetchCharacter(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Extract age from infobox
|
* Extract age from infobox
|
||||||
*/
|
*/
|
||||||
@@ -492,7 +513,7 @@ function extractAge($: cheerio.CheerioAPI): number | null {
|
|||||||
/**
|
/**
|
||||||
* Extract affiliations from infobox
|
* Extract affiliations from infobox
|
||||||
*/
|
*/
|
||||||
function extractAffiliations($: cheerio.CheerioAPI): string[] {
|
async function extractAffiliations($: cheerio.CheerioAPI, lang: string): Promise<string[]> {
|
||||||
const div = $('[data-source="affiliation"] .pi-data-value');
|
const div = $('[data-source="affiliation"] .pi-data-value');
|
||||||
if (div.length === 0) return [];
|
if (div.length === 0) return [];
|
||||||
|
|
||||||
@@ -502,10 +523,31 @@ function extractAffiliations($: cheerio.CheerioAPI): string[] {
|
|||||||
const text = cleanedDiv.html();
|
const text = cleanedDiv.html();
|
||||||
if (!text) return [];
|
if (!text) return [];
|
||||||
|
|
||||||
// Extract all link values
|
// Resolve affiliations from linked page titles.
|
||||||
const linkValues = cleanedDiv.find('a').map((_i, el) => $(el).text().trim()).get();
|
const links = cleanedDiv.find('a').toArray();
|
||||||
if (linkValues.length > 0) {
|
if (links.length > 0) {
|
||||||
return linkValues;
|
const linkValues = await Promise.all(
|
||||||
|
links.map(async (el) => {
|
||||||
|
const href = $(el).attr('href') || '';
|
||||||
|
const resolvedTitle = await fetchWithRetry(
|
||||||
|
`${lang === 'fr' ? FR_FANDOM_API_BASE : FANDOM_API_BASE}${href.replace('/fr/wiki/', '').replace('/wiki/', '')}`
|
||||||
|
)
|
||||||
|
.then((res) => res.json())
|
||||||
|
.then((json) => json.parse?.title)
|
||||||
|
.catch(() => null);
|
||||||
|
|
||||||
|
if (resolvedTitle) {
|
||||||
|
return resolvedTitle;
|
||||||
|
}
|
||||||
|
|
||||||
|
return $(el).text().trim();
|
||||||
|
})
|
||||||
|
);
|
||||||
|
|
||||||
|
const uniqueLinks = Array.from(new Set(linkValues.filter(Boolean)));
|
||||||
|
if (uniqueLinks.length > 0) {
|
||||||
|
return uniqueLinks;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Fallback to parsing text
|
// Fallback to parsing text
|
||||||
@@ -528,9 +570,7 @@ function extractEpithets($: cheerio.CheerioAPI): string[] {
|
|||||||
const html = cleanedDiv.html();
|
const html = cleanedDiv.html();
|
||||||
if (!html) return [];
|
if (!html) return [];
|
||||||
|
|
||||||
const plainText = html
|
const plainText = html.replace(/<br\s*\/?\s*>/gi, '\n').replace(/<[^>]*>/g, '');
|
||||||
.replace(/<br\s*\/?\s*>/gi, '\n')
|
|
||||||
.replace(/<[^>]*>/g, '');
|
|
||||||
|
|
||||||
const lines = plainText
|
const lines = plainText
|
||||||
.split('\n')
|
.split('\n')
|
||||||
@@ -571,14 +611,12 @@ async function extractDevilFruit($: cheerio.CheerioAPI): Promise<DevilFruitData
|
|||||||
|
|
||||||
const cleanUrl = href.replace('/wiki/', '');
|
const cleanUrl = href.replace('/wiki/', '');
|
||||||
|
|
||||||
|
|
||||||
return {
|
return {
|
||||||
devilFruitId: normalizeId(cleanUrl),
|
devilFruitId: normalizeId(cleanUrl),
|
||||||
devilFruitUrl: cleanUrl
|
devilFruitUrl: cleanUrl
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Extract bounty from infobox
|
* Extract bounty from infobox
|
||||||
*/
|
*/
|
||||||
@@ -634,7 +672,8 @@ function extractHeight($: cheerio.CheerioAPI): number | null {
|
|||||||
|
|
||||||
// Keep only lines that look like a height value, then pick the latest one.
|
// Keep only lines that look like a height value, then pick the latest one.
|
||||||
const heightLines = lines.filter((line) => /\d/.test(line) && /(cm|m)/i.test(line));
|
const heightLines = lines.filter((line) => /\d/.test(line) && /(cm|m)/i.test(line));
|
||||||
const latestLine = heightLines.length > 0 ? heightLines[heightLines.length - 1] : lines[lines.length - 1];
|
const latestLine =
|
||||||
|
heightLines.length > 0 ? heightLines[heightLines.length - 1] : lines[lines.length - 1];
|
||||||
if (!latestLine) return null;
|
if (!latestLine) return null;
|
||||||
|
|
||||||
// Remove descriptive suffixes like "(post-timeskip)".
|
// Remove descriptive suffixes like "(post-timeskip)".
|
||||||
@@ -661,7 +700,9 @@ function extractHeight($: cheerio.CheerioAPI): number | null {
|
|||||||
* Extract origin from infobox
|
* Extract origin from infobox
|
||||||
*/
|
*/
|
||||||
function extractOrigin($: cheerio.CheerioAPI): string | null {
|
function extractOrigin($: cheerio.CheerioAPI): string | null {
|
||||||
const div = $('[data-source="origin"] .pi-data-value, [data-source="origine"] .pi-data-value').first();
|
const div = $(
|
||||||
|
'[data-source="origin"] .pi-data-value, [data-source="origine"] .pi-data-value'
|
||||||
|
).first();
|
||||||
if (div.length === 0) return null;
|
if (div.length === 0) return null;
|
||||||
|
|
||||||
let text = div.html();
|
let text = div.html();
|
||||||
@@ -700,7 +741,6 @@ function extractStatus($: cheerio.CheerioAPI): string | null {
|
|||||||
return 'Alive';
|
return 'Alive';
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Save data to JSON
|
* Save data to JSON
|
||||||
*/
|
*/
|
||||||
@@ -736,7 +776,7 @@ async function saveToCSV(characters: Character[]): Promise<void> {
|
|||||||
{ id: 'arcId', title: 'Arc ID' },
|
{ id: 'arcId', title: 'Arc ID' },
|
||||||
{ id: 'pictureUrl', title: 'Image URL' },
|
{ id: 'pictureUrl', title: 'Image URL' },
|
||||||
{ id: 'url', title: 'Fandom URL' }
|
{ id: 'url', title: 'Fandom URL' }
|
||||||
],
|
]
|
||||||
});
|
});
|
||||||
|
|
||||||
const records = characters
|
const records = characters
|
||||||
@@ -749,9 +789,11 @@ async function saveToCSV(characters: Character[]): Promise<void> {
|
|||||||
height: c.height || '',
|
height: c.height || '',
|
||||||
origin: c.origin || '',
|
origin: c.origin || '',
|
||||||
status: c.status || '',
|
status: c.status || '',
|
||||||
epithets: Array.isArray(c.epithets) ? c.epithets.join(', ') : (c.epithets || ''),
|
epithets: Array.isArray(c.epithets) ? c.epithets.join(', ') : c.epithets || '',
|
||||||
devilFruitId: c.devilFruitId || '',
|
devilFruitId: c.devilFruitId || '',
|
||||||
affiliations: Array.isArray(c.affiliations) ? c.affiliations.join(', ') : (c.affiliations || ''),
|
affiliations: Array.isArray(c.affiliations)
|
||||||
|
? c.affiliations.join(', ')
|
||||||
|
: c.affiliations || '',
|
||||||
bounty: c.bounty ?? 0,
|
bounty: c.bounty ?? 0,
|
||||||
hakiObservation: c.hakiObservation ? 1 : 0,
|
hakiObservation: c.hakiObservation ? 1 : 0,
|
||||||
hakiArmament: c.hakiArmament ? 1 : 0,
|
hakiArmament: c.hakiArmament ? 1 : 0,
|
||||||
@@ -769,7 +811,10 @@ async function saveToCSV(characters: Character[]): Promise<void> {
|
|||||||
/**
|
/**
|
||||||
* Fetch devil fruit data from fandom using provided URL
|
* Fetch devil fruit data from fandom using provided URL
|
||||||
*/
|
*/
|
||||||
async function fetchDevilFruit(devilFruitUrl: string, devilFruitId: string): Promise<DevilFruit | null> {
|
async function fetchDevilFruit(
|
||||||
|
devilFruitUrl: string,
|
||||||
|
devilFruitId: string
|
||||||
|
): Promise<DevilFruit | null> {
|
||||||
try {
|
try {
|
||||||
console.log(`Fetching devil fruit: ${devilFruitUrl}...`);
|
console.log(`Fetching devil fruit: ${devilFruitUrl}...`);
|
||||||
|
|
||||||
@@ -789,8 +834,9 @@ async function fetchDevilFruit(devilFruitUrl: string, devilFruitId: string): Pro
|
|||||||
let type: string | null = null;
|
let type: string | null = null;
|
||||||
// Determine type based on categories (if categories contain "Paramecia", "Zoan", "Logia" or "Smile")
|
// Determine type based on categories (if categories contain "Paramecia", "Zoan", "Logia" or "Smile")
|
||||||
if (jsonData.parse?.categories) {
|
if (jsonData.parse?.categories) {
|
||||||
const categories = jsonData.parse.categories
|
const categories = jsonData.parse.categories.map((cat: { ['*']: string }) =>
|
||||||
.map((cat: { ['*']: string }) => String(cat['*'] || '').toLowerCase());
|
String(cat['*'] || '').toLowerCase()
|
||||||
|
);
|
||||||
|
|
||||||
if (categories.some((category: string) => category.includes('paramecia'))) {
|
if (categories.some((category: string) => category.includes('paramecia'))) {
|
||||||
type = 'Paramecia';
|
type = 'Paramecia';
|
||||||
@@ -836,7 +882,7 @@ async function saveDevilFruitsToCSV(devilFruits: DevilFruit[]): Promise<void> {
|
|||||||
{ id: 'name', title: 'Name' },
|
{ id: 'name', title: 'Name' },
|
||||||
{ id: 'type', title: 'Type' },
|
{ id: 'type', title: 'Type' },
|
||||||
{ id: 'url', title: 'URL' }
|
{ id: 'url', title: 'URL' }
|
||||||
],
|
]
|
||||||
});
|
});
|
||||||
|
|
||||||
const records = devilFruits
|
const records = devilFruits
|
||||||
@@ -942,7 +988,6 @@ async function main(): Promise<void> {
|
|||||||
devilFruitUrls.add(data.devilFruitUrl);
|
devilFruitUrls.add(data.devilFruitUrl);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
characters.push(data);
|
characters.push(data);
|
||||||
} else {
|
} else {
|
||||||
nextFailedCharacters.push(char);
|
nextFailedCharacters.push(char);
|
||||||
@@ -1001,8 +1046,8 @@ async function main(): Promise<void> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Update characters with normalized devil fruit IDs
|
// Update characters with normalized devil fruit IDs
|
||||||
const devilFruitMap = new Map<string, string>(devilFruits.map(df => [df.id, df.id]));
|
const devilFruitMap = new Map<string, string>(devilFruits.map((df) => [df.id, df.id]));
|
||||||
characters.forEach(char => {
|
characters.forEach((char) => {
|
||||||
if (char.devilFruitUrl) {
|
if (char.devilFruitUrl) {
|
||||||
const normalizedId = normalizeId(char.devilFruitUrl);
|
const normalizedId = normalizeId(char.devilFruitUrl);
|
||||||
char.devilFruitId = devilFruitMap.get(normalizedId) || normalizedId;
|
char.devilFruitId = devilFruitMap.get(normalizedId) || normalizedId;
|
||||||
|
|||||||
Reference in New Issue
Block a user