refactor: enhance character data transformation and improve fetching logic in character-related scripts

This commit is contained in:
2026-03-14 18:32:43 +01:00
parent 8b08950719
commit b1cc691422
8 changed files with 129 additions and 102 deletions

View File

@@ -119,6 +119,7 @@ function transformCharacterData(item: CharacterRecord) {
return {
id: item.id,
name: item.name,
frName: toNullable(item.frName),
gender: toNullable(item.gender),
age: toNullable(item.age),
affiliations: toJsonArray(item.affiliations),
@@ -137,7 +138,8 @@ function transformCharacterData(item: CharacterRecord) {
frEpithets: toJsonArray(item.frEpithets),
status: toNullable(item.status),
arcId: toNullable(item.arcId),
url: toNullable(item.url)
url: toNullable(item.url),
frUrl: toNullable(item.frUrl)
};
}

View File

@@ -276,13 +276,12 @@ async function saveArcsToCSV(arcs: Arc[]): Promise<void> {
}
/**
* Fetch all canon characters from One Piece fandom using API
* Fetch all canon characters from One Piece fandom, including their full data.
*/
async function fetchAllCharactersUrl(): Promise<CharacterListItem[]> {
async function fetchAllCharacters(arcsList: Arc[]): Promise<Character[]> {
try {
const apiUrl = `${FANDOM_API_BASE}List_of_Canon_Characters`;
console.log('Fetching character list via API...');
const response = await fetchWithRetry(apiUrl);
const response = await fetchWithRetry(`${FANDOM_API_BASE}List_of_Canon_Characters`);
const jsonData = await response.json();
// Extract HTML from API response
@@ -292,7 +291,7 @@ async function fetchAllCharactersUrl(): Promise<CharacterListItem[]> {
}
const $ = cheerio.load(htmlContent);
const characters: CharacterListItem[] = [];
const characterList: CharacterListItem[] = [];
$('table.fandom-table tbody tr').each((index, element) => {
if (index === 0) return; // Skip header row
let charUrl = $(element).find('td:nth-child(2) a').attr('href');
@@ -304,27 +303,99 @@ async function fetchAllCharactersUrl(): Promise<CharacterListItem[]> {
charChapter = charChapter.replace(/\D/g, '');
// If charChapter is empty, skip the character as it means they don't have a proper page and are just mentioned in the list
if (!charChapter) {
return;
}
if (parseInt(charChapter, 10) === 0) {
if (!charChapter || parseInt(charChapter, 10) === 0) {
return;
}
if (charUrl) {
charUrl = charUrl.replace('/wiki/', '');
characters.push({
characterList.push({
name: charName,
url: charUrl,
chapter: parseInt(charChapter, 10)
});
}
});
console.log(`Found ${characters.length} characters.`);
if (characterList.length === 0) {
console.error('No characters found.');
return [];
}
console.log(`Found ${characterList.length} characters.`);
// Fetch the French character list to get the picture URLs
console.log('Fetching French character list via API...');
const frResponse = await fetchWithRetry(`${FR_FANDOM_API_BASE}Liste_des_Personnages_Canon`);
const frJsonData = await frResponse.json();
// Create a map of character name to picture URL from the French list
const frHtmlContent = frJsonData.parse?.text?.['*'];
const fr$ = cheerio.load(frHtmlContent);
const frCharacterPictureMap: Record<string, string> = {};
fr$('table.wikitable tbody tr').each((index, element) => {
if (index === 0) return; // Skip header row
const charName = fr$(element).find('td:nth-child(2) a').text().trim();
const pictureUrl = fr$(element).find('td:nth-child(1) img').attr('data-src') || fr$(element).find('td:nth-child(1) img').attr('src') || null;
if (charName && pictureUrl) {
frCharacterPictureMap[charName] = pictureUrl;
}
});
const characters: Character[] = [];
let failedCharacters: CharacterListItem[] = [...characterList];
while (failedCharacters.length > 0) {
const nextFailedCharacters: CharacterListItem[] = [];
console.log(`\nFetching ${failedCharacters.length} characters...`);
for (let i = 0; i < failedCharacters.length; i += FETCH_CONCURRENCY) {
const batch = failedCharacters.slice(i, i + FETCH_CONCURRENCY);
const batchResults = await Promise.all(
batch.map(async (char) => {
const data = await fetchCharacter(char.url, char.name, char.chapter, arcsList, frCharacterPictureMap);
return { char, data };
})
);
for (const { char, data } of batchResults) {
if (data) {
console.table({
ID: data.id,
Name: data.name,
Gender: data.gender,
Age: data.age,
Status: data.status,
Epithets: data.epithets.join(', '),
Affiliations: data.affiliations.join(', '),
DevilFruitId: data.devilFruitId,
DevilFruitUrl: data.devilFruitUrl,
HakiObservation: data.hakiObservation ? 'Yes' : 'No',
HakiArmament: data.hakiArmament ? 'Yes' : 'No',
HakiConqueror: data.hakiConqueror ? 'Yes' : 'No',
Height: data.height,
Bounty: data.bounty,
Origin: data.origin,
FirstAppearance: data.firstAppearance,
pictureUrl: data.pictureUrl,
FandomURL: data.url
});
characters.push(data);
} else {
nextFailedCharacters.push(char);
}
}
}
failedCharacters = nextFailedCharacters;
if (failedCharacters.length > 0) {
console.log(`⚠️ ${failedCharacters.length} characters failed. Retrying...`);
}
}
console.log(`\n✓ Scraped ${characters.length} characters\n`);
return characters;
} catch (error) {
console.error('Error fetching character list:', (error as Error).message);
console.error('Error fetching characters:', (error as Error).message);
return [];
}
}
@@ -336,7 +407,8 @@ async function fetchCharacter(
characterUrl: string,
characterName: string,
characterChapter: number,
arcsList: Arc[]
arcsList: Arc[],
frCharacterPictureMap: Record<string, string>
): Promise<Character | null> {
try {
console.log(`Fetching: ${characterName}...`);
@@ -453,6 +525,8 @@ async function fetchCharacter(
frName = name;
}
const pictureUrl = frCharacterPictureMap[frName || ''] || null;
return {
id: finalCharacterId,
name,
@@ -475,7 +549,7 @@ async function fetchCharacter(
firstAppearance,
arcId,
status,
pictureUrl: 'Image_Non_Disponible',
pictureUrl,
url: characterUrl,
frUrl
};
@@ -934,72 +1008,17 @@ async function main(): Promise<void> {
}
// Step 2: Scraping Characters
console.log('=== Step 1: Scraping Characters ===\n');
const characterList = await fetchAllCharactersUrl();
console.log('=== Step 2: Scraping Characters ===\n');
const characters = await fetchAllCharacters(arcsList);
if (characterList.length === 0) {
if (characters.length === 0) {
console.error('No characters found. Exiting.');
return;
}
const characters: Character[] = [];
const devilFruitUrls = new Set<string>();
let failedCharacters: CharacterListItem[] = [...characterList];
while (failedCharacters.length > 0) {
const nextFailedCharacters: CharacterListItem[] = [];
console.log(`\nFetching ${failedCharacters.length} characters...`);
for (let i = 0; i < failedCharacters.length; i += FETCH_CONCURRENCY) {
const batch = failedCharacters.slice(i, i + FETCH_CONCURRENCY);
const batchResults = await Promise.all(
batch.map(async (char) => {
const data = await fetchCharacter(char.url, char.name, char.chapter, arcsList);
return { char, data };
})
);
for (const { char, data } of batchResults) {
if (data) {
console.table({
ID: data.id,
Name: data.name,
Gender: data.gender,
Age: data.age,
Status: data.status,
Epithets: data.epithets.join(', '),
Affiliations: data.affiliations.join(', '),
DevilFruitId: data.devilFruitId,
DevilFruitUrl: data.devilFruitUrl,
HakiObservation: data.hakiObservation ? 'Yes' : 'No',
HakiArmament: data.hakiArmament ? 'Yes' : 'No',
HakiConqueror: data.hakiConqueror ? 'Yes' : 'No',
Height: data.height,
Bounty: data.bounty,
Origin: data.origin,
FirstAppearance: data.firstAppearance,
pictureUrl: data.pictureUrl,
FandomURL: data.url
});
if (data.devilFruitUrl) {
devilFruitUrls.add(data.devilFruitUrl);
}
characters.push(data);
} else {
nextFailedCharacters.push(char);
}
}
}
failedCharacters = nextFailedCharacters;
if (failedCharacters.length > 0) {
console.log(`⚠️ ${failedCharacters.length} characters failed. Retrying...`);
}
}
console.log(`\n✓ Scraped ${characters.length} characters\n`);
const devilFruitUrls = new Set<string>(
characters.filter((c) => c.devilFruitUrl).map((c) => c.devilFruitUrl!)
);
console.log(`✓ Found ${devilFruitUrls.size} unique devil fruits\n`);
// Step 3: Scraping Devil Fruits