refactor: enhance character data transformation and improve fetching logic in character-related scripts
This commit is contained in:
@@ -276,13 +276,12 @@ async function saveArcsToCSV(arcs: Arc[]): Promise<void> {
|
||||
}
|
||||
|
||||
/**
|
||||
* Fetch all canon characters from One Piece fandom using API
|
||||
* Fetch all canon characters from One Piece fandom, including their full data.
|
||||
*/
|
||||
async function fetchAllCharactersUrl(): Promise<CharacterListItem[]> {
|
||||
async function fetchAllCharacters(arcsList: Arc[]): Promise<Character[]> {
|
||||
try {
|
||||
const apiUrl = `${FANDOM_API_BASE}List_of_Canon_Characters`;
|
||||
console.log('Fetching character list via API...');
|
||||
const response = await fetchWithRetry(apiUrl);
|
||||
const response = await fetchWithRetry(`${FANDOM_API_BASE}List_of_Canon_Characters`);
|
||||
const jsonData = await response.json();
|
||||
|
||||
// Extract HTML from API response
|
||||
@@ -292,7 +291,7 @@ async function fetchAllCharactersUrl(): Promise<CharacterListItem[]> {
|
||||
}
|
||||
|
||||
const $ = cheerio.load(htmlContent);
|
||||
const characters: CharacterListItem[] = [];
|
||||
const characterList: CharacterListItem[] = [];
|
||||
$('table.fandom-table tbody tr').each((index, element) => {
|
||||
if (index === 0) return; // Skip header row
|
||||
let charUrl = $(element).find('td:nth-child(2) a').attr('href');
|
||||
@@ -304,27 +303,99 @@ async function fetchAllCharactersUrl(): Promise<CharacterListItem[]> {
|
||||
charChapter = charChapter.replace(/\D/g, '');
|
||||
|
||||
// If charChapter is empty, skip the character as it means they don't have a proper page and are just mentioned in the list
|
||||
if (!charChapter) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (parseInt(charChapter, 10) === 0) {
|
||||
if (!charChapter || parseInt(charChapter, 10) === 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (charUrl) {
|
||||
charUrl = charUrl.replace('/wiki/', '');
|
||||
characters.push({
|
||||
characterList.push({
|
||||
name: charName,
|
||||
url: charUrl,
|
||||
chapter: parseInt(charChapter, 10)
|
||||
});
|
||||
}
|
||||
});
|
||||
console.log(`Found ${characters.length} characters.`);
|
||||
|
||||
if (characterList.length === 0) {
|
||||
console.error('No characters found.');
|
||||
return [];
|
||||
}
|
||||
console.log(`Found ${characterList.length} characters.`);
|
||||
|
||||
// Fetch the French character list to get the picture URLs
|
||||
console.log('Fetching French character list via API...');
|
||||
const frResponse = await fetchWithRetry(`${FR_FANDOM_API_BASE}Liste_des_Personnages_Canon`);
|
||||
const frJsonData = await frResponse.json();
|
||||
|
||||
// Create a map of character name to picture URL from the French list
|
||||
const frHtmlContent = frJsonData.parse?.text?.['*'];
|
||||
const fr$ = cheerio.load(frHtmlContent);
|
||||
const frCharacterPictureMap: Record<string, string> = {};
|
||||
fr$('table.wikitable tbody tr').each((index, element) => {
|
||||
if (index === 0) return; // Skip header row
|
||||
const charName = fr$(element).find('td:nth-child(2) a').text().trim();
|
||||
const pictureUrl = fr$(element).find('td:nth-child(1) img').attr('data-src') || fr$(element).find('td:nth-child(1) img').attr('src') || null;
|
||||
if (charName && pictureUrl) {
|
||||
frCharacterPictureMap[charName] = pictureUrl;
|
||||
}
|
||||
});
|
||||
|
||||
const characters: Character[] = [];
|
||||
let failedCharacters: CharacterListItem[] = [...characterList];
|
||||
|
||||
while (failedCharacters.length > 0) {
|
||||
const nextFailedCharacters: CharacterListItem[] = [];
|
||||
console.log(`\nFetching ${failedCharacters.length} characters...`);
|
||||
|
||||
for (let i = 0; i < failedCharacters.length; i += FETCH_CONCURRENCY) {
|
||||
const batch = failedCharacters.slice(i, i + FETCH_CONCURRENCY);
|
||||
const batchResults = await Promise.all(
|
||||
batch.map(async (char) => {
|
||||
const data = await fetchCharacter(char.url, char.name, char.chapter, arcsList, frCharacterPictureMap);
|
||||
return { char, data };
|
||||
})
|
||||
);
|
||||
|
||||
for (const { char, data } of batchResults) {
|
||||
if (data) {
|
||||
console.table({
|
||||
ID: data.id,
|
||||
Name: data.name,
|
||||
Gender: data.gender,
|
||||
Age: data.age,
|
||||
Status: data.status,
|
||||
Epithets: data.epithets.join(', '),
|
||||
Affiliations: data.affiliations.join(', '),
|
||||
DevilFruitId: data.devilFruitId,
|
||||
DevilFruitUrl: data.devilFruitUrl,
|
||||
HakiObservation: data.hakiObservation ? 'Yes' : 'No',
|
||||
HakiArmament: data.hakiArmament ? 'Yes' : 'No',
|
||||
HakiConqueror: data.hakiConqueror ? 'Yes' : 'No',
|
||||
Height: data.height,
|
||||
Bounty: data.bounty,
|
||||
Origin: data.origin,
|
||||
FirstAppearance: data.firstAppearance,
|
||||
pictureUrl: data.pictureUrl,
|
||||
FandomURL: data.url
|
||||
});
|
||||
characters.push(data);
|
||||
} else {
|
||||
nextFailedCharacters.push(char);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
failedCharacters = nextFailedCharacters;
|
||||
if (failedCharacters.length > 0) {
|
||||
console.log(`⚠️ ${failedCharacters.length} characters failed. Retrying...`);
|
||||
}
|
||||
}
|
||||
|
||||
console.log(`\n✓ Scraped ${characters.length} characters\n`);
|
||||
return characters;
|
||||
} catch (error) {
|
||||
console.error('Error fetching character list:', (error as Error).message);
|
||||
console.error('Error fetching characters:', (error as Error).message);
|
||||
return [];
|
||||
}
|
||||
}
|
||||
@@ -336,7 +407,8 @@ async function fetchCharacter(
|
||||
characterUrl: string,
|
||||
characterName: string,
|
||||
characterChapter: number,
|
||||
arcsList: Arc[]
|
||||
arcsList: Arc[],
|
||||
frCharacterPictureMap: Record<string, string>
|
||||
): Promise<Character | null> {
|
||||
try {
|
||||
console.log(`Fetching: ${characterName}...`);
|
||||
@@ -453,6 +525,8 @@ async function fetchCharacter(
|
||||
frName = name;
|
||||
}
|
||||
|
||||
const pictureUrl = frCharacterPictureMap[frName || ''] || null;
|
||||
|
||||
return {
|
||||
id: finalCharacterId,
|
||||
name,
|
||||
@@ -475,7 +549,7 @@ async function fetchCharacter(
|
||||
firstAppearance,
|
||||
arcId,
|
||||
status,
|
||||
pictureUrl: 'Image_Non_Disponible',
|
||||
pictureUrl,
|
||||
url: characterUrl,
|
||||
frUrl
|
||||
};
|
||||
@@ -934,72 +1008,17 @@ async function main(): Promise<void> {
|
||||
}
|
||||
|
||||
// Step 2: Scraping Characters
|
||||
console.log('=== Step 1: Scraping Characters ===\n');
|
||||
const characterList = await fetchAllCharactersUrl();
|
||||
console.log('=== Step 2: Scraping Characters ===\n');
|
||||
const characters = await fetchAllCharacters(arcsList);
|
||||
|
||||
if (characterList.length === 0) {
|
||||
if (characters.length === 0) {
|
||||
console.error('No characters found. Exiting.');
|
||||
return;
|
||||
}
|
||||
|
||||
const characters: Character[] = [];
|
||||
const devilFruitUrls = new Set<string>();
|
||||
let failedCharacters: CharacterListItem[] = [...characterList];
|
||||
|
||||
while (failedCharacters.length > 0) {
|
||||
const nextFailedCharacters: CharacterListItem[] = [];
|
||||
console.log(`\nFetching ${failedCharacters.length} characters...`);
|
||||
|
||||
for (let i = 0; i < failedCharacters.length; i += FETCH_CONCURRENCY) {
|
||||
const batch = failedCharacters.slice(i, i + FETCH_CONCURRENCY);
|
||||
const batchResults = await Promise.all(
|
||||
batch.map(async (char) => {
|
||||
const data = await fetchCharacter(char.url, char.name, char.chapter, arcsList);
|
||||
return { char, data };
|
||||
})
|
||||
);
|
||||
|
||||
for (const { char, data } of batchResults) {
|
||||
if (data) {
|
||||
console.table({
|
||||
ID: data.id,
|
||||
Name: data.name,
|
||||
Gender: data.gender,
|
||||
Age: data.age,
|
||||
Status: data.status,
|
||||
Epithets: data.epithets.join(', '),
|
||||
Affiliations: data.affiliations.join(', '),
|
||||
DevilFruitId: data.devilFruitId,
|
||||
DevilFruitUrl: data.devilFruitUrl,
|
||||
HakiObservation: data.hakiObservation ? 'Yes' : 'No',
|
||||
HakiArmament: data.hakiArmament ? 'Yes' : 'No',
|
||||
HakiConqueror: data.hakiConqueror ? 'Yes' : 'No',
|
||||
Height: data.height,
|
||||
Bounty: data.bounty,
|
||||
Origin: data.origin,
|
||||
FirstAppearance: data.firstAppearance,
|
||||
pictureUrl: data.pictureUrl,
|
||||
FandomURL: data.url
|
||||
});
|
||||
|
||||
if (data.devilFruitUrl) {
|
||||
devilFruitUrls.add(data.devilFruitUrl);
|
||||
}
|
||||
|
||||
characters.push(data);
|
||||
} else {
|
||||
nextFailedCharacters.push(char);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
failedCharacters = nextFailedCharacters;
|
||||
if (failedCharacters.length > 0) {
|
||||
console.log(`⚠️ ${failedCharacters.length} characters failed. Retrying...`);
|
||||
}
|
||||
}
|
||||
|
||||
console.log(`\n✓ Scraped ${characters.length} characters\n`);
|
||||
const devilFruitUrls = new Set<string>(
|
||||
characters.filter((c) => c.devilFruitUrl).map((c) => c.devilFruitUrl!)
|
||||
);
|
||||
console.log(`✓ Found ${devilFruitUrls.size} unique devil fruits\n`);
|
||||
|
||||
// Step 3: Scraping Devil Fruits
|
||||
|
||||
Reference in New Issue
Block a user