feat: update DevilFruitType to include 'Smile' for enhanced categorization
All checks were successful
Build Docker Image / build (push) Successful in 1m28s
All checks were successful
Build Docker Image / build (push) Successful in 1m28s
This commit is contained in:
@@ -52,10 +52,11 @@ interface DevilFruit {
|
|||||||
url: string;
|
url: string;
|
||||||
}
|
}
|
||||||
|
|
||||||
const FANDOM_BASE_URL = 'https://onepiece.fandom.com/fr/wiki';
|
const FANDOM_API_BASE = 'https://onepiece.fandom.com/fr/api.php?action=parse&format=json&page=';
|
||||||
const OUTPUT_DIR = './scraped-data';
|
const OUTPUT_DIR = './scraped-data';
|
||||||
const MAX_RETRIES = 0; // Set to 0 to disable retries, can be increased if needed
|
const MAX_RETRIES = 0; // Set to 0 to disable retries, can be increased if needed
|
||||||
const INITIAL_RETRY_DELAY = 1000;
|
const INITIAL_RETRY_DELAY = 1000;
|
||||||
|
const CHARACTER_FETCH_CONCURRENCY = 50;
|
||||||
|
|
||||||
// Store cookies across requests (simulate browser behavior)
|
// Store cookies across requests (simulate browser behavior)
|
||||||
const cookies = new Map<string, string>();
|
const cookies = new Map<string, string>();
|
||||||
@@ -153,15 +154,22 @@ function normalizeId(str: string): string {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Fetch all arcs from One Piece fandom
|
* Fetch all arcs from One Piece fandom using API
|
||||||
*/
|
*/
|
||||||
async function fetchAllArcs(): Promise<Arc[]> {
|
async function fetchAllArcs(): Promise<Arc[]> {
|
||||||
try {
|
try {
|
||||||
const url = `${FANDOM_BASE_URL}/Chapitres_et_Tomes`;
|
const apiUrl = `${FANDOM_API_BASE}Chapitres_et_Tomes`;
|
||||||
console.log('Fetching arcs list...');
|
console.log('Fetching arcs list via API...');
|
||||||
const response = await fetchWithRetry(url);
|
const response = await fetchWithRetry(apiUrl);
|
||||||
const data = await response.text();
|
const jsonData = await response.json() as any;
|
||||||
const $ = cheerio.load(data);
|
|
||||||
|
// Extract HTML from API response
|
||||||
|
const htmlContent = jsonData.parse?.text?.['*'];
|
||||||
|
if (!htmlContent) {
|
||||||
|
throw new Error('Unable to extract HTML content from API response');
|
||||||
|
}
|
||||||
|
|
||||||
|
const $ = cheerio.load(htmlContent);
|
||||||
const arcs: Arc[] = [];
|
const arcs: Arc[] = [];
|
||||||
|
|
||||||
// Find all arc links in the table
|
// Find all arc links in the table
|
||||||
@@ -247,15 +255,22 @@ async function saveArcsToCSV(arcs: Arc[]): Promise<void> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Fetch all cannon characters from One Piece fandom
|
* Fetch all canon characters from One Piece fandom using API
|
||||||
*/
|
*/
|
||||||
async function fetchAllCharactersUrl(): Promise<CharacterListItem[]> {
|
async function fetchAllCharactersUrl(): Promise<CharacterListItem[]> {
|
||||||
try {
|
try {
|
||||||
const url = `${FANDOM_BASE_URL}/Liste_des_Personnages_Canon`;
|
const apiUrl = `${FANDOM_API_BASE}Liste_des_Personnages_Canon`;
|
||||||
console.log('Fetching character list...');
|
console.log('Fetching character list via API...');
|
||||||
const response = await fetchWithRetry(url);
|
const response = await fetchWithRetry(apiUrl);
|
||||||
const data = await response.text();
|
const jsonData = await response.json() as any;
|
||||||
const $ = cheerio.load(data);
|
|
||||||
|
// Extract HTML from API response
|
||||||
|
const htmlContent = jsonData.parse?.text?.['*'];
|
||||||
|
if (!htmlContent) {
|
||||||
|
throw new Error('Unable to extract HTML content from API response');
|
||||||
|
}
|
||||||
|
|
||||||
|
const $ = cheerio.load(htmlContent);
|
||||||
const characters: CharacterListItem[] = [];
|
const characters: CharacterListItem[] = [];
|
||||||
$('table.wikitable tbody tr').each((index, element) => {
|
$('table.wikitable tbody tr').each((index, element) => {
|
||||||
if (index === 0) return; // Skip header row
|
if (index === 0) return; // Skip header row
|
||||||
@@ -303,43 +318,47 @@ async function fetchCharacter(
|
|||||||
try {
|
try {
|
||||||
console.log(`Fetching: ${characterName}...`);
|
console.log(`Fetching: ${characterName}...`);
|
||||||
|
|
||||||
const response = await fetchWithRetry(`${FANDOM_BASE_URL}/${characterUrl}`, {
|
// Use API to fetch character page
|
||||||
redirect: 'follow'
|
const apiUrl = `${FANDOM_API_BASE}${characterUrl}`;
|
||||||
});
|
let response = await fetchWithRetry(apiUrl);
|
||||||
|
|
||||||
// Use final URL after redirects (canonical character page)
|
let jsonData = await response.json() as any;
|
||||||
|
|
||||||
|
// Use the final page name from the API: if parse.links contains exactly one element, the original page was a redirect, so we use element 0 as the final URL; otherwise we keep the original URL
|
||||||
let finalCharacterUrl = characterUrl;
|
let finalCharacterUrl = characterUrl;
|
||||||
let finalCharacterId = normalizeId(characterUrl);
|
if (jsonData.parse?.links?.length === 1) {
|
||||||
try {
|
finalCharacterUrl = jsonData.parse.links[0]['*'];
|
||||||
const finalUrl = new URL(response.url);
|
// Query the API again with the final URL to get the correct HTML content (in case of redirect)
|
||||||
const characterUrlPath = finalUrl.pathname.replace('/fr/wiki/', '');
|
response = await fetchWithRetry(`${FANDOM_API_BASE}${finalCharacterUrl}`);
|
||||||
if (characterUrlPath) {
|
jsonData = await response.json() as any;
|
||||||
finalCharacterUrl = characterUrlPath;
|
|
||||||
finalCharacterId = normalizeId(characterUrlPath);
|
|
||||||
}
|
|
||||||
} catch {
|
|
||||||
// If HTTP is not ok or redirected URL, throw an error to be caught in the outer block
|
|
||||||
if (!response.ok) {
|
|
||||||
throw new Error(`HTTP ${response.status}: ${response.statusText}`);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
const data = await response.text();
|
const categories = jsonData.parse?.categories || [];
|
||||||
|
|
||||||
const $ = cheerio.load(data);
|
// Extract HTML from API response
|
||||||
|
const htmlContent = jsonData.parse?.text?.['*'];
|
||||||
|
if (!htmlContent) {
|
||||||
|
throw new Error('Unable to extract HTML content from API response');
|
||||||
|
}
|
||||||
|
|
||||||
// Extract character name
|
const $ = cheerio.load(htmlContent);
|
||||||
const name = $('h1.mw-page-title-main').text().trim() || characterName.replace(/_/g, ' ');
|
|
||||||
|
const name = characterName;
|
||||||
|
|
||||||
// Generate character ID from URL + name combination
|
// Generate character ID from URL + name combination
|
||||||
finalCharacterId = normalizeId(finalCharacterUrl + '_' + name);
|
const finalCharacterId = normalizeId(finalCharacterUrl + '_' + name);
|
||||||
|
|
||||||
// Extract gender from the specific categories link
|
// Extract gender from JSON categories
|
||||||
let gender: string | null = null;
|
let gender: string | null = null;
|
||||||
if ($('.page-header__categories a[title="Catégorie:Personnages Masculins"]').length > 0) {
|
for (const cat of categories) {
|
||||||
|
const catName = cat['*'] || '';
|
||||||
|
if (catName === 'Personnages_Masculins') {
|
||||||
gender = 'Male';
|
gender = 'Male';
|
||||||
} else if ($('.page-header__categories a[title="Catégorie:Personnages Féminins"]').length > 0) {
|
break;
|
||||||
|
} else if (catName === 'Personnages_Féminins') {
|
||||||
gender = 'Female';
|
gender = 'Female';
|
||||||
|
break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Extract age
|
// Extract age
|
||||||
@@ -356,10 +375,20 @@ async function fetchCharacter(
|
|||||||
const devilFruitId = devilFruitData?.devilFruitId || null;
|
const devilFruitId = devilFruitData?.devilFruitId || null;
|
||||||
const devilFruitUrl = devilFruitData?.devilFruitUrl || null;
|
const devilFruitUrl = devilFruitData?.devilFruitUrl || null;
|
||||||
|
|
||||||
// Extract haki
|
// Extract haki from JSON categories
|
||||||
const hakiObservation = $('.page-header__categories a[title="Catégorie:Utilisateurs du Haki de l\'observation"]').length > 0;
|
let hakiObservation = false;
|
||||||
const hakiArmament = $('.page-header__categories a[title="Catégorie:Utilisateurs du Haki de l\'armement"]').length > 0;
|
let hakiArmament = false;
|
||||||
const hakiConqueror = $('.page-header__categories a[title="Catégorie:Utilisateurs du Haki des rois"]').length > 0;
|
let hakiConqueror = false;
|
||||||
|
for (const cat of categories) {
|
||||||
|
const catName = cat['*'] || '';
|
||||||
|
if (catName === 'Utilisateurs_du_Haki_de_l\'observation') {
|
||||||
|
hakiObservation = true;
|
||||||
|
} else if (catName === 'Utilisateurs_du_Haki_de_l\'armement') {
|
||||||
|
hakiArmament = true;
|
||||||
|
} else if (catName === 'Utilisateurs_du_Haki_des_rois') {
|
||||||
|
hakiConqueror = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Extract bounty
|
// Extract bounty
|
||||||
const bounty = extractBounty($);
|
const bounty = extractBounty($);
|
||||||
@@ -499,15 +528,17 @@ async function extractDevilFruit($: cheerio.CheerioAPI): Promise<DevilFruitData
|
|||||||
const cleanUrl = href.replace('/fr/wiki/', '');
|
const cleanUrl = href.replace('/fr/wiki/', '');
|
||||||
|
|
||||||
try {
|
try {
|
||||||
// Fetch the page to follow redirects
|
// Fetch the page via API to follow redirects
|
||||||
const response = await fetchWithRetry(`${FANDOM_BASE_URL}/${cleanUrl}`, {
|
const apiUrl = `${FANDOM_API_BASE}${decodeURIComponent(cleanUrl)}`;
|
||||||
redirect: 'follow' // Explicitly follow redirects
|
const response = await fetchWithRetry(apiUrl);
|
||||||
});
|
const jsonData = await response.json() as any;
|
||||||
|
|
||||||
|
// Use the final page name from the API: if parse.links contains exactly one element, the original page was a redirect, so we use element 0 as the final URL; otherwise we keep the original URL
|
||||||
|
let finalPath = cleanUrl;
|
||||||
|
if (jsonData.parse?.links?.length === 1) {
|
||||||
|
finalPath = jsonData.parse.links[0]['*'];
|
||||||
|
}
|
||||||
|
|
||||||
// Use the final URL after redirects
|
|
||||||
const finalUrl = new URL(response.url);
|
|
||||||
const pathname = finalUrl.pathname;
|
|
||||||
const finalPath = pathname.replace('/fr/wiki/', '');
|
|
||||||
|
|
||||||
if (finalPath) {
|
if (finalPath) {
|
||||||
return {
|
return {
|
||||||
@@ -628,9 +659,9 @@ function extractOrigin($: cheerio.CheerioAPI): string | null {
|
|||||||
/**
|
/**
|
||||||
* Extract status from infobox
|
* Extract status from infobox
|
||||||
*/
|
*/
|
||||||
function extractStatus($: cheerio.CheerioAPI): string | null {
|
function extractStatus($: cheerio.CheerioAPI): string {
|
||||||
const div = $('[data-source="statut"] .pi-data-value');
|
const div = $('[data-source="statut"] .pi-data-value');
|
||||||
if (div.length === 0) return null;
|
if (div.length === 0) return 'Alive';
|
||||||
|
|
||||||
const statusText = div.text().trim().toLowerCase();
|
const statusText = div.text().trim().toLowerCase();
|
||||||
|
|
||||||
@@ -640,7 +671,7 @@ function extractStatus($: cheerio.CheerioAPI): string | null {
|
|||||||
return 'Dead';
|
return 'Dead';
|
||||||
}
|
}
|
||||||
|
|
||||||
return null;
|
return 'Alive';
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -714,25 +745,35 @@ async function saveToCSV(characters: Character[]): Promise<void> {
|
|||||||
*/
|
*/
|
||||||
async function fetchDevilFruit(devilFruitUrl: string, devilFruitId: string): Promise<DevilFruit | null> {
|
async function fetchDevilFruit(devilFruitUrl: string, devilFruitId: string): Promise<DevilFruit | null> {
|
||||||
try {
|
try {
|
||||||
console.log(`Fetching devil fruit: ${devilFruitId}...`);
|
console.log(`Fetching devil fruit: ${devilFruitUrl}...`);
|
||||||
|
|
||||||
const response = await fetchWithRetry(`${FANDOM_BASE_URL}/${devilFruitUrl}`);
|
// Use API to fetch devil fruit page
|
||||||
const data = await response.text();
|
const apiUrl = `${FANDOM_API_BASE}${devilFruitUrl}`;
|
||||||
const $ = cheerio.load(data);
|
const response = await fetchWithRetry(apiUrl);
|
||||||
|
const jsonData = await response.json() as any;
|
||||||
|
|
||||||
const name = $('span.mw-page-title-main').text().trim();
|
// Extract HTML from API response
|
||||||
|
const htmlContent = jsonData.parse?.text?.['*'];
|
||||||
|
if (!htmlContent) {
|
||||||
|
throw new Error('Unable to extract HTML content from API response');
|
||||||
|
}
|
||||||
|
|
||||||
|
const name = jsonData.parse?.title || devilFruitId.replace(/_/g, ' ');
|
||||||
|
|
||||||
// Extract type from label in infobox
|
|
||||||
let type: string | null = null;
|
let type: string | null = null;
|
||||||
const typeDiv = $('[data-source="type"] .pi-data-value');
|
// Determine type based on categories (if categories contain "Paramecia", "Zoan", "Logia" or "Smile")
|
||||||
if (typeDiv.length > 0) {
|
if (jsonData.parse?.categories) {
|
||||||
const typeText = typeDiv.text().trim().toLowerCase();
|
const categories = jsonData.parse.categories
|
||||||
if (typeText.includes('zoan')) {
|
.map((cat: any) => String(cat['*'] || '').toLowerCase());
|
||||||
type = 'Zoan';
|
|
||||||
} else if (typeText.includes('paramecia')) {
|
if (categories.some((category: string) => category.includes('paramecia'))) {
|
||||||
type = 'Paramecia';
|
type = 'Paramecia';
|
||||||
} else if (typeText.includes('logia')) {
|
} else if (categories.some((category: string) => category.includes('zoan'))) {
|
||||||
|
type = 'Zoan';
|
||||||
|
} else if (categories.some((category: string) => category.includes('logia'))) {
|
||||||
type = 'Logia';
|
type = 'Logia';
|
||||||
|
} else if (categories.some((category: string) => category.includes('smile'))) {
|
||||||
|
type = 'Smile';
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -838,10 +879,16 @@ async function main(): Promise<void> {
|
|||||||
const nextFailedCharacters: CharacterListItem[] = [];
|
const nextFailedCharacters: CharacterListItem[] = [];
|
||||||
console.log(`\nFetching ${failedCharacters.length} characters...`);
|
console.log(`\nFetching ${failedCharacters.length} characters...`);
|
||||||
|
|
||||||
for (let i = 0; i < failedCharacters.length; i++) {
|
for (let i = 0; i < failedCharacters.length; i += CHARACTER_FETCH_CONCURRENCY) {
|
||||||
const char = failedCharacters[i];
|
const batch = failedCharacters.slice(i, i + CHARACTER_FETCH_CONCURRENCY);
|
||||||
|
const batchResults = await Promise.all(
|
||||||
|
batch.map(async (char) => {
|
||||||
const data = await fetchCharacter(char.url, char.name, char.pictureUrl, char.chapter);
|
const data = await fetchCharacter(char.url, char.name, char.pictureUrl, char.chapter);
|
||||||
|
return { char, data };
|
||||||
|
})
|
||||||
|
);
|
||||||
|
|
||||||
|
for (const { char, data } of batchResults) {
|
||||||
if (data) {
|
if (data) {
|
||||||
console.table({
|
console.table({
|
||||||
ID: data.id,
|
ID: data.id,
|
||||||
@@ -864,12 +911,10 @@ async function main(): Promise<void> {
|
|||||||
FandomURL: data.url
|
FandomURL: data.url
|
||||||
});
|
});
|
||||||
|
|
||||||
// Collect devil fruit URLs
|
|
||||||
if (data.devilFruitUrl) {
|
if (data.devilFruitUrl) {
|
||||||
devilFruitUrls.add(data.devilFruitUrl);
|
devilFruitUrls.add(data.devilFruitUrl);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Add arc IDs to character data
|
|
||||||
if (data.firstAppearance) {
|
if (data.firstAppearance) {
|
||||||
const arc = arcsList.find(a => a.startChapter <= data.firstAppearance && (a.endChapter === null || a.endChapter >= data.firstAppearance));
|
const arc = arcsList.find(a => a.startChapter <= data.firstAppearance && (a.endChapter === null || a.endChapter >= data.firstAppearance));
|
||||||
if (arc) {
|
if (arc) {
|
||||||
@@ -879,9 +924,8 @@ async function main(): Promise<void> {
|
|||||||
|
|
||||||
characters.push(data);
|
characters.push(data);
|
||||||
} else {
|
} else {
|
||||||
// Add to retry list and wait before next character
|
|
||||||
nextFailedCharacters.push(char);
|
nextFailedCharacters.push(char);
|
||||||
await new Promise(resolve => setTimeout(resolve, 1000));
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -2,7 +2,7 @@ import { integer, sqliteTable, text, real, unique } from 'drizzle-orm/sqlite-cor
|
|||||||
import { user } from './auth.schema';
|
import { user } from './auth.schema';
|
||||||
|
|
||||||
// Define devil fruit types
|
// Define devil fruit types
|
||||||
export type DevilFruitType = 'Paramecia' | 'Zoan' | 'Logia' | 'Unknown';
|
export type DevilFruitType = 'Paramecia' | 'Zoan' | 'Logia' | 'Smile' | 'Unknown';
|
||||||
|
|
||||||
// Define the site config table schema
|
// Define the site config table schema
|
||||||
export const config = sqliteTable('config', {
|
export const config = sqliteTable('config', {
|
||||||
|
|||||||
Reference in New Issue
Block a user