import * as cheerio from 'cheerio';
import fs from 'fs';
import { createObjectCsvWriter } from 'csv-writer';

// Type definitions
interface Arc {
  id: string;
  name: string;
  frName: string | null;
  startChapter: number;
  endChapter: number | null;
  url: string;
}

interface Character {
  id: string;
  name: string;
  frName: string | null;
  gender: string | null;
  age: number | null;
  height: number | null;
  origin: string | null;
  frOrigin: string | null;
  devilFruitId: string | null;
  devilFruitUrl: string | null;
  affiliations: string[];
  frAffiliations: string[] | null;
  bounty: number | null;
  hakiObservation: boolean;
  hakiArmament: boolean;
  hakiConqueror: boolean;
  epithets: string[];
  frEpithets: string[] | null;
  firstAppearance: number;
  status: string | null;
  pictureUrl: string | null;
  url: string;
  frUrl: string | null;
  arcId: string;
}

interface CharacterListItem {
  name: string;
  url: string;
  chapter: number;
}

interface DevilFruitData {
  devilFruitId: string;
  devilFruitUrl: string;
}

interface DevilFruit {
  id: string;
  name: string;
  type: string | null;
  url: string;
}

const FANDOM_API_BASE =
  'https://onepiece.fandom.com/api.php?action=parse&redirects=true&format=json&page=';
const FR_FANDOM_API_BASE =
  'https://onepiece.fandom.com/fr/api.php?action=parse&redirects=true&format=json&page=';
const OUTPUT_DIR = './scraped-data';
const MAX_RETRIES = 0; // Set to 0 to disable retries, can be increased if needed
const INITIAL_RETRY_DELAY = 1000;
const FETCH_CONCURRENCY = 50;
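// For reference, a request built from these bases looks like the following
// (the page name is illustrative):
//   https://onepiece.fandom.com/api.php?action=parse&redirects=true&format=json&page=Monkey_D._Luffy
// Only a few fields of the MediaWiki parse response are used below:
//   parse.title       - canonical page title after redirect resolution
//   parse.text['*']   - rendered page HTML (loaded into cheerio)
//   parse.langlinks   - interlanguage links (used to locate the French page)
//   parse.categories  - page categories (used for gender, Haki and fruit type)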
// Create output directory
if (!fs.existsSync(OUTPUT_DIR)) {
  fs.mkdirSync(OUTPUT_DIR, { recursive: true });
}

/**
 * Retry a fetch request with exponential backoff
 */
async function fetchWithRetry(
  url: string,
  options: RequestInit = {},
  retries: number = 0
): Promise<Response> {
  try {
    const headers: Record<string, string> = {
      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:150.0) Firefox/150.0',
      'Accept-Language': 'en-US,en;q=0.9',
      'Accept-Encoding': 'gzip, deflate, br',
      Connection: 'keep-alive',
      ...((options.headers as Record<string, string>) || {})
    };

    const response = await fetch(url, {
      headers,
      ...options
    });

    // Check if response is OK (status 200-299)
    if (response.ok) {
      return response;
    }

    // If not OK and we have retries left, retry
    if (retries < MAX_RETRIES) {
      const delay = INITIAL_RETRY_DELAY * Math.pow(2, retries);
      console.log(`⚠️ HTTP ${response.status} for ${url}, retrying in ${delay}ms...`);
      await new Promise((resolve) => setTimeout(resolve, delay));
      return fetchWithRetry(url, options, retries + 1);
    }

    // If we've exhausted retries, throw error
    throw new Error(`HTTP ${response.status}: ${response.statusText}`);
  } catch (error) {
    // If it's a network error and we have retries left, retry
    if (retries < MAX_RETRIES) {
      const delay = INITIAL_RETRY_DELAY * Math.pow(2, retries);
      console.log(`⚠️ Network error: ${(error as Error).message}, retrying in ${delay}ms...`);
      await new Promise((resolve) => setTimeout(resolve, delay));
      return fetchWithRetry(url, options, retries + 1);
    }

    // If we've exhausted retries, throw error
    throw error;
  }
}
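// With MAX_RETRIES raised to 3, for example, a failing request would be retried
// after roughly 1s, 2s and 4s (INITIAL_RETRY_DELAY * 2^attempt) before the last
// error is finally thrown to the caller.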

/**
 * Get the French link from the API response links array
 */
function getFrLink(links: { lang: string; ['*']: string; url: string }[]): { url: string } | null {
  // Find the parse.langlinks entry whose lang is "fr" and return its URL
  const frLink = links.find(
    (link: { lang: string; ['*']: string; url: string }) => link.lang === 'fr'
  );
  return frLink ? { url: frLink['url'] } : null;
}

/**
 * Normalize a string by decoding URI components, stripping punctuation and diacritics,
 * and replacing spaces with underscores
 */
function normalizeId(str: string): string {
  return decodeURIComponent(str)
    .normalize('NFD')
    .replace(/[\u0300-\u036f]/g, '') // drop the combining marks left behind by NFD
    .replace(/[,:.()]/g, '')
    .replace(/\s+/g, '_')
    .toLowerCase();
}
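// A couple of illustrative examples:
//   normalizeId('Monkey_D._Luffy')  -> 'monkey_d_luffy'
//   normalizeId('Gomu Gomu no Mi')  -> 'gomu_gomu_no_mi'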

/**
 * Fetch all arcs from One Piece fandom using API
 */
async function fetchAllArcs(): Promise<Arc[]> {
  try {
    const apiUrl = `${FANDOM_API_BASE}Chapters_and_Volumes`;
    console.log('Fetching arcs list via API...');
    const response = await fetchWithRetry(apiUrl);
    const jsonData = await response.json();

    // Extract HTML from API response
    const htmlContent = jsonData.parse?.text?.['*'];
    if (!htmlContent) {
      throw new Error('Unable to extract HTML content from API response');
    }

    const $ = cheerio.load(htmlContent);
    const arcs: Arc[] = [];

    const seenArcUrls = new Set<string>();

    // Arc rows are in table cells where first link points to a *_Arc page and text includes chapter range.
    const arcCells = $('table.wikitable td').toArray();
    for (const element of arcCells) {
      const cell = $(element);
      const firstLink = cell.find('a').first();
      const href = firstLink.attr('href') || '';
      let arcName = firstLink.text().trim();

      if (!href.startsWith('/wiki/') || !/_Arc$/i.test(href)) {
        continue;
      }

      if (!arcName || !/\bArc\b/i.test(arcName)) {
        continue;
      }

      arcName = arcName.replace(/\bArc\b/i, '').trim();

      const cleanUrl = href.replace('/wiki/', '');
      if (seenArcUrls.has(cleanUrl)) {
        continue;
      }

      const cellText = cell.text().replace(/\s+/g, ' ').trim();
      const chapterMatch = cellText.match(/Chapters\s+(\d+)\s+to\s+(\d+|Current)/i);
      if (!chapterMatch) {
        continue;
      }

      const startChapter = parseInt(chapterMatch[1], 10);
      const endChapter = /current/i.test(chapterMatch[2]) ? null : parseInt(chapterMatch[2], 10);

      let arcId = normalizeId(cleanUrl);
      arcId = arcId.replace(/_arc$/i, '');

      // Query the href page via API to get the correct HTML content (in case of redirect) and extract the French name from there
      const arcResponse = await fetchWithRetry(`${FANDOM_API_BASE}${cleanUrl}`);
      const arcJsonData = await arcResponse.json();
      let frArcName: string | null =
        arcJsonData.parse?.langlinks?.find(
          (link: { lang: string; ['*']: string }) => link.lang === 'fr'
        )?.['*'] || null;

      // Remove "Arc" suffix from French name if present to keep it consistent with English names (e.g. "Arc de Luffy" becomes "Luffy")
      if (frArcName && /\bArc\b/i.test(frArcName)) {
        frArcName = frArcName.replace(/\bArc\b/i, '').trim();
      }

      arcs.push({
        id: arcId,
        name: arcName,
        frName: frArcName,
        startChapter,
        endChapter,
        url: cleanUrl
      });

      seenArcUrls.add(cleanUrl);
    }

    console.log(`Found ${arcs.length} arcs.`);
    return arcs;
  } catch (error) {
    console.error('Error fetching arcs list:', (error as Error).message);
    return [];
  }
}
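// An entry produced by fetchAllArcs looks roughly like this (values illustrative):
//   { id: 'romance_dawn', name: 'Romance Dawn', frName: 'Romance Dawn',
//     startChapter: 1, endChapter: 7, url: 'Romance_Dawn_Arc' }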

/**
 * Save arcs to JSON
 */
async function saveArcsToJSON(arcs: Arc[]): Promise<void> {
  const filepath = `${OUTPUT_DIR}/arcs.json`;
  fs.writeFileSync(filepath, JSON.stringify(arcs, null, 2));
  console.log(`✓ Saved to ${filepath}`);
}

/**
 * Save arcs to CSV
 */
async function saveArcsToCSV(arcs: Arc[]): Promise<void> {
  const filepath = `${OUTPUT_DIR}/arcs.csv`;
  const csvWriter = createObjectCsvWriter({
    path: filepath,
    header: [
      { id: 'id', title: 'ID' },
      { id: 'name', title: 'Name' },
      { id: 'frName', title: 'French Name' },
      { id: 'startChapter', title: 'Start Chapter' },
      { id: 'endChapter', title: 'End Chapter' },
      { id: 'url', title: 'URL' }
    ]
  });

  const records = arcs
    .filter((arc) => arc !== null)
    .map((arc) => ({
      id: arc.id || '',
      name: arc.name || '',
      frName: arc.frName || '',
      startChapter: arc.startChapter || '',
      endChapter: arc.endChapter || '',
      url: arc.url || ''
    }));

  await csvWriter.writeRecords(records);
  console.log(`✓ Saved to ${filepath}`);
}

/**
 * Fetch all canon characters from One Piece fandom, including their full data.
 */
async function fetchAllCharacters(arcsList: Arc[]): Promise<Character[]> {
  try {
    console.log('Fetching character list via API...');
    const response = await fetchWithRetry(`${FANDOM_API_BASE}List_of_Canon_Characters`);
    const jsonData = await response.json();

    // Extract HTML from API response
    const htmlContent = jsonData.parse?.text?.['*'];
    if (!htmlContent) {
      throw new Error('Unable to extract HTML content from API response');
    }

    const $ = cheerio.load(htmlContent);
    const characterList: CharacterListItem[] = [];
    $('table.fandom-table tbody tr').each((index, element) => {
      if (index === 0) return; // Skip header row
      let charUrl = $(element).find('td:nth-child(2) a').attr('href');
      const charName = $(element).find('td:nth-child(2) a').text().trim();
      let charChapter = $(element).find('td:nth-child(3)').text().trim();

      // Remove parentheses and their content from chapter info (e.g. "1 (flashback)" becomes "1")
      charChapter = charChapter.replace(/\([^)]*\)/g, '');
      charChapter = charChapter.replace(/\D/g, '');

      // If charChapter is empty, skip the character as it means they don't have a proper page and are just mentioned in the list
      if (!charChapter || parseInt(charChapter, 10) === 0) {
        return;
      }

      if (charName.toLowerCase().includes('family')) {
        return;
      }

      if (charUrl) {
        charUrl = charUrl.replace('/wiki/', '');
        characterList.push({
          name: charName,
          url: charUrl,
          chapter: parseInt(charChapter, 10)
        });
      }
    });

    if (characterList.length === 0) {
      console.error('No characters found.');
      return [];
    }
    console.log(`Found ${characterList.length} characters.`);

    // Fetch the French character list to get the picture URLs
    console.log('Fetching French character list via API...');
    const frResponse = await fetchWithRetry(`${FR_FANDOM_API_BASE}Liste_des_Personnages_Canon`);
    const frJsonData = await frResponse.json();

    // Create a map of character name to picture URL from the French list
    const frHtmlContent = frJsonData.parse?.text?.['*'];
    const fr$ = cheerio.load(frHtmlContent);
    const frCharacterPictureMap: Record<string, string> = {};
    fr$('table.wikitable tbody tr').each((index, element) => {
      if (index === 0) return; // Skip header row
      const charName = fr$(element).find('td:nth-child(2) a').text().trim();
      const pictureUrl =
        fr$(element).find('td:nth-child(1) img').attr('data-src') ||
        fr$(element).find('td:nth-child(1) img').attr('src') ||
        null;
      if (charName && pictureUrl) {
        frCharacterPictureMap[charName] = pictureUrl;
      }
    });

    const characters: Character[] = [];
    let failedCharacters: CharacterListItem[] = [...characterList];

    // Keep retrying failed characters until every one has been fetched
    while (failedCharacters.length > 0) {
      const nextFailedCharacters: CharacterListItem[] = [];
      console.log(`\nFetching ${failedCharacters.length} characters...`);

      for (let i = 0; i < failedCharacters.length; i += FETCH_CONCURRENCY) {
        const batch = failedCharacters.slice(i, i + FETCH_CONCURRENCY);
        const batchResults = await Promise.all(
          batch.map(async (char) => {
            const data = await fetchCharacter(
              char.url,
              char.name,
              char.chapter,
              arcsList,
              frCharacterPictureMap
            );
            return { char, data };
          })
        );

        for (const { char, data } of batchResults) {
          if (data) {
            console.table({
              ID: data.id,
              Name: data.name,
              Gender: data.gender,
              Age: data.age,
              Status: data.status,
              Epithets: data.epithets.join(', '),
              Affiliations: data.affiliations.join(', '),
              DevilFruitId: data.devilFruitId,
              DevilFruitUrl: data.devilFruitUrl,
              HakiObservation: data.hakiObservation ? 'Yes' : 'No',
              HakiArmament: data.hakiArmament ? 'Yes' : 'No',
              HakiConqueror: data.hakiConqueror ? 'Yes' : 'No',
              Height: data.height,
              Bounty: data.bounty,
              Origin: data.origin,
              FirstAppearance: data.firstAppearance,
              PictureUrl: data.pictureUrl,
              FandomURL: data.url
            });
            characters.push(data);
          } else {
            nextFailedCharacters.push(char);
          }
        }
      }

      failedCharacters = nextFailedCharacters;
      if (failedCharacters.length > 0) {
        console.log(`⚠️ ${failedCharacters.length} characters failed. Retrying...`);
      }
    }

    console.log(`\n✓ Scraped ${characters.length} characters\n`);
    return characters;
  } catch (error) {
    console.error('Error fetching characters:', (error as Error).message);
    return [];
  }
}

/**
 * Fetch character data from fandom using provided URL
 */
async function fetchCharacter(
  characterUrl: string,
  characterName: string,
  characterChapter: number,
  arcsList: Arc[],
  frCharacterPictureMap: Record<string, string>
): Promise<Character | null> {
  try {
    console.log(`Fetching: ${characterName}...`);

    // Use API to fetch character page
    const apiUrl = `${FANDOM_API_BASE}${characterUrl}`;
    const response = await fetchWithRetry(apiUrl);
    const jsonData = await response.json();

    const categories = jsonData.parse?.categories || [];

    // Extract HTML from API response
    const htmlContent = jsonData.parse?.text?.['*'];
    if (!htmlContent) {
      throw new Error('Unable to extract HTML content from API response');
    }

    const $ = cheerio.load(htmlContent);

    const name = characterName;

    // Generate character ID from URL + name combination
    const finalCharacterId = normalizeId(characterUrl + '_' + name);

    // Extract gender from JSON categories
    let gender: string | null = null;
    for (const cat of categories) {
      const catName = cat['*'] || '';
      if (
        catName === 'Male_Characters' ||
        catName === 'Kings' ||
        catName === 'Princes' ||
        catName === 'Former_Kings' ||
        catName === 'Former_Princes'
      ) {
        gender = 'Male';
        break;
      } else if (
        catName === 'Female_Characters' ||
        catName === 'Queens' ||
        catName === 'Princesses' ||
        catName === 'Former_Queens' ||
        catName === 'Former_Princesses'
      ) {
        gender = 'Female';
        break;
      }
    }

    // Extract age
    const age = extractAge($);

    // Extract affiliations
    const affiliations = await extractAffiliations($, 'en');

    // Extract epithets
    const epithets = extractEpithets($);

    // Extract devil fruit
    const devilFruitData = await extractDevilFruit($);
    const devilFruitId = devilFruitData?.devilFruitId || null;
    const devilFruitUrl = devilFruitData?.devilFruitUrl || null;

    // Extract haki from JSON categories
    let hakiObservation = false;
    let hakiArmament = false;
    let hakiConqueror = false;
    for (const cat of categories) {
      const catName = cat['*'] || '';
      if (catName === 'Observation_Haki_Users') {
        hakiObservation = true;
      } else if (catName === 'Armament_Haki_Users') {
        hakiArmament = true;
      } else if (catName === 'Supreme_King_Haki_Users') {
        hakiConqueror = true;
      }
    }

    // Extract bounty
    const bounty = extractBounty($);

    // Extract height
    const height = extractHeight($);

    // Use chapter from character list
    const firstAppearance = characterChapter;

    // Extract origin
    const origin = extractOrigin($);

    // Extract status
    const status = extractStatus($);

    // Match the first appearance chapter to an arc
    let arcId = '';
    const arc = arcsList.find(
      (a) =>
        a.startChapter <= firstAppearance &&
        (a.endChapter === null || a.endChapter >= firstAppearance)
    );
    if (!arc) {
      return null;
    }
    arcId = arc.id;

    const frLink = getFrLink(jsonData.parse?.langlinks || []);
    const frUrl = frLink ? frLink.url.replace('https://onepiece.fandom.com/fr/wiki/', '') : null;
    const frjsonData = frUrl
      ? await fetchWithRetry(`${FR_FANDOM_API_BASE}${frUrl}`).then((res) => res.json())
      : null;

    let frName = frjsonData?.parse?.title || null;

    const frAffiliations = frjsonData
      ? await extractAffiliations(cheerio.load(frjsonData.parse?.text?.['*'] || ''), 'fr')
      : null;

    const frEpithets = frjsonData
      ? extractEpithets(cheerio.load(frjsonData.parse?.text?.['*'] || ''))
      : null;

    const frOrigin = frjsonData
      ? extractOrigin(cheerio.load(frjsonData.parse?.text?.['*'] || ''))
      : null;

    if (name !== jsonData.parse?.title) {
      frName = name;
    }

    const pictureUrl = frCharacterPictureMap[frName || ''] || null;

    return {
      id: finalCharacterId,
      name,
      frName,
      gender,
      age,
      height,
      origin,
      frOrigin,
      devilFruitId,
      devilFruitUrl,
      affiliations,
      frAffiliations,
      bounty,
      hakiObservation,
      hakiArmament,
      hakiConqueror,
      epithets,
      frEpithets,
      firstAppearance,
      arcId,
      status,
      pictureUrl,
      url: characterUrl,
      frUrl
    };
  } catch (error) {
    console.error(`Error fetching ${characterName}:`, (error as Error).message);
    return null;
  }
}

/**
 * Extract age from infobox
 */
function extractAge($: cheerio.CheerioAPI): number | null {
  const div = $('[data-source="age"] .pi-data-value');
  if (div.length === 0) return null;

  let text = div.html();
  if (!text) return null;

  // Remove all sup blocks (citations)
  text = text.replace(/<sup[^>]*>.*?<\/sup>/gi, '');

  // Get the last element and extract only digits
  const parts = text.split('<br');
  const lastPart = parts[parts.length - 1];
  let cleanText = lastPart.replace(/<[^>]*>/g, '').trim();

  // Remove content with parentheses
  cleanText = cleanText.replace(/\([^)]*\)/g, '');

  const digitsOnly = cleanText.replace(/\D/g, '');
  return parseInt(digitsOnly, 10) || null;
}

/**
 * Extract affiliations from infobox
 */
async function extractAffiliations($: cheerio.CheerioAPI, lang: string): Promise<string[]> {
  const div = $('[data-source="affiliation"] .pi-data-value');
  if (div.length === 0) return [];

  const cleanedDiv = div.clone();
  cleanedDiv.find('sup').remove();

  const text = cleanedDiv.html();
  if (!text) return [];

  // Resolve affiliations from linked page titles.
  const links = cleanedDiv.find('a').toArray();
  if (links.length > 0) {
    const linkValues = await Promise.all(
      links.map(async (el) => {
        const href = $(el).attr('href') || '';
        const resolvedTitle = await fetchWithRetry(
          `${lang === 'fr' ? FR_FANDOM_API_BASE : FANDOM_API_BASE}${href.replace('/fr/wiki/', '').replace('/wiki/', '')}`
        )
          .then((res) => res.json())
          .then((json) => json.parse?.title)
          .catch(() => null);

        if (resolvedTitle) {
          return resolvedTitle;
        }

        return $(el).text().trim();
      })
    );

    const uniqueLinks = Array.from(new Set(linkValues.filter(Boolean)));
    if (uniqueLinks.length > 0) {
      return uniqueLinks;
    }
  }

  // Fallback to parsing text
  const cleanText = text.replace(/<[^>]*>/g, '').trim();
  const parts = cleanText.split(/\s*\n\s*|\s*;\s*|\s*,\s*/).filter(Boolean);
  return parts.length > 0 ? parts : [];
}

/**
 * Extract epithets from infobox
 * Handles both quoted and unquoted epithets, keeping only the main/latest readable values.
 */
function extractEpithets($: cheerio.CheerioAPI): string[] {
  const div = $(
    '[data-source="epithet"] .pi-data-value, [data-source="épithète"] .pi-data-value'
  ).first();
  if (div.length === 0) return [];

  const cleanedDiv = div.clone();
  cleanedDiv.find('sup').remove();

  const html = cleanedDiv.html();
  if (!html) return [];

  const plainText = html.replace(/<br\s*\/?\s*>/gi, '\n').replace(/<[^>]*>/g, '');

  const lines = plainText
    .split('\n')
    .map((line) => line.trim())
    .filter(Boolean);

  const epithets = lines
    .map((line) => {
      const normalized = line.replace(/\s+/g, ' ').trim();

      // Prefer explicit quoted epithet if present.
      const quotedMatch = normalized.match(/["«“](.*?)["»”]/);
      if (quotedMatch?.[1]) {
        return quotedMatch[1].trim();
      }

      // Otherwise keep only the base epithet text before extra notes/translations.
      return normalized
        .split(/[;(]/)[0]
        .replace(/["'«»“”]/g, '')
        .trim();
    })
    .filter(Boolean);

  return Array.from(new Set(epithets));
}

/**
 * Extract devil fruit from infobox
 * Returns the normalized fruit ID and the canonical page title, which is reused
 * as the page parameter for later API lookups
 */
async function extractDevilFruit($: cheerio.CheerioAPI): Promise<DevilFruitData | null> {
  const link = $('[data-source="dfname"] .pi-data-value a').first();
  if (link.length === 0) return null;

  const href = link.attr('href');
  if (!href || !href.startsWith('/wiki/')) return null;

  const cleanUrl = href.replace('/wiki/', '');

  // Query the devil fruit page via API to get the correct HTML content (in case of redirect) and extract the type from there
  const dfResponse = await fetchWithRetry(`${FANDOM_API_BASE}${cleanUrl}`);
  const dfJsonData = await dfResponse.json();
  const fruitTitle = dfJsonData.parse?.title || '';

  return {
    devilFruitId: normalizeId(fruitTitle),
    devilFruitUrl: fruitTitle
  };
}

/**
 * Extract bounty from infobox (0 when no bounty is listed)
 */
function extractBounty($: cheerio.CheerioAPI): number {
  const div = $('[data-source="bounty"] .pi-data-value');
  if (div.length === 0) return 0;

  const cleanedDiv = div.clone();
  // Drop references and old crossed-out bounty values.
  cleanedDiv.find('sup, s, del, strike').remove();

  const text = cleanedDiv.text().replace(/\s+/g, ' ').trim();
  if (!text) return 0;

  // Parse the first amount token (e.g. "3,189,000,000"), which is the active bounty.
  const amountMatch = text.match(/\d{1,3}(?:[\s,.'’]\d{3})+|\d+/);
  if (!amountMatch) return 0;

  const digits = amountMatch[0].replace(/\D/g, '');
  if (!digits) return 0;

  const value = Number(digits);
  return Number.isFinite(value) ? value : 0;
}

/**
 * Extract height from infobox
 */
function extractHeight($: cheerio.CheerioAPI): number | null {
  const div = $('[data-source="height"] .pi-data-value');
  if (div.length === 0) return null;

  let text = div.html();
  if (!text) return null;

  // Remove all sup blocks (citations)
  text = text.replace(/<sup[^>]*>.*?<\/sup>/gi, '');

  // Convert line breaks to new lines so we can reliably pick the latest value.
  const textWithNewLines = text.replace(/<br\s*\/?\s*>/gi, '\n');
  const lines = textWithNewLines
    .replace(/<[^>]*>/g, '')
    .split('\n')
    .map((line) => line.trim())
    .filter(Boolean);

  // Keep only lines that look like a height value, then pick the latest one.
  const heightLines = lines.filter((line) => /\d/.test(line) && /(cm|m)/i.test(line));
  const latestLine =
    heightLines.length > 0 ? heightLines[heightLines.length - 1] : lines[lines.length - 1];
  if (!latestLine) return null;

  // Remove descriptive suffixes like "(post-timeskip)".
  const cleanText = latestLine.replace(/\([^)]*\)/g, '').trim();
  const normalized = cleanText.toLowerCase().replace(/\s/g, '');

  // Values are stored in meters in this dataset.
  const cmMatch = normalized.match(/(\d+(?:[.,]\d+)?)cm/);
  if (cmMatch) {
    const cm = parseFloat(cmMatch[1].replace(',', '.'));
    return Number.isFinite(cm) ? cm / 100 : null;
  }

  const mMatch = normalized.match(/(\d+(?:[.,]\d+)?)m/);
  if (mMatch) {
    const meters = parseFloat(mMatch[1].replace(',', '.'));
    return Number.isFinite(meters) ? meters : null;
  }

  return null;
}
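// e.g. an infobox line of "180 cm (post-timeskip)" yields 1.8, and "1.81 m" yields 1.81.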

/**
 * Extract origin from infobox
 */
function extractOrigin($: cheerio.CheerioAPI): string | null {
  const div = $(
    '[data-source="origin"] .pi-data-value, [data-source="origine"] .pi-data-value'
  ).first();
  if (div.length === 0) return null;

  let text = div.html();
  if (!text) return null;

  // Remove all sup blocks (citations)
  text = text.replace(/<sup[^>]*>.*?<\/sup>/gi, '');

  // Extract the first value before any <br> tag
  const firstValue = text.split('<br')[0].trim();
  let cleanText = firstValue.replace(/<[^>]*>/g, '').trim();

  // Remove content with parentheses
  cleanText = cleanText.replace(/\([^)]*\)/g, '').trim();

  return cleanText || null;
}

/**
 * Extract status from infobox
 */
function extractStatus($: cheerio.CheerioAPI): string | null {
  const div = $('[data-source="status"] .pi-data-value');
  if (div.length === 0) return null;

  const statusText = div.text().trim().toLowerCase();

  if (statusText.includes('alive')) {
    return 'Alive';
  } else if (statusText.includes('deceased')) {
    return 'Dead';
  } else if (statusText.includes('unknown')) {
    return 'Unknown';
  }

  return 'Alive';
}

/**
 * Save data to JSON
 */
async function saveToJSON(characters: Character[]): Promise<void> {
  const filepath = `${OUTPUT_DIR}/characters.json`;
  fs.writeFileSync(filepath, JSON.stringify(characters, null, 2));
  console.log(`✓ Saved to ${filepath}`);
}

/**
 * Save data to CSV
 */
async function saveToCSV(characters: Character[]): Promise<void> {
  const filepath = `${OUTPUT_DIR}/characters.csv`;
  const csvWriter = createObjectCsvWriter({
    path: filepath,
    header: [
      { id: 'id', title: 'ID' },
      { id: 'name', title: 'Name' },
      { id: 'gender', title: 'Gender' },
      { id: 'age', title: 'Age' },
      { id: 'height', title: 'Height' },
      { id: 'origin', title: 'Origin' },
      { id: 'status', title: 'Status' },
      { id: 'epithets', title: 'Epithets' },
      { id: 'devilFruitId', title: 'Devil Fruit ID' },
      { id: 'affiliations', title: 'Affiliations' },
      { id: 'bounty', title: 'Bounty' },
      { id: 'hakiObservation', title: 'Haki Observation' },
      { id: 'hakiArmament', title: 'Haki Armament' },
      { id: 'hakiConqueror', title: 'Haki Conqueror' },
      { id: 'firstAppearance', title: 'First Appearance' },
      { id: 'arcId', title: 'Arc ID' },
      { id: 'pictureUrl', title: 'Image URL' },
      { id: 'url', title: 'Fandom URL' }
    ]
  });

  const records = characters
    .filter((c) => c !== null)
    .map((c) => ({
      id: c.id || '',
      name: c.name || '',
      gender: c.gender || '',
      age: c.age || '',
      height: c.height || '',
      origin: c.origin || '',
      status: c.status || '',
      epithets: Array.isArray(c.epithets) ? c.epithets.join(', ') : c.epithets || '',
      devilFruitId: c.devilFruitId || '',
      affiliations: Array.isArray(c.affiliations)
        ? c.affiliations.join(', ')
        : c.affiliations || '',
      bounty: c.bounty ?? 0,
      hakiObservation: c.hakiObservation ? 1 : 0,
      hakiArmament: c.hakiArmament ? 1 : 0,
      hakiConqueror: c.hakiConqueror ? 1 : 0,
      firstAppearance: c.firstAppearance || '',
      arcId: c.arcId || '',
      pictureUrl: c.pictureUrl || '',
      url: c.url || ''
    }));

  await csvWriter.writeRecords(records);
  console.log(`✓ Saved to ${filepath}`);
}

/**
 * Fetch devil fruit data from fandom using provided URL
 */
async function fetchDevilFruit(
  devilFruitUrl: string,
  devilFruitId: string
): Promise<DevilFruit | null> {
  try {
    console.log(`Fetching devil fruit: ${devilFruitUrl}...`);

    // Use API to fetch devil fruit page
    const apiUrl = `${FANDOM_API_BASE}${devilFruitUrl}`;
    const response = await fetchWithRetry(apiUrl);
    const jsonData = await response.json();

    // Extract HTML from API response
    const htmlContent = jsonData.parse?.text?.['*'];
    if (!htmlContent) {
      throw new Error('Unable to extract HTML content from API response');
    }

    const name = jsonData.parse?.title || devilFruitId.replace(/_/g, ' ');

    let type: string | null = null;
    // Determine type based on categories (if categories contain "Paramecia", "Zoan", "Logia" or "Smile")
    if (jsonData.parse?.categories) {
      const categories = jsonData.parse.categories.map((cat: { ['*']: string }) =>
        String(cat['*'] || '').toLowerCase()
      );

      if (categories.some((category: string) => category.includes('paramecia'))) {
        type = 'Paramecia';
      } else if (categories.some((category: string) => category.includes('zoan'))) {
        type = 'Zoan';
      } else if (categories.some((category: string) => category.includes('logia'))) {
        type = 'Logia';
      } else if (categories.some((category: string) => category.includes('smile'))) {
        type = 'Smile';
      }
    }

    return {
      id: devilFruitId,
      name,
      type,
      url: devilFruitUrl
    };
  } catch (error) {
    console.error(`Error fetching devil fruit ${devilFruitUrl}:`, (error as Error).message);
    return null;
  }
}

/**
 * Save devil fruits to JSON
 */
async function saveDevilFruitsToJSON(devilFruits: DevilFruit[]): Promise<void> {
  const filepath = `${OUTPUT_DIR}/devil-fruits.json`;
  fs.writeFileSync(filepath, JSON.stringify(devilFruits, null, 2));
  console.log(`✓ Saved to ${filepath}`);
}

/**
 * Save devil fruits to CSV
 */
async function saveDevilFruitsToCSV(devilFruits: DevilFruit[]): Promise<void> {
  const filepath = `${OUTPUT_DIR}/devil-fruits.csv`;
  const csvWriter = createObjectCsvWriter({
    path: filepath,
    header: [
      { id: 'id', title: 'ID' },
      { id: 'name', title: 'Name' },
      { id: 'type', title: 'Type' },
      { id: 'url', title: 'URL' }
    ]
  });

  const records = devilFruits
    .filter((df) => df !== null)
    .map((df) => ({
      id: df.id || '',
      name: df.name || '',
      type: df.type || '',
      url: df.url || ''
    }));

  await csvWriter.writeRecords(records);
  console.log(`✓ Saved to ${filepath}`);
}

/**
 * Main execution
 */
async function main(): Promise<void> {
  const format = process.argv[2] || 'all'; // json, csv, or all

  console.log(`\nOne Piece Scraper - Mode: ${format}\n`);

  // Step 1: Scraping Arcs
  console.log('=== Step 1: Scraping Arcs ===\n');
  const arcsList = await fetchAllArcs();

  if (arcsList.length > 0) {
    // Display arcs in table format
    arcsList.forEach((arc) => {
      console.table({
        ID: arc.id,
        Name: arc.name,
        FrenchName: arc.frName || '',
        StartChapter: arc.startChapter,
        EndChapter: arc.endChapter || 'Ongoing',
        URL: arc.url
      });
    });

    console.log(`\n✓ Found ${arcsList.length} arcs\n`);

    if (format === 'json' || format === 'all') {
      await saveArcsToJSON(arcsList);
    }
    if (format === 'csv' || format === 'all') {
      await saveArcsToCSV(arcsList);
    }
  } else {
    console.warn('No arcs found, continuing...\n');
  }

  // Step 2: Scraping Characters
  console.log('=== Step 2: Scraping Characters ===\n');
  const characters = await fetchAllCharacters(arcsList);

  if (characters.length === 0) {
    console.error('No characters found. Exiting.');
    return;
  }

  const devilFruitUrls = new Set<string>(
    characters.filter((c) => c.devilFruitUrl).map((c) => c.devilFruitUrl!)
  );
  console.log(`✓ Found ${devilFruitUrls.size} unique devil fruits\n`);

  // Step 3: Scraping Devil Fruits
  console.log('=== Step 3: Scraping Devil Fruits ===\n');

  if (devilFruitUrls.size === 0) {
    console.warn('No devil fruits found from characters, skipping...\n');
  } else {
    const devilFruits: DevilFruit[] = [];
    const devilFruitUrlArray = Array.from(devilFruitUrls);

    for (let i = 0; i < devilFruitUrlArray.length; i += FETCH_CONCURRENCY) {
      const batch = devilFruitUrlArray.slice(i, i + FETCH_CONCURRENCY);
      const batchResults = await Promise.all(
        batch.map(async (url) => {
          const data = await fetchDevilFruit(url, normalizeId(url));
          return { url, data };
        })
      );

      for (const { data } of batchResults) {
        if (data) {
          console.table({
            ID: data.id,
            Name: data.name,
            Type: data.type,
            URL: data.url
          });

          devilFruits.push(data);
        }
      }
    }

    console.log(`\n✓ Scraped ${devilFruits.length} devil fruits\n`);

    if (format === 'json' || format === 'all') {
      await saveDevilFruitsToJSON(devilFruits);
    }
    if (format === 'csv' || format === 'all') {
      await saveDevilFruitsToCSV(devilFruits);
    }

    // Update characters with normalized devil fruit IDs
    const devilFruitMap = new Map<string, string>(devilFruits.map((df) => [df.id, df.id]));
    characters.forEach((char) => {
      if (char.devilFruitUrl) {
        const normalizedId = normalizeId(char.devilFruitUrl);
        char.devilFruitId = devilFruitMap.get(normalizedId) || normalizedId;
      }
    });
  }

  // Save characters after devil fruit IDs are updated
  if (format === 'json' || format === 'all') {
    await saveToJSON(characters);
  }
  if (format === 'csv' || format === 'all') {
    await saveToCSV(characters);
  }

  console.log('\n✓ Done!\n');
}

main().catch(console.error);
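// Typical invocation (assuming this file is saved as scraper.ts and a TypeScript
// runner such as tsx or ts-node is available):
//   npx tsx scraper.ts        # write both JSON and CSV to ./scraped-data
//   npx tsx scraper.ts csv    # CSV only
//   npx tsx scraper.ts json   # JSON only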