The source website uses <span class='energy-text energy-text--type-fire'> to
render inline energy icons. BeautifulSoup's get_text() was stripping these
spans, losing the energy type information and causing merged text like
'Discard aEnergy' instead of 'Discard a Fire Energy'.

Changes:
- Add ENERGY_TEXT_TYPES mapping for inline energy references
- Add replace_energy_text_spans() to convert spans to text before extraction
- Add extract_effect_text() helper with proper text joining (separator=' ')
- Update parse_attack(), parse_ability(), _parse_trainer_details() to use it
- Fix JSON encoding in convert_cards.py to use UTF-8 (ensure_ascii=False)

Before: 'Discard an Energy from this Pokémon'
After:  'Discard a Fire Energy from this Pokémon'

Re-scraped all 372 cards and regenerated 382 definitions.
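A minimal sketch of the failure mode and of the span-replacement fix (the HTML
fragment is illustrative, not taken from the site):

    from bs4 import BeautifulSoup, NavigableString

    html = '<div>Discard a <span class="energy-text energy-text--type-fire"></span> Energy</div>'
    div = BeautifulSoup(html, "html.parser").find("div")

    # Before: the empty span contributes no text, so the neighbouring words merge.
    print(div.get_text(strip=True))  # 'Discard aEnergy'

    # After: replace the span with its text equivalent, then extract.
    for span in div.find_all("span", class_="energy-text--type-fire"):
        span.replace_with(NavigableString("Fire"))
    print(div.get_text(separator=" ", strip=True))  # 'Discard a Fire Energy'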
#!/usr/bin/env python
"""Scrape Pokemon TCG Pocket card data from pokemon-zone.com.

This script fetches card data from the Genetic Apex (A1) and Mythical Island (A1a)
sets and saves them as individual JSON files for use in the Mantimon TCG game engine.

Usage:
    # Scrape entire set
    uv run python scripts/scrape_pokemon_pocket.py --set a1

    # Scrape with limit (for testing)
    uv run python scripts/scrape_pokemon_pocket.py --set a1 --limit 5

    # Scrape single card by ID
    uv run python scripts/scrape_pokemon_pocket.py --card a1-132-gardevoir

    # Regenerate index from existing card files
    uv run python scripts/scrape_pokemon_pocket.py --reindex

Output:
    - Individual card files: data/raw/{set}/{number}-{name}.json
    - Combined index: data/raw/_index.json
    - Error log: data/raw/_errors.log
"""

import argparse
import json
import logging
import re
import sys
import time
from dataclasses import dataclass, field
from datetime import UTC, datetime
from pathlib import Path
from typing import Any

import requests
from bs4 import BeautifulSoup, Tag

# =============================================================================
# Configuration
# =============================================================================

BASE_URL = "https://www.pokemon-zone.com"
DATA_DIR = Path(__file__).parent.parent / "data" / "raw"
IMAGES_DIR = Path(__file__).parent.parent / "data" / "images"
REQUEST_DELAY = 1.5  # seconds between requests
IMAGE_REQUEST_DELAY = 0.5  # faster for images (different server)
MAX_RETRIES = 3
RETRY_DELAY = 5  # seconds

# Set info for validation and metadata
SETS = {
    "a1": {"name": "Genetic Apex", "expected_cards": 286},
    "a1a": {"name": "Mythical Island", "expected_cards": 86},
}

# Energy type mapping from CSS classes (for attack cost icons)
ENERGY_TYPES = {
    "energy-icon--type-grass": "grass",
    "energy-icon--type-fire": "fire",
    "energy-icon--type-water": "water",
    "energy-icon--type-lightning": "lightning",
    "energy-icon--type-psychic": "psychic",
    "energy-icon--type-fighting": "fighting",
    "energy-icon--type-darkness": "darkness",
    "energy-icon--type-metal": "metal",
    "energy-icon--type-colorless": "colorless",
    "energy-icon--type-dragon": "dragon",
}

# Energy text mapping from CSS classes (for inline text references)
# These appear in effect text like "Discard a <span class="energy-text--type-fire"></span> Energy"
ENERGY_TEXT_TYPES = {
    "energy-text--type-grass": "Grass",
    "energy-text--type-fire": "Fire",
    "energy-text--type-water": "Water",
    "energy-text--type-lightning": "Lightning",
    "energy-text--type-psychic": "Psychic",
    "energy-text--type-fighting": "Fighting",
    "energy-text--type-darkness": "Darkness",
    "energy-text--type-metal": "Metal",
    "energy-text--type-colorless": "Colorless",
    "energy-text--type-dragon": "Dragon",
}

# Rarity code mapping from CSS classes (rarity-icon--rarity-X)
RARITY_CODES = {
    "C": "Common",
    "U": "Uncommon",
    "R": "Rare",
    "RR": "Double Rare",
    "AR": "Art Rare",
    "SAR": "Special Art Rare",
    "UR": "Ultra Rare",
    "IM": "Immersive",
    "S": "Shiny",
    "CR": "Crown Rare",
}

# Fossil cards that are Trainer/Item cards, not Pokemon
# These are often misclassified because they have Pokemon-like layouts on the source site
FOSSIL_CARDS = {
    "Helix Fossil",
    "Dome Fossil",
    "Old Amber",
}

# Text artifact patterns to fix (caused by stripped energy icons)
# Format: (pattern, replacement)
# Energy icons render as empty strings, merging adjacent text
TEXT_ARTIFACT_FIXES = [
    # Energy-related artifacts
    (r"\baEnergy\b", "an Energy"),
    (r"\bofEnergy\b", "of Energy"),
    (r"\bextraEnergy\b", "extra Energy"),
    (r"\battachedEnergy\b", "attached Energy"),
    (r"\banyEnergy\b", "any Energy"),
    (r"(\d+)Energy\b", r"\1 Energy"),  # "2Energy" -> "2 Energy"
    (r"(\d+)-HP\b", r"\1 HP"),  # "40-HP" -> "40 HP"
    # Pokemon-related artifacts
    (r"\bBasicPokémon\b", "Basic Pokémon"),
    (r"\bBenchedPokémon\b", "Benched Pokémon"),
    (r"\bthePokémon\b", "the Pokémon"),
    (r"\bthisPokémon\b", "this Pokémon"),
    (r"\byourPokémon\b", "your Pokémon"),
    (r"\bActivePokémon\b", "Active Pokémon"),
    (r"\bDefendingPokémon\b", "Defending Pokémon"),
    (r"\bopponent'sPokémon\b", "opponent's Pokémon"),
    (r"\bOpponent'sPokémon\b", "Opponent's Pokémon"),
    # Other common artifacts
    (r"\bthatPokémon\b", "that Pokémon"),
    (r"\beachPokémon\b", "each Pokémon"),
    (r"\baPokémon\b", "a Pokémon"),
]
def replace_energy_text_spans(element: Tag) -> None:
    """Replace energy-text spans with their text representation in-place.

    The source website uses <span class="energy-text energy-text--type-fire"></span>
    to render inline energy type icons in effect text. When BeautifulSoup extracts
    text with get_text(), these empty spans disappear, merging adjacent words.

    This function finds all such spans and replaces them with their text equivalent
    (e.g., "Fire") BEFORE text extraction, preserving the energy type information.

    Args:
        element: BeautifulSoup element to process in-place. The element is modified
            directly - energy-text spans are replaced with NavigableString text.

    Example:
        >>> soup = BeautifulSoup(
        ...     '<div>Discard a <span class="energy-text--type-fire"></span> Energy</div>',
        ...     "html.parser",
        ... )
        >>> div = soup.find('div')
        >>> replace_energy_text_spans(div)
        >>> div.get_text()
        'Discard a Fire Energy'
    """
    from bs4 import NavigableString

    # Find all energy-text spans (they may have multiple classes)
    for span in element.find_all("span", class_=re.compile(r"energy-text--type-")):
        energy_type = None
        for cls in span.get("class", []):
            if cls in ENERGY_TEXT_TYPES:
                energy_type = ENERGY_TEXT_TYPES[cls]
                break

        if energy_type:
            # Replace the span with a text node
            span.replace_with(NavigableString(energy_type))
def extract_effect_text(element: Tag) -> str | None:
    """Extract effect text from an element, properly handling energy-text spans.

    This is the correct way to extract effect text that may contain energy type
    icons. It replaces the energy-text spans with readable text before extraction.

    Args:
        element: BeautifulSoup element containing effect text.

    Returns:
        Cleaned effect text, or None if empty.
    """
    # Work on a deep copy to avoid modifying the original soup
    # BeautifulSoup's copy.copy is shallow - we need to re-parse
    from bs4 import BeautifulSoup as BS

    element_copy = BS(str(element), "html.parser")
    replace_energy_text_spans(element_copy)
    # Use separator=" " to properly join text nodes with spaces
    # strip=True removes leading/trailing whitespace from each text node
    raw_text = element_copy.get_text(separator=" ", strip=True)
    # Normalize multiple spaces to single spaces
    raw_text = re.sub(r"\s+", " ", raw_text).strip()
    # Still apply artifact fixes as fallback for any edge cases
    return clean_effect_text(raw_text)


def clean_effect_text(text: str | None) -> str | None:
    """Clean scraped effect text by fixing common artifacts.

    This is a fallback for any text artifacts not caught by energy-text span
    replacement. It handles edge cases like merged words.

    Args:
        text: The raw effect text from scraping.

    Returns:
        Cleaned text with artifacts fixed, or None if input was None/empty.
    """
    if not text:
        return None

    result = text
    for pattern, replacement in TEXT_ARTIFACT_FIXES:
        result = re.sub(pattern, replacement, result)

    return result if result else None
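# Illustrative example of the fallback (input is made up, not scraped): with the
# TEXT_ARTIFACT_FIXES patterns above, clean_effect_text("Discard aEnergy from 1
# of your Pokémon") returns "Discard an Energy from 1 of your Pokémon", and
# clean_effect_text("") returns None.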
# =============================================================================
# Logging Setup
# =============================================================================

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    datefmt="%H:%M:%S",
)
logger = logging.getLogger(__name__)


# =============================================================================
# Data Classes
# =============================================================================


@dataclass
class Attack:
    """A Pokemon's attack."""

    name: str
    cost: list[str]
    damage: int | None
    damage_modifier: str | None  # "+", "x", or None
    effect_text: str | None
    effect_id: str | None = None  # To be mapped later

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary for JSON serialization."""
        return {
            "name": self.name,
            "cost": self.cost,
            "damage": self.damage,
            "damage_modifier": self.damage_modifier,
            "effect_text": self.effect_text,
            "effect_id": self.effect_id,
        }


@dataclass
class Ability:
    """A Pokemon's ability."""

    name: str
    effect_text: str
    effect_id: str | None = None  # To be mapped later

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary for JSON serialization."""
        return {
            "name": self.name,
            "effect_text": self.effect_text,
            "effect_id": self.effect_id,
        }


@dataclass
class Card:
    """Complete card data."""

    id: str
    name: str
    set_code: str
    set_name: str
    card_number: int
    rarity: str
    card_type: str  # "pokemon", "trainer", "energy"
    image_url: str | None = None  # URL to card image for offline caching
    image_file: str | None = None  # Local path to downloaded image (relative to images dir)
    hp: int | None = None
    pokemon_type: str | None = None
    stage: str | None = None  # "basic", "stage_1", "stage_2"
    evolves_from: str | None = None
    is_ex: bool = False
    abilities: list[Ability] = field(default_factory=list)
    attacks: list[Attack] = field(default_factory=list)
    weakness_type: str | None = None
    weakness_value: int | None = None
    resistance_type: str | None = None
    resistance_value: int | None = None
    retreat_cost: int = 0
    flavor_text: str | None = None
    illustrator: str | None = None
    source_url: str = ""

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary for JSON serialization."""
        data: dict[str, Any] = {
            "id": self.id,
            "name": self.name,
            "set_code": self.set_code,
            "set_name": self.set_name,
            "card_number": self.card_number,
            "rarity": self.rarity,
            "card_type": self.card_type,
            "image_url": self.image_url,
            "image_file": self.image_file,
            "source_url": self.source_url,
        }

        if self.card_type == "pokemon":
            data.update(
                {
                    "hp": self.hp,
                    "pokemon_type": self.pokemon_type,
                    "stage": self.stage,
                    "evolves_from": self.evolves_from,
                    "is_ex": self.is_ex,
                    "abilities": [a.to_dict() for a in self.abilities],
                    "attacks": [a.to_dict() for a in self.attacks],
                    "weakness": (
                        {"type": self.weakness_type, "value": self.weakness_value}
                        if self.weakness_type
                        else None
                    ),
                    "resistance": (
                        {"type": self.resistance_type, "value": self.resistance_value}
                        if self.resistance_type
                        else None
                    ),
                    "retreat_cost": self.retreat_cost,
                    "flavor_text": self.flavor_text,
                    "illustrator": self.illustrator,
                }
            )
        elif self.card_type == "trainer":
            data.update(
                {
                    "trainer_type": self.stage,  # Reusing stage field for trainer type
                    "effect_text": self.flavor_text,  # Trainer effect
                    "illustrator": self.illustrator,
                }
            )
        elif self.card_type == "energy":
            data.update(
                {
                    "energy_type": self.pokemon_type,
                    "illustrator": self.illustrator,
                }
            )

        return data
# =============================================================================
# Scraper Class
# =============================================================================


class PokemonPocketScraper:
    """Scraper for Pokemon TCG Pocket card data from pokemon-zone.com."""

    def __init__(
        self,
        data_dir: Path = DATA_DIR,
        images_dir: Path = IMAGES_DIR,
        download_images: bool = False,
    ):
        """Initialize the scraper.

        Args:
            data_dir: Directory to save card data files.
            images_dir: Directory to save downloaded card images.
            download_images: Whether to download card images.
        """
        self.data_dir = data_dir
        self.images_dir = images_dir
        self.download_images = download_images
        self.session = requests.Session()
        self.session.headers.update(
            {
                "User-Agent": "MantimonTCG-CardScraper/1.0 (https://github.com/mantimon-tcg)",
                "Accept": "text/html,application/xhtml+xml",
            }
        )
        self.errors: list[dict[str, Any]] = []
        self.image_stats = {"downloaded": 0, "skipped": 0, "failed": 0}

    def fetch_page(self, url: str) -> BeautifulSoup | None:
        """Fetch a page with retry logic.

        Args:
            url: URL to fetch.

        Returns:
            BeautifulSoup object or None if all retries failed.
        """
        for attempt in range(MAX_RETRIES):
            try:
                response = self.session.get(url, timeout=30)
                response.raise_for_status()
                return BeautifulSoup(response.text, "html.parser")
            except requests.RequestException as e:
                logger.warning(f"Attempt {attempt + 1}/{MAX_RETRIES} failed for {url}: {e}")
                if attempt < MAX_RETRIES - 1:
                    time.sleep(RETRY_DELAY)

        self.errors.append({"url": url, "error": "Max retries exceeded"})
        return None

    def download_image(self, card: Card) -> str | None:
        """Download a card image and save it locally.

        Args:
            card: Card object with image_url set.

        Returns:
            Relative path to the saved image, or None if download failed.
        """
        if not card.image_url:
            return None

        # Create directory structure: images/{set_code}/
        set_dir = self.images_dir / card.set_code
        set_dir.mkdir(parents=True, exist_ok=True)

        # Determine file extension from URL
        url_path = card.image_url.split("?")[0]  # Remove query params
        ext = Path(url_path).suffix or ".webp"

        # Generate filename: {number:03d}-{name}{ext}
        url_name = card.id.split("-", 2)[2]  # Get name part from ID
        filename = f"{card.card_number:03d}-{url_name}{ext}"
        filepath = set_dir / filename
        relative_path = f"{card.set_code}/{filename}"

        # Skip if already downloaded
        if filepath.exists():
            logger.debug(f"Image exists, skipping: {relative_path}")
            self.image_stats["skipped"] += 1
            return relative_path

        # Download the image with appropriate headers
        image_headers = {
            "Accept": "image/webp,image/apng,image/*,*/*;q=0.8",
            "Referer": "https://www.pokemon-zone.com/",
        }
        for attempt in range(MAX_RETRIES):
            try:
                response = self.session.get(card.image_url, timeout=30, headers=image_headers)
                response.raise_for_status()

                # Verify it's an image
                content_type = response.headers.get("content-type", "")
                if not content_type.startswith("image/"):
                    logger.warning(f"Not an image: {content_type} for {card.image_url}")
                    self.image_stats["failed"] += 1
                    return None

                # Save the image
                with open(filepath, "wb") as f:
                    f.write(response.content)

                logger.debug(f"Downloaded: {relative_path} ({len(response.content)} bytes)")
                self.image_stats["downloaded"] += 1
                return relative_path

            except requests.RequestException as e:
                logger.warning(f"Image download attempt {attempt + 1}/{MAX_RETRIES} failed: {e}")
                if attempt < MAX_RETRIES - 1:
                    time.sleep(RETRY_DELAY)

        logger.error(f"Failed to download image for {card.id}")
        self.image_stats["failed"] += 1
        return None

    def get_card_urls_for_set(self, set_code: str) -> list[tuple[int, str, str]]:
        """Get all card URLs for a set.

        Args:
            set_code: Set code (e.g., "a1", "a1a").

        Returns:
            List of (card_number, card_name, url) tuples.
        """
        set_url = f"{BASE_URL}/sets/{set_code}/"
        logger.info(f"Fetching set page: {set_url}")

        soup = self.fetch_page(set_url)
        if not soup:
            logger.error(f"Failed to fetch set page for {set_code}")
            return []

        cards: list[tuple[int, str, str]] = []
        # Find all card links - they follow pattern /cards/{set}/{number}/{name}/
        pattern = re.compile(rf"^/cards/{set_code}/(\d+)/([^/]+)/$")

        for link in soup.find_all("a", href=pattern):
            href = link.get("href", "")
            match = pattern.match(href)
            if match:
                card_number = int(match.group(1))
                card_name = match.group(2)
                # Avoid duplicates (page may have multiple links to same card)
                card_tuple = (card_number, card_name, f"{BASE_URL}{href}")
                if card_tuple not in cards:
                    cards.append(card_tuple)

        # Sort by card number
        cards.sort(key=lambda x: x[0])
        logger.info(f"Found {len(cards)} cards in set {set_code}")

        return cards

    def parse_energy_type(self, element: Tag | None) -> str | None:
        """Extract energy type from an element containing an energy icon.

        Args:
            element: BeautifulSoup element that may contain energy icons.

        Returns:
            Energy type string or None.
        """
        if not element:
            return None

        for icon in element.find_all("span", class_=re.compile(r"energy-icon--type-")):
            for cls in icon.get("class", []):
                if cls in ENERGY_TYPES:
                    return ENERGY_TYPES[cls]

        return None

    def parse_attack(self, attack_row: Tag) -> Attack | None:
        """Parse an attack from an attack-summary-row element.

        Args:
            attack_row: BeautifulSoup element for the attack row.

        Returns:
            Attack object or None if parsing failed.
        """
        try:
            # Get attack name
            name_elem = attack_row.find(class_="attack-summary-row__name")
            if not name_elem:
                return None
            name = name_elem.get_text(strip=True)

            # Get energy cost
            cost: list[str] = []
            costs_elem = attack_row.find(class_="attack-summary-row__costs")
            if costs_elem:
                for cost_icon in costs_elem.find_all("span", class_=re.compile(r"energy-icon")):
                    for cls in cost_icon.get("class", []):
                        if cls in ENERGY_TYPES:
                            cost.append(ENERGY_TYPES[cls])

            # Get damage
            damage: int | None = None
            damage_modifier: str | None = None
            damage_elem = attack_row.find(class_="attack-summary-row__damage")
            if damage_elem:
                damage_text = damage_elem.get_text(strip=True)
                # Parse damage like "60", "50+", "100x"
                match = re.match(r"(\d+)([+x])?", damage_text)
                if match:
                    damage = int(match.group(1))
                    damage_modifier = match.group(2)

            # Get effect text (using extract_effect_text to preserve energy types)
            effect_text: str | None = None
            footer_elem = attack_row.find(class_="attack-summary-row__footer")
            if footer_elem:
                effect_text = extract_effect_text(footer_elem)

            return Attack(
                name=name,
                cost=cost,
                damage=damage,
                damage_modifier=damage_modifier,
                effect_text=effect_text,
            )

        except Exception as e:
            logger.warning(f"Failed to parse attack: {e}")
            return None
    def parse_ability(self, ability_row: Tag) -> Ability | None:
        """Parse an ability from an ability-summary-row element.

        Args:
            ability_row: BeautifulSoup element for the ability row.

        Returns:
            Ability object or None if parsing failed.
        """
        try:
            # Get ability name (text after "Ability" badge)
            name_elem = ability_row.find(class_="ability-summary-row__name")
            if not name_elem:
                return None

            # Remove the "Ability" badge text to get just the name
            name_text = name_elem.get_text(strip=True)
            name = re.sub(r"^Ability\s*", "", name_text)

            # Get effect text (using extract_effect_text to preserve energy types).
            # extract_effect_text may return None for an empty description, so fall
            # back to "" to satisfy Ability.effect_text's str type.
            desc_elem = ability_row.find(class_="ability-summary-row__description")
            effect_text = (extract_effect_text(desc_elem) if desc_elem else None) or ""

            return Ability(name=name, effect_text=effect_text)

        except Exception as e:
            logger.warning(f"Failed to parse ability: {e}")
            return None
    def parse_card_page(self, soup: BeautifulSoup, url: str, set_code: str) -> Card | None:
        """Parse a card page into a Card object.

        Args:
            soup: BeautifulSoup object of the card page.
            url: URL of the card page (for error logging).
            set_code: Set code for this card.

        Returns:
            Card object or None if parsing failed.
        """
        try:
            # Extract card number and name from URL
            match = re.search(rf"/cards/{set_code}/(\d+)/([^/]+)/", url)
            if not match:
                logger.error(f"Could not parse card URL: {url}")
                return None

            card_number = int(match.group(1))
            url_name = match.group(2)

            # Get card name from page
            name_elem = soup.find("h1")
            if not name_elem:
                logger.error(f"Could not find card name on page: {url}")
                return None
            name = name_elem.get_text(strip=True)

            # Determine card type - look for specific card type indicators
            # Trainers have "Trainer | Supporter/Item/Stadium" text
            # Energy cards have specific energy type text
            # Pokemon cards have "Pokémon | Basic/Stage 1/Stage 2" text
            card_type = "pokemon"  # Default
            trainer_type: str | None = None

            # Check for Fossil cards first (they appear as Pokemon on the site but are Items)
            if name in FOSSIL_CARDS:
                card_type = "trainer"
                trainer_type = "item"
                logger.info(f"Detected fossil card as Trainer/Item: {name}")
            # Check for Trainer (more specific match)
            elif soup.find(
                string=re.compile(r"Trainer\s*\|\s*(Supporter|Item|Stadium)", re.IGNORECASE)
            ):
                card_type = "trainer"
            # Check for Energy
            elif soup.find(string=re.compile(r"Energy\s*$", re.IGNORECASE)):
                card_type = "energy"

            # Create card ID
            card_id = f"{set_code}-{card_number:03d}-{url_name}"

            # Get rarity from CSS class (rarity-icon--rarity-X)
            rarity = "Unknown"
            rarity_icon = soup.find("span", class_="rarity-icon")
            if rarity_icon:
                for cls in rarity_icon.get("class", []):
                    if "rarity-icon--rarity-" in cls:
                        rarity_code = cls.replace("rarity-icon--rarity-", "")
                        rarity = RARITY_CODES.get(rarity_code, rarity_code)
                        break

            # Get card image URL (first image in card-detail__card section)
            image_url: str | None = None
            card_section = soup.find("div", class_="card-detail__card")
            if card_section:
                img = card_section.find("img")
                if img:
                    image_url = img.get("src")
                    # Remove query params to get full resolution
                    if image_url and "?" in image_url:
                        image_url = image_url.split("?")[0]

            # Initialize card
            # is_ex: Check if name ends with " ex" (case insensitive)
            # This avoids false positives like "Exeggutor"
            is_ex = name.lower().endswith(" ex")

            card = Card(
                id=card_id,
                name=name,
                set_code=set_code,
                set_name=SETS.get(set_code, {}).get("name", set_code),
                card_number=card_number,
                rarity=rarity,
                card_type=card_type,
                image_url=image_url,
                source_url=url,
                is_ex=is_ex,
            )

            if card_type == "pokemon":
                self._parse_pokemon_details(soup, card)
            elif card_type == "trainer":
                # For fossil cards, we already know the trainer_type
                if trainer_type:
                    card.stage = trainer_type
                self._parse_trainer_details(soup, card)
            elif card_type == "energy":
                self._parse_energy_details(soup, card)

            return card

        except Exception as e:
            logger.error(f"Failed to parse card page {url}: {e}")
            self.errors.append({"url": url, "error": str(e)})
            return None

    def _parse_pokemon_details(self, soup: BeautifulSoup, card: Card) -> None:
        """Parse Pokemon-specific details from the page.

        Args:
            soup: BeautifulSoup object of the card page.
            card: Card object to populate.
        """
        # Get HP
        hp_match = soup.find(string=re.compile(r"HP\s*(\d+)", re.IGNORECASE))
        if hp_match:
            hp_num = re.search(r"(\d+)", str(hp_match))
            if hp_num:
                card.hp = int(hp_num.group(1))
        else:
            # Try finding HP in the stat display
            hp_elem = soup.find("span", string="HP")
            if hp_elem:
                hp_value = hp_elem.find_next("span")
                if hp_value:
                    hp_text = hp_value.get_text(strip=True)
                    hp_num = re.search(r"(\d+)", hp_text)
                    if hp_num:
                        card.hp = int(hp_num.group(1))

        # Get Pokemon type from first energy icon NOT in an attack row
        # The card's type icon is in the header area, not in attack-summary-row__cost
        for icon in soup.find_all("span", class_=re.compile(r"energy-icon--type-")):
            parent = icon.parent
            parent_classes = parent.get("class", []) if parent else []
            # Skip if this is an attack cost icon
            if "attack-summary-row__cost" not in parent_classes:
                for cls in icon.get("class", []):
                    if cls in ENERGY_TYPES:
                        card.pokemon_type = ENERGY_TYPES[cls]
                        break
            if card.pokemon_type:
                break

        # Get stage and evolution info
        stage_text = soup.find(string=re.compile(r"Basic|Stage 1|Stage 2", re.IGNORECASE))
        if stage_text:
            stage_lower = str(stage_text).lower()
            if "stage 2" in stage_lower:
                card.stage = "stage_2"
            elif "stage 1" in stage_lower:
                card.stage = "stage_1"
            elif "basic" in stage_lower:
                card.stage = "basic"

        # Get evolves_from
        evolves_match = soup.find(string=re.compile(r"Evolves from", re.IGNORECASE))
        if evolves_match:
            # Try to find the Pokemon name link nearby
            parent = evolves_match.parent if hasattr(evolves_match, "parent") else None
            if parent:
                link = parent.find("a")
                if link:
                    card.evolves_from = link.get_text(strip=True)

        # Get abilities
        for ability_row in soup.find_all(class_="ability-summary-row"):
            ability = self.parse_ability(ability_row)
            if ability:
                card.abilities.append(ability)

        # Get attacks
        for attack_row in soup.find_all(class_="attack-summary-row"):
            attack = self.parse_attack(attack_row)
            if attack:
                card.attacks.append(attack)

        # Get weakness
        weakness_section = soup.find(string=re.compile(r"Weakness", re.IGNORECASE))
        if weakness_section:
            parent = weakness_section.parent
            if parent:
                card.weakness_type = self.parse_energy_type(parent.parent)
                # Look for +20 pattern
                value_match = re.search(
                    r"\+(\d+)", parent.parent.get_text() if parent.parent else ""
                )
                if value_match:
                    card.weakness_value = int(value_match.group(1))

        # Get retreat cost (count colorless energy icons in retreat section)
        retreat_section = soup.find(string=re.compile(r"Retreat", re.IGNORECASE))
        if retreat_section:
            parent = retreat_section.parent
            if parent and parent.parent:
                retreat_icons = parent.parent.find_all(
                    "span", class_=re.compile(r"energy-icon--type-colorless")
                )
                card.retreat_cost = len(retreat_icons)

        # Get illustrator
        illustrator_match = soup.find(string=re.compile(r"Illustrated by", re.IGNORECASE))
        if illustrator_match:
            card.illustrator = str(illustrator_match).replace("Illustrated by", "").strip()

        # Get flavor text (Pokemon description)
        # This is usually in a paragraph after the attacks section
        for p in soup.find_all("p"):
            text = p.get_text(strip=True)
            if text and len(text) > 50 and "Illustrated" not in text and "Artwork" not in text:
                card.flavor_text = text
                break
    def _parse_trainer_details(self, soup: BeautifulSoup, card: Card) -> None:
        """Parse Trainer-specific details from the page.

        Args:
            soup: BeautifulSoup object of the card page.
            card: Card object to populate.
        """
        # Get trainer type (Item, Supporter, Stadium) from "Trainer | Type" text
        type_match = soup.find(
            string=re.compile(r"Trainer\s*\|\s*(Supporter|Item|Stadium)", re.IGNORECASE)
        )
        if type_match:
            match = re.search(r"(Supporter|Item|Stadium)", str(type_match), re.IGNORECASE)
            if match:
                card.stage = match.group(1).lower()

        # Get effect text - look for card-detail__content-body
        # Using extract_effect_text to preserve energy type references
        content_body = soup.find("div", class_="card-detail__content-body")
        if content_body:
            # Extract effect text with proper energy type handling
            effect_text = extract_effect_text(content_body)
            # Remove illustrator info at the end
            if effect_text and "Illustrated by" in effect_text:
                effect_text = effect_text.split("Illustrated by")[0].strip()
            if effect_text:
                card.flavor_text = effect_text
        else:
            # Fallback: look for any paragraph with effect-like text
            for elem in soup.find_all("p"):
                effect_text = extract_effect_text(elem)
                if (
                    effect_text
                    and len(effect_text) > 20
                    and "Illustrated" not in effect_text
                    and "Artwork" not in effect_text
                    and "Pokemon Zone" not in effect_text
                    and "unofficial" not in effect_text.lower()
                ):
                    card.flavor_text = effect_text
                    break

        # Get illustrator
        illustrator_match = soup.find(string=re.compile(r"Illustrated by", re.IGNORECASE))
        if illustrator_match:
            card.illustrator = str(illustrator_match).replace("Illustrated by", "").strip()
    def _parse_energy_details(self, soup: BeautifulSoup, card: Card) -> None:
        """Parse Energy-specific details from the page.

        Args:
            soup: BeautifulSoup object of the card page.
            card: Card object to populate.
        """
        # Get energy type from the page
        card.pokemon_type = self.parse_energy_type(soup)

        # Get illustrator
        illustrator_match = soup.find(string=re.compile(r"Illustrated by", re.IGNORECASE))
        if illustrator_match:
            card.illustrator = str(illustrator_match).replace("Illustrated by", "").strip()

    def scrape_card(self, card_id: str) -> Card | None:
        """Scrape a single card by ID.

        Args:
            card_id: Card ID in format "{set}-{number}-{name}" (e.g., "a1-132-gardevoir").

        Returns:
            Card object or None if scraping failed.
        """
        # Parse card ID
        match = re.match(r"([a-z0-9]+)-(\d+)-(.+)", card_id)
        if not match:
            logger.error(f"Invalid card ID format: {card_id}")
            return None

        set_code = match.group(1)
        card_number = int(match.group(2))
        card_name = match.group(3)

        url = f"{BASE_URL}/cards/{set_code}/{card_number}/{card_name}/"
        logger.info(f"Scraping card: {url}")

        soup = self.fetch_page(url)
        if not soup:
            return None

        card = self.parse_card_page(soup, url, set_code)

        # Download image if enabled
        if card and self.download_images and card.image_url:
            card.image_file = self.download_image(card)

        return card

    def scrape_set(self, set_code: str, limit: int | None = None) -> list[Card]:
        """Scrape all cards from a set.

        Args:
            set_code: Set code (e.g., "a1", "a1a").
            limit: Maximum number of cards to scrape (for testing).

        Returns:
            List of Card objects.
        """
        if set_code not in SETS:
            logger.error(f"Unknown set code: {set_code}")
            return []

        card_urls = self.get_card_urls_for_set(set_code)
        if limit:
            card_urls = card_urls[:limit]

        cards: list[Card] = []
        total = len(card_urls)

        for i, (card_number, card_name, url) in enumerate(card_urls, 1):
            logger.info(f"[{i}/{total}] Scraping: {card_name} (#{card_number})")

            soup = self.fetch_page(url)
            if soup:
                card = self.parse_card_page(soup, url, set_code)
                if card:
                    # Download image if enabled
                    if self.download_images and card.image_url:
                        card.image_file = self.download_image(card)
                        time.sleep(IMAGE_REQUEST_DELAY)

                    cards.append(card)
                    self.save_card(card)

            # Rate limiting
            if i < total:
                time.sleep(REQUEST_DELAY)

        return cards

    def save_card(self, card: Card) -> Path:
        """Save a card to a JSON file.

        Args:
            card: Card object to save.

        Returns:
            Path to the saved file.
        """
        set_dir = self.data_dir / card.set_code
        set_dir.mkdir(parents=True, exist_ok=True)

        # Generate filename: {number:03d}-{name}.json
        url_name = card.id.split("-", 2)[2]  # Get name part from ID
        filename = f"{card.card_number:03d}-{url_name}.json"
        filepath = set_dir / filename

        with open(filepath, "w", encoding="utf-8") as f:
            json.dump(card.to_dict(), f, indent=2, ensure_ascii=False)

        logger.debug(f"Saved: {filepath}")
        return filepath

    def generate_index(self) -> Path:
        """Generate the combined index file from existing card files.

        Returns:
            Path to the index file.
        """
        logger.info("Generating index...")

        index: dict[str, Any] = {
            "generated_at": datetime.now(UTC).isoformat(),
            "schema_version": "1.0",
            "sets": {},
            "cards": [],
            "total_cards": 0,
        }

        for set_code in SETS:
            set_dir = self.data_dir / set_code
            if not set_dir.exists():
                continue

            card_files = sorted(set_dir.glob("*.json"))
            index["sets"][set_code] = {
                "name": SETS[set_code]["name"],
                "card_count": len(card_files),
            }

            for card_file in card_files:
                relative_path = f"{set_code}/{card_file.name}"
                with open(card_file, encoding="utf-8") as f:
                    card_data = json.load(f)

                index["cards"].append(
                    {
                        "id": card_data["id"],
                        "name": card_data["name"],
                        "set_code": set_code,
                        "card_number": card_data["card_number"],
                        "file": relative_path,
                    }
                )

        index["total_cards"] = len(index["cards"])

        # Sort cards by set and number
        index["cards"].sort(key=lambda x: (x["set_code"], x["card_number"]))

        index_path = self.data_dir / "_index.json"
        with open(index_path, "w", encoding="utf-8") as f:
            json.dump(index, f, indent=2, ensure_ascii=False)

        logger.info(f"Index generated: {index_path} ({index['total_cards']} cards)")
        return index_path

    def save_errors(self) -> Path | None:
        """Save error log if there were any errors.

        Returns:
            Path to the error log file, or None if no errors.
        """
        if not self.errors:
            return None

        error_log = self.data_dir / "_errors.log"
        with open(error_log, "w", encoding="utf-8") as f:
            f.write(f"Scraping errors - {datetime.now(UTC).isoformat()}\n")
            f.write("=" * 60 + "\n\n")
            for error in self.errors:
                f.write(f"URL: {error['url']}\n")
                f.write(f"Error: {error['error']}\n\n")

        logger.warning(f"Errors logged to: {error_log}")
        return error_log

    def download_images_for_existing_cards(self, set_code: str | None = None) -> int:
        """Download images for cards that already have JSON files.

        This is useful for downloading images separately from scraping,
        or for retrying failed image downloads.

        Args:
            set_code: Optional set code to limit downloads to a specific set.

        Returns:
            Number of images downloaded.
        """
        sets_to_process = [set_code] if set_code else list(SETS.keys())
        total_downloaded = 0

        for sc in sets_to_process:
            set_dir = self.data_dir / sc
            if not set_dir.exists():
                logger.warning(f"No card data found for set {sc}")
                continue

            card_files = sorted(set_dir.glob("*.json"))
            logger.info(f"Processing {len(card_files)} cards from set {sc}")

            for i, card_file in enumerate(card_files, 1):
                with open(card_file, encoding="utf-8") as f:
                    card_data = json.load(f)

                image_url = card_data.get("image_url")
                existing_file = card_data.get("image_file")

                if not image_url:
                    continue

                # Create a minimal Card object for download
                card = Card(
                    id=card_data["id"],
                    name=card_data["name"],
                    set_code=card_data["set_code"],
                    set_name=card_data["set_name"],
                    card_number=card_data["card_number"],
                    rarity=card_data["rarity"],
                    card_type=card_data["card_type"],
                    image_url=image_url,
                )

                # Check if image already exists
                if existing_file:
                    image_path = self.images_dir / existing_file
                    if image_path.exists():
                        self.image_stats["skipped"] += 1
                        continue

                logger.info(f"[{i}/{len(card_files)}] Downloading: {card.name}")
                image_file = self.download_image(card)

                if image_file:
                    # Update the JSON file with the image path
                    card_data["image_file"] = image_file
                    with open(card_file, "w", encoding="utf-8") as f:
                        json.dump(card_data, f, indent=2, ensure_ascii=False)
                    total_downloaded += 1

                time.sleep(IMAGE_REQUEST_DELAY)

        return total_downloaded

    def log_image_stats(self) -> None:
        """Log image download statistics."""
        stats = self.image_stats
        total = stats["downloaded"] + stats["skipped"] + stats["failed"]
        if total > 0:
            logger.info(
                f"Images: {stats['downloaded']} downloaded, "
                f"{stats['skipped']} skipped, {stats['failed']} failed"
            )
# =============================================================================
# CLI
# =============================================================================


def main() -> int:
    """Main entry point for the scraper CLI."""
    parser = argparse.ArgumentParser(
        description="Scrape Pokemon TCG Pocket card data from pokemon-zone.com",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Scrape entire set
  uv run python scripts/scrape_pokemon_pocket.py --set a1

  # Scrape with limit (for testing)
  uv run python scripts/scrape_pokemon_pocket.py --set a1 --limit 5

  # Scrape set with images
  uv run python scripts/scrape_pokemon_pocket.py --set a1 --images

  # Download images for existing card data (all sets, or one set)
  uv run python scripts/scrape_pokemon_pocket.py --download-images
  uv run python scripts/scrape_pokemon_pocket.py --download-images a1

  # Scrape single card by ID
  uv run python scripts/scrape_pokemon_pocket.py --card a1-132-gardevoir

  # Regenerate index from existing card files
  uv run python scripts/scrape_pokemon_pocket.py --reindex
""",
    )

    # Main action group - what operation to perform
    action_group = parser.add_mutually_exclusive_group(required=True)
    action_group.add_argument(
        "--set",
        choices=list(SETS.keys()),
        help="Scrape all cards from a set",
    )
    action_group.add_argument(
        "--card",
        type=str,
        help="Scrape a single card by ID (e.g., a1-132-gardevoir)",
    )
    action_group.add_argument(
        "--reindex",
        action="store_true",
        help="Regenerate index from existing card files",
    )
    action_group.add_argument(
        "--download-images",
        choices=list(SETS.keys()) + ["all"],
        nargs="?",
        const="all",
        help="Download images for existing card data (specify set or 'all')",
    )

    parser.add_argument(
        "--limit",
        type=int,
        help="Maximum number of cards to scrape (for testing)",
    )
    parser.add_argument(
        "--images",
        action="store_true",
        help="Download card images while scraping",
    )

    args = parser.parse_args()

    scraper = PokemonPocketScraper(download_images=args.images)

    if args.reindex:
        scraper.generate_index()
        return 0

    if args.download_images:
        # Download images for existing cards
        set_code = None if args.download_images == "all" else args.download_images
        set_info = f"set {set_code}" if set_code else "all sets"
        logger.info(f"Downloading images for existing card data ({set_info})...")
        scraper.download_images = True  # Enable downloads
        downloaded = scraper.download_images_for_existing_cards(set_code)
        scraper.log_image_stats()
        logger.info(f"Image download complete: {downloaded} new images")
        return 0

    if args.card:
        card = scraper.scrape_card(args.card)
        if card:
            scraper.save_card(card)
            scraper.generate_index()
            scraper.log_image_stats()
            logger.info(f"Successfully scraped: {card.name}")
            return 0
        else:
            logger.error(f"Failed to scrape card: {args.card}")
            return 1

    if args.set:
        cards = scraper.scrape_set(args.set, limit=args.limit)
        scraper.generate_index()
        scraper.save_errors()
        scraper.log_image_stats()

        success_count = len(cards)
        error_count = len(scraper.errors)
        total = success_count + error_count

        logger.info(f"Scraping complete: {success_count}/{total} cards succeeded")
        if error_count > 0:
            logger.warning(f"{error_count} errors occurred (see _errors.log)")
            return 1
        return 0

    return 1


if __name__ == "__main__":
    sys.exit(main())