#!/usr/bin/env python
"""Scrape Pokemon TCG Pocket card data from pokemon-zone.com.

This script fetches card data from the Genetic Apex (A1) and Mythical Island
(A1a) sets and saves each card as an individual JSON file for use in the
Mantimon TCG game engine.

Usage:
    # Scrape entire set
    uv run python scripts/scrape_pokemon_pocket.py --set a1

    # Scrape with limit (for testing)
    uv run python scripts/scrape_pokemon_pocket.py --set a1 --limit 5

    # Scrape single card by ID
    uv run python scripts/scrape_pokemon_pocket.py --card a1-132-gardevoir

    # Regenerate index from existing card files
    uv run python scripts/scrape_pokemon_pocket.py --reindex

Output:
    - Individual card files: data/raw/{set}/{number}-{name}.json
    - Combined index: data/raw/_index.json
    - Error log: data/raw/_errors.log
"""

import argparse
import json
import logging
import re
import sys
import time
from dataclasses import dataclass, field
from datetime import UTC, datetime
from pathlib import Path
from typing import Any

import requests
from bs4 import BeautifulSoup, Tag

# =============================================================================
# Configuration
# =============================================================================

BASE_URL = "https://www.pokemon-zone.com"
DATA_DIR = Path(__file__).parent.parent / "data" / "raw"
IMAGES_DIR = Path(__file__).parent.parent / "data" / "images"
REQUEST_DELAY = 1.5  # seconds between requests
IMAGE_REQUEST_DELAY = 0.5  # faster for images (different server)
MAX_RETRIES = 3
RETRY_DELAY = 5  # seconds

# Set info for validation and metadata
SETS = {
    "a1": {"name": "Genetic Apex", "expected_cards": 286},
    "a1a": {"name": "Mythical Island", "expected_cards": 86},
}

# Energy type mapping from CSS classes (for attack cost icons)
ENERGY_TYPES = {
    "energy-icon--type-grass": "grass",
    "energy-icon--type-fire": "fire",
    "energy-icon--type-water": "water",
    "energy-icon--type-lightning": "lightning",
    "energy-icon--type-psychic": "psychic",
    "energy-icon--type-fighting": "fighting",
    "energy-icon--type-darkness": "darkness",
    "energy-icon--type-metal": "metal",
    "energy-icon--type-colorless": "colorless",
    "energy-icon--type-dragon": "dragon",
}

# Energy text mapping from CSS classes (for inline text references)
# These appear in effect text like "Discard a [Fire] Energy" - the icon span
# is empty, so naive text extraction yields "Discard a Energy"
ENERGY_TEXT_TYPES = {
    "energy-text--type-grass": "Grass",
    "energy-text--type-fire": "Fire",
    "energy-text--type-water": "Water",
    "energy-text--type-lightning": "Lightning",
    "energy-text--type-psychic": "Psychic",
    "energy-text--type-fighting": "Fighting",
    "energy-text--type-darkness": "Darkness",
    "energy-text--type-metal": "Metal",
    "energy-text--type-colorless": "Colorless",
    "energy-text--type-dragon": "Dragon",
}

# Rarity code mapping from CSS classes (rarity-icon--rarity-X)
RARITY_CODES = {
    "C": "Common",
    "U": "Uncommon",
    "R": "Rare",
    "RR": "Double Rare",
    "AR": "Art Rare",
    "SAR": "Special Art Rare",
    "UR": "Ultra Rare",
    "IM": "Immersive",
    "S": "Shiny",
    "CR": "Crown Rare",
}

# Fossil cards that are Trainer/Item cards, not Pokemon
# These are often misclassified because they have Pokemon-like layouts
# on the source site
FOSSIL_CARDS = {
    "Helix Fossil",
    "Dome Fossil",
    "Old Amber",
}

# Text artifact patterns to fix (caused by stripped energy icons)
# Format: (pattern, replacement)
# Energy icons render as empty strings, merging adjacent text
TEXT_ARTIFACT_FIXES = [
    # Energy-related artifacts
    (r"\baEnergy\b", "an Energy"),
    (r"\bofEnergy\b", "of Energy"),
    (r"\bextraEnergy\b", "extra Energy"),
    (r"\battachedEnergy\b", "attached Energy"),
    (r"\banyEnergy\b", "any Energy"),
Energy"), (r"(\d+)Energy\b", r"\1 Energy"), # "2Energy" -> "2 Energy" (r"(\d+)-HP\b", r"\1 HP"), # "40-HP" -> "40 HP" # Pokemon-related artifacts (r"\bBasicPokémon\b", "Basic Pokémon"), (r"\bBenchedPokémon\b", "Benched Pokémon"), (r"\bthePokémon\b", "the Pokémon"), (r"\bthisPokémon\b", "this Pokémon"), (r"\byourPokémon\b", "your Pokémon"), (r"\bActivePokémon\b", "Active Pokémon"), (r"\bDefendingPokémon\b", "Defending Pokémon"), (r"\bopponent'sPokémon\b", "opponent's Pokémon"), (r"\bOpponent'sPokémon\b", "Opponent's Pokémon"), # Other common artifacts (r"\bthatPokémon\b", "that Pokémon"), (r"\beachPokémon\b", "each Pokémon"), (r"\baPokémon\b", "a Pokémon"), ] def replace_energy_text_spans(element: Tag) -> None: """Replace energy-text spans with their text representation in-place. The source website uses to render inline energy type icons in effect text. When BeautifulSoup extracts text with get_text(), these empty spans disappear, merging adjacent words. This function finds all such spans and replaces them with their text equivalent (e.g., "Fire") BEFORE text extraction, preserving the energy type information. Args: element: BeautifulSoup element to process in-place. The element is modified directly - energy-text spans are replaced with NavigableString text. Example: >>> soup = BeautifulSoup('
Discard a Energy
') >>> div = soup.find('div') >>> replace_energy_text_spans(div) >>> div.get_text() 'Discard a Fire Energy' """ from bs4 import NavigableString # Find all energy-text spans (they may have multiple classes) for span in element.find_all("span", class_=re.compile(r"energy-text--type-")): energy_type = None for cls in span.get("class", []): if cls in ENERGY_TEXT_TYPES: energy_type = ENERGY_TEXT_TYPES[cls] break if energy_type: # Replace the span with a text node span.replace_with(NavigableString(energy_type)) def extract_effect_text(element: Tag) -> str | None: """Extract effect text from an element, properly handling energy-text spans. This is the correct way to extract effect text that may contain energy type icons. It replaces the energy-text spans with readable text before extraction. Args: element: BeautifulSoup element containing effect text. Returns: Cleaned effect text, or None if empty. """ # Work on a deep copy to avoid modifying the original soup # BeautifulSoup's copy.copy is shallow - we need to re-parse from bs4 import BeautifulSoup as BS element_copy = BS(str(element), "html.parser") replace_energy_text_spans(element_copy) # Use separator=" " to properly join text nodes with spaces # strip=True removes leading/trailing whitespace from each text node raw_text = element_copy.get_text(separator=" ", strip=True) # Normalize multiple spaces to single spaces raw_text = re.sub(r"\s+", " ", raw_text).strip() # Still apply artifact fixes as fallback for any edge cases return clean_effect_text(raw_text) def clean_effect_text(text: str | None) -> str | None: """Clean scraped effect text by fixing common artifacts. This is a fallback for any text artifacts not caught by energy-text span replacement. It handles edge cases like merged words. Args: text: The raw effect text from scraping. Returns: Cleaned text with artifacts fixed, or None if input was None/empty. 
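
    Example (illustrative doctest; the input string is a made-up artifact of
    the kind TEXT_ARTIFACT_FIXES targets):
        >>> clean_effect_text("Discard aEnergy from this Pokémon.")
        'Discard an Energy from this Pokémon.'
        >>> clean_effect_text(None) is None
        True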
""" if not text: return None result = text for pattern, replacement in TEXT_ARTIFACT_FIXES: result = re.sub(pattern, replacement, result) return result if result else None # ============================================================================= # Logging Setup # ============================================================================= logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s", datefmt="%H:%M:%S", ) logger = logging.getLogger(__name__) # ============================================================================= # Data Classes # ============================================================================= @dataclass class Attack: """A Pokemon's attack.""" name: str cost: list[str] damage: int | None damage_modifier: str | None # "+", "x", or None effect_text: str | None effect_id: str | None = None # To be mapped later def to_dict(self) -> dict[str, Any]: """Convert to dictionary for JSON serialization.""" return { "name": self.name, "cost": self.cost, "damage": self.damage, "damage_modifier": self.damage_modifier, "effect_text": self.effect_text, "effect_id": self.effect_id, } @dataclass class Ability: """A Pokemon's ability.""" name: str effect_text: str effect_id: str | None = None # To be mapped later def to_dict(self) -> dict[str, Any]: """Convert to dictionary for JSON serialization.""" return { "name": self.name, "effect_text": self.effect_text, "effect_id": self.effect_id, } @dataclass class Card: """Complete card data.""" id: str name: str set_code: str set_name: str card_number: int rarity: str card_type: str # "pokemon", "trainer", "energy" image_url: str | None = None # URL to card image for offline caching image_file: str | None = None # Local path to downloaded image (relative to images dir) hp: int | None = None pokemon_type: str | None = None stage: str | None = None # "basic", "stage_1", "stage_2" evolves_from: str | None = None is_ex: bool = False abilities: list[Ability] = field(default_factory=list) attacks: list[Attack] = field(default_factory=list) weakness_type: str | None = None weakness_value: int | None = None resistance_type: str | None = None resistance_value: int | None = None retreat_cost: int = 0 flavor_text: str | None = None illustrator: str | None = None source_url: str = "" def to_dict(self) -> dict[str, Any]: """Convert to dictionary for JSON serialization.""" data: dict[str, Any] = { "id": self.id, "name": self.name, "set_code": self.set_code, "set_name": self.set_name, "card_number": self.card_number, "rarity": self.rarity, "card_type": self.card_type, "image_url": self.image_url, "image_file": self.image_file, "source_url": self.source_url, } if self.card_type == "pokemon": data.update( { "hp": self.hp, "pokemon_type": self.pokemon_type, "stage": self.stage, "evolves_from": self.evolves_from, "is_ex": self.is_ex, "abilities": [a.to_dict() for a in self.abilities], "attacks": [a.to_dict() for a in self.attacks], "weakness": ( {"type": self.weakness_type, "value": self.weakness_value} if self.weakness_type else None ), "resistance": ( {"type": self.resistance_type, "value": self.resistance_value} if self.resistance_type else None ), "retreat_cost": self.retreat_cost, "flavor_text": self.flavor_text, "illustrator": self.illustrator, } ) elif self.card_type == "trainer": data.update( { "trainer_type": self.stage, # Reusing stage field for trainer type "effect_text": self.flavor_text, # Trainer effect "illustrator": self.illustrator, } ) elif self.card_type == "energy": data.update( { 
"energy_type": self.pokemon_type, "illustrator": self.illustrator, } ) return data # ============================================================================= # Scraper Class # ============================================================================= class PokemonPocketScraper: """Scraper for Pokemon TCG Pocket card data from pokemon-zone.com.""" def __init__( self, data_dir: Path = DATA_DIR, images_dir: Path = IMAGES_DIR, download_images: bool = False, ): """Initialize the scraper. Args: data_dir: Directory to save card data files. images_dir: Directory to save downloaded card images. download_images: Whether to download card images. """ self.data_dir = data_dir self.images_dir = images_dir self.download_images = download_images self.session = requests.Session() self.session.headers.update( { "User-Agent": "MantimonTCG-CardScraper/1.0 (https://github.com/mantimon-tcg)", "Accept": "text/html,application/xhtml+xml", } ) self.errors: list[dict[str, Any]] = [] self.image_stats = {"downloaded": 0, "skipped": 0, "failed": 0} def fetch_page(self, url: str) -> BeautifulSoup | None: """Fetch a page with retry logic. Args: url: URL to fetch. Returns: BeautifulSoup object or None if all retries failed. """ for attempt in range(MAX_RETRIES): try: response = self.session.get(url, timeout=30) response.raise_for_status() return BeautifulSoup(response.text, "html.parser") except requests.RequestException as e: logger.warning(f"Attempt {attempt + 1}/{MAX_RETRIES} failed for {url}: {e}") if attempt < MAX_RETRIES - 1: time.sleep(RETRY_DELAY) self.errors.append({"url": url, "error": "Max retries exceeded"}) return None def download_image(self, card: Card) -> str | None: """Download a card image and save it locally. Args: card: Card object with image_url set. Returns: Relative path to the saved image, or None if download failed. 
""" if not card.image_url: return None # Create directory structure: images/{set_code}/ set_dir = self.images_dir / card.set_code set_dir.mkdir(parents=True, exist_ok=True) # Determine file extension from URL url_path = card.image_url.split("?")[0] # Remove query params ext = Path(url_path).suffix or ".webp" # Generate filename: {number:03d}-{name}{ext} url_name = card.id.split("-", 2)[2] # Get name part from ID filename = f"{card.card_number:03d}-{url_name}{ext}" filepath = set_dir / filename relative_path = f"{card.set_code}/{filename}" # Skip if already downloaded if filepath.exists(): logger.debug(f"Image exists, skipping: {relative_path}") self.image_stats["skipped"] += 1 return relative_path # Download the image with appropriate headers image_headers = { "Accept": "image/webp,image/apng,image/*,*/*;q=0.8", "Referer": "https://www.pokemon-zone.com/", } for attempt in range(MAX_RETRIES): try: response = self.session.get(card.image_url, timeout=30, headers=image_headers) response.raise_for_status() # Verify it's an image content_type = response.headers.get("content-type", "") if not content_type.startswith("image/"): logger.warning(f"Not an image: {content_type} for {card.image_url}") self.image_stats["failed"] += 1 return None # Save the image with open(filepath, "wb") as f: f.write(response.content) logger.debug(f"Downloaded: {relative_path} ({len(response.content)} bytes)") self.image_stats["downloaded"] += 1 return relative_path except requests.RequestException as e: logger.warning(f"Image download attempt {attempt + 1}/{MAX_RETRIES} failed: {e}") if attempt < MAX_RETRIES - 1: time.sleep(RETRY_DELAY) logger.error(f"Failed to download image for {card.id}") self.image_stats["failed"] += 1 return None def get_card_urls_for_set(self, set_code: str) -> list[tuple[int, str, str]]: """Get all card URLs for a set. Args: set_code: Set code (e.g., "a1", "a1a"). Returns: List of (card_number, card_name, url) tuples. """ set_url = f"{BASE_URL}/sets/{set_code}/" logger.info(f"Fetching set page: {set_url}") soup = self.fetch_page(set_url) if not soup: logger.error(f"Failed to fetch set page for {set_code}") return [] cards: list[tuple[int, str, str]] = [] # Find all card links - they follow pattern /cards/{set}/{number}/{name}/ pattern = re.compile(rf"^/cards/{set_code}/(\d+)/([^/]+)/$") for link in soup.find_all("a", href=pattern): href = link.get("href", "") match = pattern.match(href) if match: card_number = int(match.group(1)) card_name = match.group(2) # Avoid duplicates (page may have multiple links to same card) card_tuple = (card_number, card_name, f"{BASE_URL}{href}") if card_tuple not in cards: cards.append(card_tuple) # Sort by card number cards.sort(key=lambda x: x[0]) logger.info(f"Found {len(cards)} cards in set {set_code}") return cards def parse_energy_type(self, element: Tag | None) -> str | None: """Extract energy type from an element containing an energy icon. Args: element: BeautifulSoup element that may contain energy icons. Returns: Energy type string or None. """ if not element: return None for icon in element.find_all("span", class_=re.compile(r"energy-icon--type-")): for cls in icon.get("class", []): if cls in ENERGY_TYPES: return ENERGY_TYPES[cls] return None def parse_attack(self, attack_row: Tag) -> Attack | None: """Parse an attack from an attack-summary-row element. Args: attack_row: BeautifulSoup element for the attack row. Returns: Attack object or None if parsing failed. 
""" try: # Get attack name name_elem = attack_row.find(class_="attack-summary-row__name") if not name_elem: return None name = name_elem.get_text(strip=True) # Get energy cost cost: list[str] = [] costs_elem = attack_row.find(class_="attack-summary-row__costs") if costs_elem: for cost_icon in costs_elem.find_all("span", class_=re.compile(r"energy-icon")): for cls in cost_icon.get("class", []): if cls in ENERGY_TYPES: cost.append(ENERGY_TYPES[cls]) # Get damage damage: int | None = None damage_modifier: str | None = None damage_elem = attack_row.find(class_="attack-summary-row__damage") if damage_elem: damage_text = damage_elem.get_text(strip=True) # Parse damage like "60", "50+", "100x" match = re.match(r"(\d+)([+x])?", damage_text) if match: damage = int(match.group(1)) damage_modifier = match.group(2) # Get effect text (using extract_effect_text to preserve energy types) effect_text: str | None = None footer_elem = attack_row.find(class_="attack-summary-row__footer") if footer_elem: effect_text = extract_effect_text(footer_elem) return Attack( name=name, cost=cost, damage=damage, damage_modifier=damage_modifier, effect_text=effect_text, ) except Exception as e: logger.warning(f"Failed to parse attack: {e}") return None def parse_ability(self, ability_row: Tag) -> Ability | None: """Parse an ability from an ability-summary-row element. Args: ability_row: BeautifulSoup element for the ability row. Returns: Ability object or None if parsing failed. """ try: # Get ability name (text after "Ability" badge) name_elem = ability_row.find(class_="ability-summary-row__name") if not name_elem: return None # Remove the "Ability" badge text to get just the name name_text = name_elem.get_text(strip=True) name = re.sub(r"^Ability\s*", "", name_text) # Get effect text (using extract_effect_text to preserve energy types) desc_elem = ability_row.find(class_="ability-summary-row__description") effect_text = extract_effect_text(desc_elem) if desc_elem else "" return Ability(name=name, effect_text=effect_text) except Exception as e: logger.warning(f"Failed to parse ability: {e}") return None def parse_card_page(self, soup: BeautifulSoup, url: str, set_code: str) -> Card | None: """Parse a card page into a Card object. Args: soup: BeautifulSoup object of the card page. url: URL of the card page (for error logging). set_code: Set code for this card. Returns: Card object or None if parsing failed. 
""" try: # Extract card number and name from URL match = re.search(rf"/cards/{set_code}/(\d+)/([^/]+)/", url) if not match: logger.error(f"Could not parse card URL: {url}") return None card_number = int(match.group(1)) url_name = match.group(2) # Get card name from page name_elem = soup.find("h1") if not name_elem: logger.error(f"Could not find card name on page: {url}") return None name = name_elem.get_text(strip=True) # Determine card type - look for specific card type indicators # Trainers have "Trainer | Supporter/Item/Stadium" text # Energy cards have specific energy type text # Pokemon cards have "Pokémon | Basic/Stage 1/Stage 2" text card_type = "pokemon" # Default trainer_type: str | None = None # Check for Fossil cards first (they appear as Pokemon on the site but are Items) if name in FOSSIL_CARDS: card_type = "trainer" trainer_type = "item" logger.info(f"Detected fossil card as Trainer/Item: {name}") # Check for Trainer (more specific match) elif soup.find( string=re.compile(r"Trainer\s*\|\s*(Supporter|Item|Stadium)", re.IGNORECASE) ): card_type = "trainer" # Check for Energy elif soup.find(string=re.compile(r"Energy\s*$", re.IGNORECASE)): card_type = "energy" # Create card ID card_id = f"{set_code}-{card_number:03d}-{url_name}" # Get rarity from CSS class (rarity-icon--rarity-X) rarity = "Unknown" rarity_icon = soup.find("span", class_="rarity-icon") if rarity_icon: for cls in rarity_icon.get("class", []): if "rarity-icon--rarity-" in cls: rarity_code = cls.replace("rarity-icon--rarity-", "") rarity = RARITY_CODES.get(rarity_code, rarity_code) break # Get card image URL (first image in card-detail__card section) image_url: str | None = None card_section = soup.find("div", class_="card-detail__card") if card_section: img = card_section.find("img") if img: image_url = img.get("src") # Remove query params to get full resolution if image_url and "?" in image_url: image_url = image_url.split("?")[0] # Initialize card # is_ex: Check if name ends with " ex" (case insensitive) # This avoids false positives like "Exeggutor" is_ex = name.lower().endswith(" ex") card = Card( id=card_id, name=name, set_code=set_code, set_name=SETS.get(set_code, {}).get("name", set_code), card_number=card_number, rarity=rarity, card_type=card_type, image_url=image_url, source_url=url, is_ex=is_ex, ) if card_type == "pokemon": self._parse_pokemon_details(soup, card) elif card_type == "trainer": # For fossil cards, we already know the trainer_type if trainer_type: card.stage = trainer_type self._parse_trainer_details(soup, card) elif card_type == "energy": self._parse_energy_details(soup, card) return card except Exception as e: logger.error(f"Failed to parse card page {url}: {e}") self.errors.append({"url": url, "error": str(e)}) return None def _parse_pokemon_details(self, soup: BeautifulSoup, card: Card) -> None: """Parse Pokemon-specific details from the page. Args: soup: BeautifulSoup object of the card page. card: Card object to populate. 
""" # Get HP hp_match = soup.find(string=re.compile(r"HP\s*(\d+)", re.IGNORECASE)) if hp_match: hp_num = re.search(r"(\d+)", str(hp_match)) if hp_num: card.hp = int(hp_num.group(1)) else: # Try finding HP in the stat display hp_elem = soup.find("span", string="HP") if hp_elem: hp_value = hp_elem.find_next("span") if hp_value: hp_text = hp_value.get_text(strip=True) hp_num = re.search(r"(\d+)", hp_text) if hp_num: card.hp = int(hp_num.group(1)) # Get Pokemon type from first energy icon NOT in an attack row # The card's type icon is in the header area, not in attack-summary-row__cost for icon in soup.find_all("span", class_=re.compile(r"energy-icon--type-")): parent = icon.parent parent_classes = parent.get("class", []) if parent else [] # Skip if this is an attack cost icon if "attack-summary-row__cost" not in parent_classes: for cls in icon.get("class", []): if cls in ENERGY_TYPES: card.pokemon_type = ENERGY_TYPES[cls] break if card.pokemon_type: break # Get stage and evolution info stage_text = soup.find(string=re.compile(r"Basic|Stage 1|Stage 2", re.IGNORECASE)) if stage_text: stage_lower = str(stage_text).lower() if "stage 2" in stage_lower: card.stage = "stage_2" elif "stage 1" in stage_lower: card.stage = "stage_1" elif "basic" in stage_lower: card.stage = "basic" # Get evolves_from evolves_match = soup.find(string=re.compile(r"Evolves from", re.IGNORECASE)) if evolves_match: # Try to find the Pokemon name link nearby parent = evolves_match.parent if hasattr(evolves_match, "parent") else None if parent: link = parent.find("a") if link: card.evolves_from = link.get_text(strip=True) # Get abilities for ability_row in soup.find_all(class_="ability-summary-row"): ability = self.parse_ability(ability_row) if ability: card.abilities.append(ability) # Get attacks for attack_row in soup.find_all(class_="attack-summary-row"): attack = self.parse_attack(attack_row) if attack: card.attacks.append(attack) # Get weakness weakness_section = soup.find(string=re.compile(r"Weakness", re.IGNORECASE)) if weakness_section: parent = weakness_section.parent if parent: card.weakness_type = self.parse_energy_type(parent.parent) # Look for +20 pattern value_match = re.search( r"\+(\d+)", parent.parent.get_text() if parent.parent else "" ) if value_match: card.weakness_value = int(value_match.group(1)) # Get retreat cost (count colorless energy icons in retreat section) retreat_section = soup.find(string=re.compile(r"Retreat", re.IGNORECASE)) if retreat_section: parent = retreat_section.parent if parent and parent.parent: retreat_icons = parent.parent.find_all( "span", class_=re.compile(r"energy-icon--type-colorless") ) card.retreat_cost = len(retreat_icons) # Get illustrator illustrator_match = soup.find(string=re.compile(r"Illustrated by", re.IGNORECASE)) if illustrator_match: card.illustrator = str(illustrator_match).replace("Illustrated by", "").strip() # Get flavor text (Pokemon description) # This is usually in a paragraph after the attacks section for p in soup.find_all("p"): text = p.get_text(strip=True) if text and len(text) > 50 and "Illustrated" not in text and "Artwork" not in text: card.flavor_text = text break def _parse_trainer_details(self, soup: BeautifulSoup, card: Card) -> None: """Parse Trainer-specific details from the page. Args: soup: BeautifulSoup object of the card page. card: Card object to populate. 
""" # Get trainer type (Item, Supporter, Stadium) from "Trainer | Type" text type_match = soup.find( string=re.compile(r"Trainer\s*\|\s*(Supporter|Item|Stadium)", re.IGNORECASE) ) if type_match: match = re.search(r"(Supporter|Item|Stadium)", str(type_match), re.IGNORECASE) if match: card.stage = match.group(1).lower() # Get effect text - look for card-detail__content-body # Using extract_effect_text to preserve energy type references content_body = soup.find("div", class_="card-detail__content-body") if content_body: # Extract effect text with proper energy type handling effect_text = extract_effect_text(content_body) # Remove illustrator info at the end if effect_text and "Illustrated by" in effect_text: effect_text = effect_text.split("Illustrated by")[0].strip() if effect_text: card.flavor_text = effect_text else: # Fallback: look for any paragraph with effect-like text for elem in soup.find_all("p"): effect_text = extract_effect_text(elem) if ( effect_text and len(effect_text) > 20 and "Illustrated" not in effect_text and "Artwork" not in effect_text and "Pokemon Zone" not in effect_text and "unofficial" not in effect_text.lower() ): card.flavor_text = effect_text break # Get illustrator illustrator_match = soup.find(string=re.compile(r"Illustrated by", re.IGNORECASE)) if illustrator_match: card.illustrator = str(illustrator_match).replace("Illustrated by", "").strip() def _parse_energy_details(self, soup: BeautifulSoup, card: Card) -> None: """Parse Energy-specific details from the page. Args: soup: BeautifulSoup object of the card page. card: Card object to populate. """ # Get energy type from the page card.pokemon_type = self.parse_energy_type(soup) # Get illustrator illustrator_match = soup.find(string=re.compile(r"Illustrated by", re.IGNORECASE)) if illustrator_match: card.illustrator = str(illustrator_match).replace("Illustrated by", "").strip() def scrape_card(self, card_id: str) -> Card | None: """Scrape a single card by ID. Args: card_id: Card ID in format "{set}-{number}-{name}" (e.g., "a1-132-gardevoir"). Returns: Card object or None if scraping failed. """ # Parse card ID match = re.match(r"([a-z0-9]+)-(\d+)-(.+)", card_id) if not match: logger.error(f"Invalid card ID format: {card_id}") return None set_code = match.group(1) card_number = int(match.group(2)) card_name = match.group(3) url = f"{BASE_URL}/cards/{set_code}/{card_number}/{card_name}/" logger.info(f"Scraping card: {url}") soup = self.fetch_page(url) if not soup: return None card = self.parse_card_page(soup, url, set_code) # Download image if enabled if card and self.download_images and card.image_url: card.image_file = self.download_image(card) return card def scrape_set(self, set_code: str, limit: int | None = None) -> list[Card]: """Scrape all cards from a set. Args: set_code: Set code (e.g., "a1", "a1a"). limit: Maximum number of cards to scrape (for testing). Returns: List of Card objects. 
""" if set_code not in SETS: logger.error(f"Unknown set code: {set_code}") return [] card_urls = self.get_card_urls_for_set(set_code) if limit: card_urls = card_urls[:limit] cards: list[Card] = [] total = len(card_urls) for i, (card_number, card_name, url) in enumerate(card_urls, 1): logger.info(f"[{i}/{total}] Scraping: {card_name} (#{card_number})") soup = self.fetch_page(url) if soup: card = self.parse_card_page(soup, url, set_code) if card: # Download image if enabled if self.download_images and card.image_url: card.image_file = self.download_image(card) time.sleep(IMAGE_REQUEST_DELAY) cards.append(card) self.save_card(card) # Rate limiting if i < total: time.sleep(REQUEST_DELAY) return cards def save_card(self, card: Card) -> Path: """Save a card to a JSON file. Args: card: Card object to save. Returns: Path to the saved file. """ set_dir = self.data_dir / card.set_code set_dir.mkdir(parents=True, exist_ok=True) # Generate filename: {number:03d}-{name}.json url_name = card.id.split("-", 2)[2] # Get name part from ID filename = f"{card.card_number:03d}-{url_name}.json" filepath = set_dir / filename with open(filepath, "w", encoding="utf-8") as f: json.dump(card.to_dict(), f, indent=2, ensure_ascii=False) logger.debug(f"Saved: {filepath}") return filepath def generate_index(self) -> Path: """Generate the combined index file from existing card files. Returns: Path to the index file. """ logger.info("Generating index...") index: dict[str, Any] = { "generated_at": datetime.now(UTC).isoformat(), "schema_version": "1.0", "sets": {}, "cards": [], "total_cards": 0, } for set_code in SETS: set_dir = self.data_dir / set_code if not set_dir.exists(): continue card_files = sorted(set_dir.glob("*.json")) index["sets"][set_code] = { "name": SETS[set_code]["name"], "card_count": len(card_files), } for card_file in card_files: relative_path = f"{set_code}/{card_file.name}" with open(card_file, encoding="utf-8") as f: card_data = json.load(f) index["cards"].append( { "id": card_data["id"], "name": card_data["name"], "set_code": set_code, "card_number": card_data["card_number"], "file": relative_path, } ) index["total_cards"] = len(index["cards"]) # Sort cards by set and number index["cards"].sort(key=lambda x: (x["set_code"], x["card_number"])) index_path = self.data_dir / "_index.json" with open(index_path, "w", encoding="utf-8") as f: json.dump(index, f, indent=2, ensure_ascii=False) logger.info(f"Index generated: {index_path} ({index['total_cards']} cards)") return index_path def save_errors(self) -> Path | None: """Save error log if there were any errors. Returns: Path to the error log file, or None if no errors. """ if not self.errors: return None error_log = self.data_dir / "_errors.log" with open(error_log, "w", encoding="utf-8") as f: f.write(f"Scraping errors - {datetime.now(UTC).isoformat()}\n") f.write("=" * 60 + "\n\n") for error in self.errors: f.write(f"URL: {error['url']}\n") f.write(f"Error: {error['error']}\n\n") logger.warning(f"Errors logged to: {error_log}") return error_log def download_images_for_existing_cards(self, set_code: str | None = None) -> int: """Download images for cards that already have JSON files. This is useful for downloading images separately from scraping, or for retrying failed image downloads. Args: set_code: Optional set code to limit downloads to a specific set. Returns: Number of images downloaded. 
""" sets_to_process = [set_code] if set_code else list(SETS.keys()) total_downloaded = 0 for sc in sets_to_process: set_dir = self.data_dir / sc if not set_dir.exists(): logger.warning(f"No card data found for set {sc}") continue card_files = sorted(set_dir.glob("*.json")) logger.info(f"Processing {len(card_files)} cards from set {sc}") for i, card_file in enumerate(card_files, 1): with open(card_file, encoding="utf-8") as f: card_data = json.load(f) image_url = card_data.get("image_url") existing_file = card_data.get("image_file") if not image_url: continue # Create a minimal Card object for download card = Card( id=card_data["id"], name=card_data["name"], set_code=card_data["set_code"], set_name=card_data["set_name"], card_number=card_data["card_number"], rarity=card_data["rarity"], card_type=card_data["card_type"], image_url=image_url, ) # Check if image already exists if existing_file: image_path = self.images_dir / existing_file if image_path.exists(): self.image_stats["skipped"] += 1 continue logger.info(f"[{i}/{len(card_files)}] Downloading: {card.name}") image_file = self.download_image(card) if image_file: # Update the JSON file with the image path card_data["image_file"] = image_file with open(card_file, "w", encoding="utf-8") as f: json.dump(card_data, f, indent=2, ensure_ascii=False) total_downloaded += 1 time.sleep(IMAGE_REQUEST_DELAY) return total_downloaded def log_image_stats(self) -> None: """Log image download statistics.""" stats = self.image_stats total = stats["downloaded"] + stats["skipped"] + stats["failed"] if total > 0: logger.info( f"Images: {stats['downloaded']} downloaded, " f"{stats['skipped']} skipped, {stats['failed']} failed" ) # ============================================================================= # CLI # ============================================================================= def main() -> int: """Main entry point for the scraper CLI.""" parser = argparse.ArgumentParser( description="Scrape Pokemon TCG Pocket card data from pokemon-zone.com", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Examples: # Scrape entire set uv run python scripts/scrape_pokemon_pocket.py --set a1 # Scrape with limit (for testing) uv run python scripts/scrape_pokemon_pocket.py --set a1 --limit 5 # Scrape set with images uv run python scripts/scrape_pokemon_pocket.py --set a1 --images # Download images for existing card data uv run python scripts/scrape_pokemon_pocket.py --download-images uv run python scripts/scrape_pokemon_pocket.py --download-images --set a1 # Scrape single card by ID uv run python scripts/scrape_pokemon_pocket.py --card a1-132-gardevoir # Regenerate index from existing card files uv run python scripts/scrape_pokemon_pocket.py --reindex """, ) # Main action group - what operation to perform action_group = parser.add_mutually_exclusive_group(required=True) action_group.add_argument( "--set", choices=list(SETS.keys()), help="Scrape all cards from a set", ) action_group.add_argument( "--card", type=str, help="Scrape a single card by ID (e.g., a1-132-gardevoir)", ) action_group.add_argument( "--reindex", action="store_true", help="Regenerate index from existing card files", ) action_group.add_argument( "--download-images", choices=list(SETS.keys()) + ["all"], nargs="?", const="all", help="Download images for existing card data (specify set or 'all')", ) parser.add_argument( "--limit", type=int, help="Maximum number of cards to scrape (for testing)", ) parser.add_argument( "--images", action="store_true", help="Download card images 
while scraping", ) args = parser.parse_args() scraper = PokemonPocketScraper(download_images=args.images) if args.reindex: scraper.generate_index() return 0 if args.download_images: # Download images for existing cards set_code = None if args.download_images == "all" else args.download_images set_info = f"set {set_code}" if set_code else "all sets" logger.info(f"Downloading images for existing card data ({set_info})...") scraper.download_images = True # Enable downloads downloaded = scraper.download_images_for_existing_cards(set_code) scraper.log_image_stats() logger.info(f"Image download complete: {downloaded} new images") return 0 if args.card: card = scraper.scrape_card(args.card) if card: scraper.save_card(card) scraper.generate_index() scraper.log_image_stats() logger.info(f"Successfully scraped: {card.name}") return 0 else: logger.error(f"Failed to scrape card: {args.card}") return 1 if args.set: cards = scraper.scrape_set(args.set, limit=args.limit) scraper.generate_index() scraper.save_errors() scraper.log_image_stats() success_count = len(cards) error_count = len(scraper.errors) total = success_count + error_count logger.info(f"Scraping complete: {success_count}/{total} cards succeeded") if error_count > 0: logger.warning(f"{error_count} errors occurred (see _errors.log)") return 1 return 0 return 1 if __name__ == "__main__": sys.exit(main())