#!/usr/bin/env python
"""Scrape Pokemon TCG Pocket card data from pokemon-zone.com.
This script fetches card data for the Genetic Apex (A1) and Mythical Island (A1a)
sets and saves each card as an individual JSON file for use in the Mantimon TCG
game engine.
Usage:
# Scrape entire set
uv run python scripts/scrape_pokemon_pocket.py --set a1
# Scrape with limit (for testing)
uv run python scripts/scrape_pokemon_pocket.py --set a1 --limit 5
# Scrape single card by ID
uv run python scripts/scrape_pokemon_pocket.py --card a1-132-gardevoir
# Regenerate index from existing card files
uv run python scripts/scrape_pokemon_pocket.py --reindex
Output:
- Individual card files: data/raw/{set}/{number}-{name}.json
- Combined index: data/raw/_index.json
- Error log: data/raw/_errors.log
"""
import argparse
import json
import logging
import re
import sys
import time
from dataclasses import dataclass, field
from datetime import UTC, datetime
from pathlib import Path
from typing import Any
import requests
from bs4 import BeautifulSoup, Tag
# =============================================================================
# Configuration
# =============================================================================
BASE_URL = "https://www.pokemon-zone.com"
DATA_DIR = Path(__file__).parent.parent / "data" / "raw"
IMAGES_DIR = Path(__file__).parent.parent / "data" / "images"
REQUEST_DELAY = 1.5 # seconds between requests
IMAGE_REQUEST_DELAY = 0.5 # faster for images (different server)
MAX_RETRIES = 3
RETRY_DELAY = 5 # seconds
# Set info for validation and metadata
SETS = {
"a1": {"name": "Genetic Apex", "expected_cards": 286},
"a1a": {"name": "Mythical Island", "expected_cards": 86},
}
# Energy type mapping from CSS classes (for attack cost icons)
ENERGY_TYPES = {
"energy-icon--type-grass": "grass",
"energy-icon--type-fire": "fire",
"energy-icon--type-water": "water",
"energy-icon--type-lightning": "lightning",
"energy-icon--type-psychic": "psychic",
"energy-icon--type-fighting": "fighting",
"energy-icon--type-darkness": "darkness",
"energy-icon--type-metal": "metal",
"energy-icon--type-colorless": "colorless",
"energy-icon--type-dragon": "dragon",
}
# Energy text mapping from CSS classes (for inline text references)
# These render as empty <span class="energy-text energy-text--type-X"> elements
# in effect text; stripping them naively yields artifacts like "Discard a Energy"
ENERGY_TEXT_TYPES = {
"energy-text--type-grass": "Grass",
"energy-text--type-fire": "Fire",
"energy-text--type-water": "Water",
"energy-text--type-lightning": "Lightning",
"energy-text--type-psychic": "Psychic",
"energy-text--type-fighting": "Fighting",
"energy-text--type-darkness": "Darkness",
"energy-text--type-metal": "Metal",
"energy-text--type-colorless": "Colorless",
"energy-text--type-dragon": "Dragon",
}
# Rarity code mapping from CSS classes (rarity-icon--rarity-X)
RARITY_CODES = {
"C": "Common",
"U": "Uncommon",
"R": "Rare",
"RR": "Double Rare",
"AR": "Art Rare",
"SAR": "Special Art Rare",
"UR": "Ultra Rare",
"IM": "Immersive",
"S": "Shiny",
"CR": "Crown Rare",
}
# Fossil cards that are Trainer/Item cards, not Pokemon
# These are often misclassified because they have Pokemon-like layouts on the source site
FOSSIL_CARDS = {
"Helix Fossil",
"Dome Fossil",
"Old Amber",
}
# Text artifact patterns to fix (caused by stripped energy icons)
# Format: (pattern, replacement)
# Energy icons render as empty strings, merging adjacent text
TEXT_ARTIFACT_FIXES = [
# Energy-related artifacts
(r"\baEnergy\b", "an Energy"),
(r"\bofEnergy\b", "of Energy"),
(r"\bextraEnergy\b", "extra Energy"),
(r"\battachedEnergy\b", "attached Energy"),
(r"\banyEnergy\b", "any Energy"),
(r"(\d+)Energy\b", r"\1 Energy"), # "2Energy" -> "2 Energy"
(r"(\d+)-HP\b", r"\1 HP"), # "40-HP" -> "40 HP"
# Pokemon-related artifacts
(r"\bBasicPokémon\b", "Basic Pokémon"),
(r"\bBenchedPokémon\b", "Benched Pokémon"),
(r"\bthePokémon\b", "the Pokémon"),
(r"\bthisPokémon\b", "this Pokémon"),
(r"\byourPokémon\b", "your Pokémon"),
(r"\bActivePokémon\b", "Active Pokémon"),
(r"\bDefendingPokémon\b", "Defending Pokémon"),
(r"\bopponent'sPokémon\b", "opponent's Pokémon"),
(r"\bOpponent'sPokémon\b", "Opponent's Pokémon"),
# Other common artifacts
(r"\bthatPokémon\b", "that Pokémon"),
(r"\beachPokémon\b", "each Pokémon"),
(r"\baPokémon\b", "a Pokémon"),
]
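# Illustrative application via clean_effect_text (defined below); the input is a
# made-up example of merged scraped text:
#   clean_effect_text("Discard 2Energy from thisPokémon.")
#   -> "Discard 2 Energy from this Pokémon."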
def replace_energy_text_spans(element: Tag) -> None:
"""Replace energy-text spans with their text representation in-place.
The source website uses
to render inline energy type icons in effect text. When BeautifulSoup extracts
text with get_text(), these empty spans disappear, merging adjacent words.
This function finds all such spans and replaces them with their text equivalent
(e.g., "Fire") BEFORE text extraction, preserving the energy type information.
Args:
element: BeautifulSoup element to process in-place. The element is modified
directly - energy-text spans are replaced with NavigableString text.
Example:
>>> soup = BeautifulSoup('
Discard a Energy
')
>>> div = soup.find('div')
>>> replace_energy_text_spans(div)
>>> div.get_text()
'Discard a Fire Energy'
"""
from bs4 import NavigableString
# Find all energy-text spans (they may have multiple classes)
for span in element.find_all("span", class_=re.compile(r"energy-text--type-")):
energy_type = None
for cls in span.get("class", []):
if cls in ENERGY_TEXT_TYPES:
energy_type = ENERGY_TEXT_TYPES[cls]
break
if energy_type:
# Replace the span with a text node
span.replace_with(NavigableString(energy_type))
def extract_effect_text(element: Tag) -> str | None:
"""Extract effect text from an element, properly handling energy-text spans.
This is the correct way to extract effect text that may contain energy type
icons. It replaces the energy-text spans with readable text before extraction.
Args:
element: BeautifulSoup element containing effect text.
Returns:
Cleaned effect text, or None if empty.
"""
    # Work on an independent copy so the original soup is not modified;
    # re-parsing the element's HTML yields a standalone tree we can mutate
    from bs4 import BeautifulSoup as BS
    element_copy = BS(str(element), "html.parser")
replace_energy_text_spans(element_copy)
# Use separator=" " to properly join text nodes with spaces
# strip=True removes leading/trailing whitespace from each text node
raw_text = element_copy.get_text(separator=" ", strip=True)
# Normalize multiple spaces to single spaces
raw_text = re.sub(r"\s+", " ", raw_text).strip()
# Still apply artifact fixes as fallback for any edge cases
return clean_effect_text(raw_text)
def clean_effect_text(text: str | None) -> str | None:
"""Clean scraped effect text by fixing common artifacts.
This is a fallback for any text artifacts not caught by energy-text span
replacement. It handles edge cases like merged words.
Args:
text: The raw effect text from scraping.
Returns:
Cleaned text with artifacts fixed, or None if input was None/empty.
"""
if not text:
return None
result = text
for pattern, replacement in TEXT_ARTIFACT_FIXES:
result = re.sub(pattern, replacement, result)
return result if result else None
# =============================================================================
# Logging Setup
# =============================================================================
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(levelname)s - %(message)s",
datefmt="%H:%M:%S",
)
logger = logging.getLogger(__name__)
# =============================================================================
# Data Classes
# =============================================================================
@dataclass
class Attack:
"""A Pokemon's attack."""
name: str
cost: list[str]
damage: int | None
damage_modifier: str | None # "+", "x", or None
effect_text: str | None
effect_id: str | None = None # To be mapped later
def to_dict(self) -> dict[str, Any]:
"""Convert to dictionary for JSON serialization."""
return {
"name": self.name,
"cost": self.cost,
"damage": self.damage,
"damage_modifier": self.damage_modifier,
"effect_text": self.effect_text,
"effect_id": self.effect_id,
}
@dataclass
class Ability:
"""A Pokemon's ability."""
name: str
effect_text: str
effect_id: str | None = None # To be mapped later
def to_dict(self) -> dict[str, Any]:
"""Convert to dictionary for JSON serialization."""
return {
"name": self.name,
"effect_text": self.effect_text,
"effect_id": self.effect_id,
}
@dataclass
class Card:
"""Complete card data."""
id: str
name: str
set_code: str
set_name: str
card_number: int
rarity: str
card_type: str # "pokemon", "trainer", "energy"
image_url: str | None = None # URL to card image for offline caching
image_file: str | None = None # Local path to downloaded image (relative to images dir)
hp: int | None = None
pokemon_type: str | None = None
stage: str | None = None # "basic", "stage_1", "stage_2"
evolves_from: str | None = None
is_ex: bool = False
abilities: list[Ability] = field(default_factory=list)
attacks: list[Attack] = field(default_factory=list)
weakness_type: str | None = None
weakness_value: int | None = None
resistance_type: str | None = None
resistance_value: int | None = None
retreat_cost: int = 0
flavor_text: str | None = None
illustrator: str | None = None
source_url: str = ""
def to_dict(self) -> dict[str, Any]:
"""Convert to dictionary for JSON serialization."""
data: dict[str, Any] = {
"id": self.id,
"name": self.name,
"set_code": self.set_code,
"set_name": self.set_name,
"card_number": self.card_number,
"rarity": self.rarity,
"card_type": self.card_type,
"image_url": self.image_url,
"image_file": self.image_file,
"source_url": self.source_url,
}
if self.card_type == "pokemon":
data.update(
{
"hp": self.hp,
"pokemon_type": self.pokemon_type,
"stage": self.stage,
"evolves_from": self.evolves_from,
"is_ex": self.is_ex,
"abilities": [a.to_dict() for a in self.abilities],
"attacks": [a.to_dict() for a in self.attacks],
"weakness": (
{"type": self.weakness_type, "value": self.weakness_value}
if self.weakness_type
else None
),
"resistance": (
{"type": self.resistance_type, "value": self.resistance_value}
if self.resistance_type
else None
),
"retreat_cost": self.retreat_cost,
"flavor_text": self.flavor_text,
"illustrator": self.illustrator,
}
)
elif self.card_type == "trainer":
data.update(
{
"trainer_type": self.stage, # Reusing stage field for trainer type
"effect_text": self.flavor_text, # Trainer effect
"illustrator": self.illustrator,
}
)
elif self.card_type == "energy":
data.update(
{
"energy_type": self.pokemon_type,
"illustrator": self.illustrator,
}
)
return data
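    # Illustrative to_dict() output for a Pokemon card (values are made up; the
    # keys mirror the serialization above):
    #   {"id": "a1-001-bulbasaur", "name": "Bulbasaur", "set_code": "a1",
    #    "set_name": "Genetic Apex", "card_number": 1, "rarity": "Common",
    #    "card_type": "pokemon", "hp": 70, "pokemon_type": "grass",
    #    "stage": "basic", "is_ex": False, "abilities": [], "attacks": [...],
    #    "weakness": {"type": "fire", "value": 20}, "resistance": None,
    #    "retreat_cost": 1, ...}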
# =============================================================================
# Scraper Class
# =============================================================================
class PokemonPocketScraper:
"""Scraper for Pokemon TCG Pocket card data from pokemon-zone.com."""
def __init__(
self,
data_dir: Path = DATA_DIR,
images_dir: Path = IMAGES_DIR,
download_images: bool = False,
):
"""Initialize the scraper.
Args:
data_dir: Directory to save card data files.
images_dir: Directory to save downloaded card images.
download_images: Whether to download card images.
"""
self.data_dir = data_dir
self.images_dir = images_dir
self.download_images = download_images
self.session = requests.Session()
self.session.headers.update(
{
"User-Agent": "MantimonTCG-CardScraper/1.0 (https://github.com/mantimon-tcg)",
"Accept": "text/html,application/xhtml+xml",
}
)
self.errors: list[dict[str, Any]] = []
self.image_stats = {"downloaded": 0, "skipped": 0, "failed": 0}
def fetch_page(self, url: str) -> BeautifulSoup | None:
"""Fetch a page with retry logic.
Args:
url: URL to fetch.
Returns:
BeautifulSoup object or None if all retries failed.
"""
for attempt in range(MAX_RETRIES):
try:
response = self.session.get(url, timeout=30)
response.raise_for_status()
return BeautifulSoup(response.text, "html.parser")
except requests.RequestException as e:
logger.warning(f"Attempt {attempt + 1}/{MAX_RETRIES} failed for {url}: {e}")
if attempt < MAX_RETRIES - 1:
time.sleep(RETRY_DELAY)
self.errors.append({"url": url, "error": "Max retries exceeded"})
return None
def download_image(self, card: Card) -> str | None:
"""Download a card image and save it locally.
Args:
card: Card object with image_url set.
Returns:
Relative path to the saved image, or None if download failed.
"""
if not card.image_url:
return None
# Create directory structure: images/{set_code}/
set_dir = self.images_dir / card.set_code
set_dir.mkdir(parents=True, exist_ok=True)
# Determine file extension from URL
url_path = card.image_url.split("?")[0] # Remove query params
ext = Path(url_path).suffix or ".webp"
# Generate filename: {number:03d}-{name}{ext}
url_name = card.id.split("-", 2)[2] # Get name part from ID
filename = f"{card.card_number:03d}-{url_name}{ext}"
filepath = set_dir / filename
relative_path = f"{card.set_code}/{filename}"
# Skip if already downloaded
if filepath.exists():
logger.debug(f"Image exists, skipping: {relative_path}")
self.image_stats["skipped"] += 1
return relative_path
# Download the image with appropriate headers
image_headers = {
"Accept": "image/webp,image/apng,image/*,*/*;q=0.8",
"Referer": "https://www.pokemon-zone.com/",
}
for attempt in range(MAX_RETRIES):
try:
response = self.session.get(card.image_url, timeout=30, headers=image_headers)
response.raise_for_status()
# Verify it's an image
content_type = response.headers.get("content-type", "")
if not content_type.startswith("image/"):
logger.warning(f"Not an image: {content_type} for {card.image_url}")
self.image_stats["failed"] += 1
return None
# Save the image
with open(filepath, "wb") as f:
f.write(response.content)
logger.debug(f"Downloaded: {relative_path} ({len(response.content)} bytes)")
self.image_stats["downloaded"] += 1
return relative_path
except requests.RequestException as e:
logger.warning(f"Image download attempt {attempt + 1}/{MAX_RETRIES} failed: {e}")
if attempt < MAX_RETRIES - 1:
time.sleep(RETRY_DELAY)
logger.error(f"Failed to download image for {card.id}")
self.image_stats["failed"] += 1
return None
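    # Example on-disk layout produced by download_image (path illustrative):
    #   data/images/a1/132-gardevoir.webp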
def get_card_urls_for_set(self, set_code: str) -> list[tuple[int, str, str]]:
"""Get all card URLs for a set.
Args:
set_code: Set code (e.g., "a1", "a1a").
Returns:
List of (card_number, card_name, url) tuples.
"""
set_url = f"{BASE_URL}/sets/{set_code}/"
logger.info(f"Fetching set page: {set_url}")
soup = self.fetch_page(set_url)
if not soup:
logger.error(f"Failed to fetch set page for {set_code}")
return []
cards: list[tuple[int, str, str]] = []
# Find all card links - they follow pattern /cards/{set}/{number}/{name}/
pattern = re.compile(rf"^/cards/{set_code}/(\d+)/([^/]+)/$")
for link in soup.find_all("a", href=pattern):
href = link.get("href", "")
match = pattern.match(href)
if match:
card_number = int(match.group(1))
card_name = match.group(2)
# Avoid duplicates (page may have multiple links to same card)
card_tuple = (card_number, card_name, f"{BASE_URL}{href}")
if card_tuple not in cards:
cards.append(card_tuple)
# Sort by card number
cards.sort(key=lambda x: x[0])
logger.info(f"Found {len(cards)} cards in set {set_code}")
return cards
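    # Illustrative return value (names and numbers are examples):
    #   [(1, "bulbasaur", "https://www.pokemon-zone.com/cards/a1/1/bulbasaur/"),
    #    (2, "ivysaur", "https://www.pokemon-zone.com/cards/a1/2/ivysaur/"), ...]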
def parse_energy_type(self, element: Tag | None) -> str | None:
"""Extract energy type from an element containing an energy icon.
Args:
element: BeautifulSoup element that may contain energy icons.
Returns:
Energy type string or None.
"""
if not element:
return None
for icon in element.find_all("span", class_=re.compile(r"energy-icon--type-")):
for cls in icon.get("class", []):
if cls in ENERGY_TYPES:
return ENERGY_TYPES[cls]
return None
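    # Illustrative: an element containing
    #   <span class="energy-icon energy-icon--type-water"></span>
    # yields "water"; an element with no energy icon yields None.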
def parse_attack(self, attack_row: Tag) -> Attack | None:
"""Parse an attack from an attack-summary-row element.
Args:
attack_row: BeautifulSoup element for the attack row.
Returns:
Attack object or None if parsing failed.
"""
try:
# Get attack name
name_elem = attack_row.find(class_="attack-summary-row__name")
if not name_elem:
return None
name = name_elem.get_text(strip=True)
# Get energy cost
cost: list[str] = []
costs_elem = attack_row.find(class_="attack-summary-row__costs")
if costs_elem:
for cost_icon in costs_elem.find_all("span", class_=re.compile(r"energy-icon")):
for cls in cost_icon.get("class", []):
if cls in ENERGY_TYPES:
cost.append(ENERGY_TYPES[cls])
# Get damage
damage: int | None = None
damage_modifier: str | None = None
damage_elem = attack_row.find(class_="attack-summary-row__damage")
if damage_elem:
damage_text = damage_elem.get_text(strip=True)
# Parse damage like "60", "50+", "100x"
match = re.match(r"(\d+)([+x])?", damage_text)
if match:
damage = int(match.group(1))
damage_modifier = match.group(2)
# Get effect text (using extract_effect_text to preserve energy types)
effect_text: str | None = None
footer_elem = attack_row.find(class_="attack-summary-row__footer")
if footer_elem:
effect_text = extract_effect_text(footer_elem)
return Attack(
name=name,
cost=cost,
damage=damage,
damage_modifier=damage_modifier,
effect_text=effect_text,
)
except Exception as e:
logger.warning(f"Failed to parse attack: {e}")
return None
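    # Illustrative input/output for parse_attack (markup shape assumed from the
    # selectors above, not copied from the live site):
    #   <div class="attack-summary-row">
    #     <div class="attack-summary-row__costs">
    #       <span class="energy-icon energy-icon--type-fire"></span>
    #     </div>
    #     <div class="attack-summary-row__name">Ember</div>
    #     <div class="attack-summary-row__damage">30</div>
    #   </div>
    #   -> Attack(name="Ember", cost=["fire"], damage=30,
    #             damage_modifier=None, effect_text=None)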
def parse_ability(self, ability_row: Tag) -> Ability | None:
"""Parse an ability from an ability-summary-row element.
Args:
ability_row: BeautifulSoup element for the ability row.
Returns:
Ability object or None if parsing failed.
"""
try:
# Get ability name (text after "Ability" badge)
name_elem = ability_row.find(class_="ability-summary-row__name")
if not name_elem:
return None
# Remove the "Ability" badge text to get just the name
name_text = name_elem.get_text(strip=True)
name = re.sub(r"^Ability\s*", "", name_text)
# Get effect text (using extract_effect_text to preserve energy types)
desc_elem = ability_row.find(class_="ability-summary-row__description")
            effect_text = (extract_effect_text(desc_elem) or "") if desc_elem else ""
return Ability(name=name, effect_text=effect_text)
except Exception as e:
logger.warning(f"Failed to parse ability: {e}")
return None
def parse_card_page(self, soup: BeautifulSoup, url: str, set_code: str) -> Card | None:
"""Parse a card page into a Card object.
Args:
soup: BeautifulSoup object of the card page.
url: URL of the card page (for error logging).
set_code: Set code for this card.
Returns:
Card object or None if parsing failed.
"""
try:
# Extract card number and name from URL
match = re.search(rf"/cards/{set_code}/(\d+)/([^/]+)/", url)
if not match:
logger.error(f"Could not parse card URL: {url}")
return None
card_number = int(match.group(1))
url_name = match.group(2)
# Get card name from page
name_elem = soup.find("h1")
if not name_elem:
logger.error(f"Could not find card name on page: {url}")
return None
name = name_elem.get_text(strip=True)
# Determine card type - look for specific card type indicators
# Trainers have "Trainer | Supporter/Item/Stadium" text
# Energy cards have specific energy type text
# Pokemon cards have "Pokémon | Basic/Stage 1/Stage 2" text
card_type = "pokemon" # Default
trainer_type: str | None = None
# Check for Fossil cards first (they appear as Pokemon on the site but are Items)
if name in FOSSIL_CARDS:
card_type = "trainer"
trainer_type = "item"
logger.info(f"Detected fossil card as Trainer/Item: {name}")
# Check for Trainer (more specific match)
elif soup.find(
string=re.compile(r"Trainer\s*\|\s*(Supporter|Item|Stadium)", re.IGNORECASE)
):
card_type = "trainer"
# Check for Energy
elif soup.find(string=re.compile(r"Energy\s*$", re.IGNORECASE)):
card_type = "energy"
# Create card ID
card_id = f"{set_code}-{card_number:03d}-{url_name}"
# Get rarity from CSS class (rarity-icon--rarity-X)
rarity = "Unknown"
rarity_icon = soup.find("span", class_="rarity-icon")
if rarity_icon:
for cls in rarity_icon.get("class", []):
if "rarity-icon--rarity-" in cls:
rarity_code = cls.replace("rarity-icon--rarity-", "")
rarity = RARITY_CODES.get(rarity_code, rarity_code)
break
# Get card image URL (first image in card-detail__card section)
image_url: str | None = None
card_section = soup.find("div", class_="card-detail__card")
if card_section:
img = card_section.find("img")
if img:
image_url = img.get("src")
# Remove query params to get full resolution
if image_url and "?" in image_url:
image_url = image_url.split("?")[0]
# Initialize card
# is_ex: Check if name ends with " ex" (case insensitive)
# This avoids false positives like "Exeggutor"
is_ex = name.lower().endswith(" ex")
card = Card(
id=card_id,
name=name,
set_code=set_code,
set_name=SETS.get(set_code, {}).get("name", set_code),
card_number=card_number,
rarity=rarity,
card_type=card_type,
image_url=image_url,
source_url=url,
is_ex=is_ex,
)
if card_type == "pokemon":
self._parse_pokemon_details(soup, card)
elif card_type == "trainer":
# For fossil cards, we already know the trainer_type
if trainer_type:
card.stage = trainer_type
self._parse_trainer_details(soup, card)
elif card_type == "energy":
self._parse_energy_details(soup, card)
return card
except Exception as e:
logger.error(f"Failed to parse card page {url}: {e}")
self.errors.append({"url": url, "error": str(e)})
return None
def _parse_pokemon_details(self, soup: BeautifulSoup, card: Card) -> None:
"""Parse Pokemon-specific details from the page.
Args:
soup: BeautifulSoup object of the card page.
card: Card object to populate.
"""
# Get HP
hp_match = soup.find(string=re.compile(r"HP\s*(\d+)", re.IGNORECASE))
if hp_match:
hp_num = re.search(r"(\d+)", str(hp_match))
if hp_num:
card.hp = int(hp_num.group(1))
else:
# Try finding HP in the stat display
hp_elem = soup.find("span", string="HP")
if hp_elem:
hp_value = hp_elem.find_next("span")
if hp_value:
hp_text = hp_value.get_text(strip=True)
hp_num = re.search(r"(\d+)", hp_text)
if hp_num:
card.hp = int(hp_num.group(1))
# Get Pokemon type from first energy icon NOT in an attack row
# The card's type icon is in the header area, not in attack-summary-row__cost
for icon in soup.find_all("span", class_=re.compile(r"energy-icon--type-")):
parent = icon.parent
parent_classes = parent.get("class", []) if parent else []
            # Skip if this is an attack cost icon (its parent class may be the
            # singular "__cost" wrapper or the plural "__costs" container)
            if not any(cls.startswith("attack-summary-row__cost") for cls in parent_classes):
for cls in icon.get("class", []):
if cls in ENERGY_TYPES:
card.pokemon_type = ENERGY_TYPES[cls]
break
if card.pokemon_type:
break
# Get stage and evolution info
stage_text = soup.find(string=re.compile(r"Basic|Stage 1|Stage 2", re.IGNORECASE))
if stage_text:
stage_lower = str(stage_text).lower()
if "stage 2" in stage_lower:
card.stage = "stage_2"
elif "stage 1" in stage_lower:
card.stage = "stage_1"
elif "basic" in stage_lower:
card.stage = "basic"
# Get evolves_from
evolves_match = soup.find(string=re.compile(r"Evolves from", re.IGNORECASE))
if evolves_match:
# Try to find the Pokemon name link nearby
parent = evolves_match.parent if hasattr(evolves_match, "parent") else None
if parent:
link = parent.find("a")
if link:
card.evolves_from = link.get_text(strip=True)
# Get abilities
for ability_row in soup.find_all(class_="ability-summary-row"):
ability = self.parse_ability(ability_row)
if ability:
card.abilities.append(ability)
# Get attacks
for attack_row in soup.find_all(class_="attack-summary-row"):
attack = self.parse_attack(attack_row)
if attack:
card.attacks.append(attack)
# Get weakness
weakness_section = soup.find(string=re.compile(r"Weakness", re.IGNORECASE))
if weakness_section:
parent = weakness_section.parent
if parent:
card.weakness_type = self.parse_energy_type(parent.parent)
# Look for +20 pattern
value_match = re.search(
r"\+(\d+)", parent.parent.get_text() if parent.parent else ""
)
if value_match:
card.weakness_value = int(value_match.group(1))
# Get retreat cost (count colorless energy icons in retreat section)
retreat_section = soup.find(string=re.compile(r"Retreat", re.IGNORECASE))
if retreat_section:
parent = retreat_section.parent
if parent and parent.parent:
retreat_icons = parent.parent.find_all(
"span", class_=re.compile(r"energy-icon--type-colorless")
)
card.retreat_cost = len(retreat_icons)
# Get illustrator
illustrator_match = soup.find(string=re.compile(r"Illustrated by", re.IGNORECASE))
if illustrator_match:
card.illustrator = str(illustrator_match).replace("Illustrated by", "").strip()
# Get flavor text (Pokemon description)
# This is usually in a paragraph after the attacks section
for p in soup.find_all("p"):
text = p.get_text(strip=True)
if text and len(text) > 50 and "Illustrated" not in text and "Artwork" not in text:
card.flavor_text = text
break
def _parse_trainer_details(self, soup: BeautifulSoup, card: Card) -> None:
"""Parse Trainer-specific details from the page.
Args:
soup: BeautifulSoup object of the card page.
card: Card object to populate.
"""
# Get trainer type (Item, Supporter, Stadium) from "Trainer | Type" text
type_match = soup.find(
string=re.compile(r"Trainer\s*\|\s*(Supporter|Item|Stadium)", re.IGNORECASE)
)
if type_match:
match = re.search(r"(Supporter|Item|Stadium)", str(type_match), re.IGNORECASE)
if match:
card.stage = match.group(1).lower()
# Get effect text - look for card-detail__content-body
# Using extract_effect_text to preserve energy type references
content_body = soup.find("div", class_="card-detail__content-body")
if content_body:
# Extract effect text with proper energy type handling
effect_text = extract_effect_text(content_body)
# Remove illustrator info at the end
if effect_text and "Illustrated by" in effect_text:
effect_text = effect_text.split("Illustrated by")[0].strip()
if effect_text:
card.flavor_text = effect_text
else:
# Fallback: look for any paragraph with effect-like text
for elem in soup.find_all("p"):
effect_text = extract_effect_text(elem)
if (
effect_text
and len(effect_text) > 20
and "Illustrated" not in effect_text
and "Artwork" not in effect_text
and "Pokemon Zone" not in effect_text
and "unofficial" not in effect_text.lower()
):
card.flavor_text = effect_text
break
# Get illustrator
illustrator_match = soup.find(string=re.compile(r"Illustrated by", re.IGNORECASE))
if illustrator_match:
card.illustrator = str(illustrator_match).replace("Illustrated by", "").strip()
def _parse_energy_details(self, soup: BeautifulSoup, card: Card) -> None:
"""Parse Energy-specific details from the page.
Args:
soup: BeautifulSoup object of the card page.
card: Card object to populate.
"""
# Get energy type from the page
card.pokemon_type = self.parse_energy_type(soup)
# Get illustrator
illustrator_match = soup.find(string=re.compile(r"Illustrated by", re.IGNORECASE))
if illustrator_match:
card.illustrator = str(illustrator_match).replace("Illustrated by", "").strip()
def scrape_card(self, card_id: str) -> Card | None:
"""Scrape a single card by ID.
Args:
card_id: Card ID in format "{set}-{number}-{name}" (e.g., "a1-132-gardevoir").
Returns:
Card object or None if scraping failed.
"""
# Parse card ID
match = re.match(r"([a-z0-9]+)-(\d+)-(.+)", card_id)
if not match:
logger.error(f"Invalid card ID format: {card_id}")
return None
set_code = match.group(1)
card_number = int(match.group(2))
card_name = match.group(3)
url = f"{BASE_URL}/cards/{set_code}/{card_number}/{card_name}/"
logger.info(f"Scraping card: {url}")
soup = self.fetch_page(url)
if not soup:
return None
card = self.parse_card_page(soup, url, set_code)
# Download image if enabled
if card and self.download_images and card.image_url:
card.image_file = self.download_image(card)
return card
def scrape_set(self, set_code: str, limit: int | None = None) -> list[Card]:
"""Scrape all cards from a set.
Args:
set_code: Set code (e.g., "a1", "a1a").
limit: Maximum number of cards to scrape (for testing).
Returns:
List of Card objects.
"""
if set_code not in SETS:
logger.error(f"Unknown set code: {set_code}")
return []
card_urls = self.get_card_urls_for_set(set_code)
if limit:
card_urls = card_urls[:limit]
cards: list[Card] = []
total = len(card_urls)
for i, (card_number, card_name, url) in enumerate(card_urls, 1):
logger.info(f"[{i}/{total}] Scraping: {card_name} (#{card_number})")
soup = self.fetch_page(url)
if soup:
card = self.parse_card_page(soup, url, set_code)
if card:
# Download image if enabled
if self.download_images and card.image_url:
card.image_file = self.download_image(card)
time.sleep(IMAGE_REQUEST_DELAY)
cards.append(card)
self.save_card(card)
# Rate limiting
if i < total:
time.sleep(REQUEST_DELAY)
return cards
def save_card(self, card: Card) -> Path:
"""Save a card to a JSON file.
Args:
card: Card object to save.
Returns:
Path to the saved file.
"""
set_dir = self.data_dir / card.set_code
set_dir.mkdir(parents=True, exist_ok=True)
# Generate filename: {number:03d}-{name}.json
url_name = card.id.split("-", 2)[2] # Get name part from ID
filename = f"{card.card_number:03d}-{url_name}.json"
filepath = set_dir / filename
with open(filepath, "w", encoding="utf-8") as f:
json.dump(card.to_dict(), f, indent=2, ensure_ascii=False)
logger.debug(f"Saved: {filepath}")
return filepath
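    # Example output path produced by save_card (illustrative):
    #   data/raw/a1/132-gardevoir.json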
def generate_index(self) -> Path:
"""Generate the combined index file from existing card files.
Returns:
Path to the index file.
"""
logger.info("Generating index...")
index: dict[str, Any] = {
"generated_at": datetime.now(UTC).isoformat(),
"schema_version": "1.0",
"sets": {},
"cards": [],
"total_cards": 0,
}
for set_code in SETS:
set_dir = self.data_dir / set_code
if not set_dir.exists():
continue
card_files = sorted(set_dir.glob("*.json"))
index["sets"][set_code] = {
"name": SETS[set_code]["name"],
"card_count": len(card_files),
}
for card_file in card_files:
relative_path = f"{set_code}/{card_file.name}"
with open(card_file, encoding="utf-8") as f:
card_data = json.load(f)
index["cards"].append(
{
"id": card_data["id"],
"name": card_data["name"],
"set_code": set_code,
"card_number": card_data["card_number"],
"file": relative_path,
}
)
index["total_cards"] = len(index["cards"])
# Sort cards by set and number
index["cards"].sort(key=lambda x: (x["set_code"], x["card_number"]))
index_path = self.data_dir / "_index.json"
with open(index_path, "w", encoding="utf-8") as f:
json.dump(index, f, indent=2, ensure_ascii=False)
logger.info(f"Index generated: {index_path} ({index['total_cards']} cards)")
return index_path
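    # Shape of the generated _index.json (single-entry sketch; values
    # illustrative):
    #   {"generated_at": "2024-01-01T00:00:00+00:00", "schema_version": "1.0",
    #    "sets": {"a1": {"name": "Genetic Apex", "card_count": 1}},
    #    "cards": [{"id": "a1-132-gardevoir", "name": "Gardevoir",
    #               "set_code": "a1", "card_number": 132,
    #               "file": "a1/132-gardevoir.json"}],
    #    "total_cards": 1}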
def save_errors(self) -> Path | None:
"""Save error log if there were any errors.
Returns:
Path to the error log file, or None if no errors.
"""
if not self.errors:
return None
error_log = self.data_dir / "_errors.log"
with open(error_log, "w", encoding="utf-8") as f:
f.write(f"Scraping errors - {datetime.now(UTC).isoformat()}\n")
f.write("=" * 60 + "\n\n")
for error in self.errors:
f.write(f"URL: {error['url']}\n")
f.write(f"Error: {error['error']}\n\n")
logger.warning(f"Errors logged to: {error_log}")
return error_log
def download_images_for_existing_cards(self, set_code: str | None = None) -> int:
"""Download images for cards that already have JSON files.
This is useful for downloading images separately from scraping,
or for retrying failed image downloads.
Args:
set_code: Optional set code to limit downloads to a specific set.
Returns:
Number of images downloaded.
"""
sets_to_process = [set_code] if set_code else list(SETS.keys())
total_downloaded = 0
for sc in sets_to_process:
set_dir = self.data_dir / sc
if not set_dir.exists():
logger.warning(f"No card data found for set {sc}")
continue
card_files = sorted(set_dir.glob("*.json"))
logger.info(f"Processing {len(card_files)} cards from set {sc}")
for i, card_file in enumerate(card_files, 1):
with open(card_file, encoding="utf-8") as f:
card_data = json.load(f)
image_url = card_data.get("image_url")
existing_file = card_data.get("image_file")
if not image_url:
continue
# Create a minimal Card object for download
card = Card(
id=card_data["id"],
name=card_data["name"],
set_code=card_data["set_code"],
set_name=card_data["set_name"],
card_number=card_data["card_number"],
rarity=card_data["rarity"],
card_type=card_data["card_type"],
image_url=image_url,
)
# Check if image already exists
if existing_file:
image_path = self.images_dir / existing_file
if image_path.exists():
self.image_stats["skipped"] += 1
continue
logger.info(f"[{i}/{len(card_files)}] Downloading: {card.name}")
image_file = self.download_image(card)
if image_file:
# Update the JSON file with the image path
card_data["image_file"] = image_file
with open(card_file, "w", encoding="utf-8") as f:
json.dump(card_data, f, indent=2, ensure_ascii=False)
total_downloaded += 1
time.sleep(IMAGE_REQUEST_DELAY)
return total_downloaded
def log_image_stats(self) -> None:
"""Log image download statistics."""
stats = self.image_stats
total = stats["downloaded"] + stats["skipped"] + stats["failed"]
if total > 0:
logger.info(
f"Images: {stats['downloaded']} downloaded, "
f"{stats['skipped']} skipped, {stats['failed']} failed"
)
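# Programmatic usage sketch (requires network access; the card ID is the example
# from the module docstring):
#
#     scraper = PokemonPocketScraper(download_images=False)
#     card = scraper.scrape_card("a1-132-gardevoir")
#     if card is not None:
#         scraper.save_card(card)
#         scraper.generate_index()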
# =============================================================================
# CLI
# =============================================================================
def main() -> int:
"""Main entry point for the scraper CLI."""
parser = argparse.ArgumentParser(
description="Scrape Pokemon TCG Pocket card data from pokemon-zone.com",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
# Scrape entire set
uv run python scripts/scrape_pokemon_pocket.py --set a1
# Scrape with limit (for testing)
uv run python scripts/scrape_pokemon_pocket.py --set a1 --limit 5
# Scrape set with images
uv run python scripts/scrape_pokemon_pocket.py --set a1 --images
# Download images for existing card data
uv run python scripts/scrape_pokemon_pocket.py --download-images
uv run python scripts/scrape_pokemon_pocket.py --download-images --set a1
# Scrape single card by ID
uv run python scripts/scrape_pokemon_pocket.py --card a1-132-gardevoir
# Regenerate index from existing card files
uv run python scripts/scrape_pokemon_pocket.py --reindex
""",
)
# Main action group - what operation to perform
action_group = parser.add_mutually_exclusive_group(required=True)
action_group.add_argument(
"--set",
choices=list(SETS.keys()),
help="Scrape all cards from a set",
)
action_group.add_argument(
"--card",
type=str,
help="Scrape a single card by ID (e.g., a1-132-gardevoir)",
)
action_group.add_argument(
"--reindex",
action="store_true",
help="Regenerate index from existing card files",
)
action_group.add_argument(
"--download-images",
choices=list(SETS.keys()) + ["all"],
nargs="?",
const="all",
help="Download images for existing card data (specify set or 'all')",
)
parser.add_argument(
"--limit",
type=int,
help="Maximum number of cards to scrape (for testing)",
)
parser.add_argument(
"--images",
action="store_true",
help="Download card images while scraping",
)
args = parser.parse_args()
scraper = PokemonPocketScraper(download_images=args.images)
if args.reindex:
scraper.generate_index()
return 0
if args.download_images:
# Download images for existing cards
set_code = None if args.download_images == "all" else args.download_images
set_info = f"set {set_code}" if set_code else "all sets"
logger.info(f"Downloading images for existing card data ({set_info})...")
scraper.download_images = True # Enable downloads
downloaded = scraper.download_images_for_existing_cards(set_code)
scraper.log_image_stats()
logger.info(f"Image download complete: {downloaded} new images")
return 0
if args.card:
card = scraper.scrape_card(args.card)
if card:
scraper.save_card(card)
scraper.generate_index()
scraper.log_image_stats()
logger.info(f"Successfully scraped: {card.name}")
return 0
else:
logger.error(f"Failed to scrape card: {args.card}")
return 1
if args.set:
cards = scraper.scrape_set(args.set, limit=args.limit)
scraper.generate_index()
scraper.save_errors()
scraper.log_image_stats()
success_count = len(cards)
error_count = len(scraper.errors)
total = success_count + error_count
logger.info(f"Scraping complete: {success_count}/{total} cards succeeded")
if error_count > 0:
logger.warning(f"{error_count} errors occurred (see _errors.log)")
return 1
return 0
return 1
if __name__ == "__main__":
sys.exit(main())