mantimon-tcg/backend/scripts/scrape_pokemon_pocket.py
Cal Corum 50684a1b11 Add database infrastructure with SQLAlchemy models and test suite
Phase 1 Database Implementation (DB-001 through DB-012):

Models:
- User: OAuth support (Google/Discord), premium subscriptions
- Collection: Card ownership with CardSource enum
- Deck: JSONB cards/energy_cards, validation state
- CampaignProgress: One-to-one with User, medals/NPCs as JSONB
- ActiveGame: In-progress games with GameType enum
- GameHistory: Completed games with EndReason enum, replay data

Infrastructure:
- Alembic migrations with sync psycopg2 (avoids async issues)
- Docker Compose for Postgres (5433) and Redis (6380)
- App config with Pydantic settings
- Redis client helper

Test Infrastructure:
- 68 database tests (47 model + 21 relationship)
- Async factory pattern for test data creation
- Sync TRUNCATE cleanup (solves pytest-asyncio event loop mismatch)
- Uses dev containers instead of testcontainers for reliability

Key technical decisions:
- passive_deletes=True for ON DELETE SET NULL relationships
- NullPool for test sessions (no connection reuse)
- expire_on_commit=False with manual expire() for relationship tests
2026-01-27 10:17:30 -06:00

1138 lines
40 KiB
Python

#!/usr/bin/env python
"""Scrape Pokemon TCG Pocket card data from pokemon-zone.com.

This script fetches card data from the Genetic Apex (A1) and Mythical Island (A1a)
sets and saves each card as an individual JSON file for use in the Mantimon TCG
game engine.

Usage:
    # Scrape entire set
    uv run python scripts/scrape_pokemon_pocket.py --set a1

    # Scrape with limit (for testing)
    uv run python scripts/scrape_pokemon_pocket.py --set a1 --limit 5

    # Scrape single card by ID
    uv run python scripts/scrape_pokemon_pocket.py --card a1-132-gardevoir

    # Regenerate index from existing card files
    uv run python scripts/scrape_pokemon_pocket.py --reindex

Output:
    - Individual card files: data/cards/{set}/{number}-{name}.json
    - Combined index: data/cards/_index.json
    - Error log: data/cards/_errors.log
"""
import argparse
import json
import logging
import re
import sys
import time
from dataclasses import dataclass, field
from datetime import UTC, datetime
from pathlib import Path
from typing import Any
import requests
from bs4 import BeautifulSoup, Tag
# =============================================================================
# Configuration
# =============================================================================
# Site that hosts the set listings and the individual card pages.
BASE_URL = "https://www.pokemon-zone.com"

# Output locations, resolved relative to the directory above this script's folder.
_BACKEND_DIR = Path(__file__).parent.parent
DATA_DIR = _BACKEND_DIR / "data" / "cards"
IMAGES_DIR = _BACKEND_DIR / "data" / "images"

# Politeness delays and retry policy (all durations in seconds).
REQUEST_DELAY = 1.5  # pause between card page requests
IMAGE_REQUEST_DELAY = 0.5  # faster for images (different server)
MAX_RETRIES = 3
RETRY_DELAY = 5  # pause between retry attempts

# Set info for validation and metadata, keyed by set code.
SETS = {
    "a1": {
        "name": "Genetic Apex",
        "expected_cards": 286,
    },
    "a1a": {
        "name": "Mythical Island",
        "expected_cards": 86,
    },
}

# Energy type mapping: CSS class on an energy icon -> energy type name.
ENERGY_TYPES = {
    "energy-icon--type-grass": "grass",
    "energy-icon--type-fire": "fire",
    "energy-icon--type-water": "water",
    "energy-icon--type-lightning": "lightning",
    "energy-icon--type-psychic": "psychic",
    "energy-icon--type-fighting": "fighting",
    "energy-icon--type-darkness": "darkness",
    "energy-icon--type-metal": "metal",
    "energy-icon--type-colorless": "colorless",
    "energy-icon--type-dragon": "dragon",
}

# Rarity mapping: code taken from the CSS class suffix (rarity-icon--rarity-X)
# -> human-readable rarity name.
RARITY_CODES = {
    "C": "Common",
    "U": "Uncommon",
    "R": "Rare",
    "RR": "Double Rare",
    "AR": "Art Rare",
    "SAR": "Special Art Rare",
    "UR": "Ultra Rare",
    "IM": "Immersive",
    "S": "Shiny",
    "CR": "Crown Rare",
}
# =============================================================================
# Logging Setup
# =============================================================================
# Console logging at INFO and above, with a short wall-clock timestamp per line.
_LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
_LOG_DATE_FORMAT = "%H:%M:%S"
logging.basicConfig(format=_LOG_FORMAT, datefmt=_LOG_DATE_FORMAT, level=logging.INFO)

# Module-level logger shared by everything in this script.
logger = logging.getLogger(__name__)
# =============================================================================
# Data Classes
# =============================================================================
@dataclass
class Attack:
"""A Pokemon's attack."""
name: str
cost: list[str]
damage: int | None
damage_modifier: str | None # "+", "x", or None
effect_text: str | None
effect_id: str | None = None # To be mapped later
def to_dict(self) -> dict[str, Any]:
"""Convert to dictionary for JSON serialization."""
return {
"name": self.name,
"cost": self.cost,
"damage": self.damage,
"damage_modifier": self.damage_modifier,
"effect_text": self.effect_text,
"effect_id": self.effect_id,
}
@dataclass
class Ability:
"""A Pokemon's ability."""
name: str
effect_text: str
effect_id: str | None = None # To be mapped later
def to_dict(self) -> dict[str, Any]:
"""Convert to dictionary for JSON serialization."""
return {
"name": self.name,
"effect_text": self.effect_text,
"effect_id": self.effect_id,
}
@dataclass
class Card:
    """Complete data for one scraped card.

    The same field set is used for every card type; ``to_dict`` decides which
    fields are emitted based on ``card_type``. For trainer cards ``stage``
    holds the trainer type and ``flavor_text`` holds the effect text; for
    energy cards ``pokemon_type`` holds the energy type.
    """

    # Identity and set membership (emitted for every card type)
    id: str
    name: str
    set_code: str
    set_name: str
    card_number: int
    rarity: str
    card_type: str  # "pokemon", "trainer", "energy"
    image_url: str | None = None  # URL to card image for offline caching
    image_file: str | None = None  # Local path to downloaded image (relative to images dir)

    # Pokemon-specific data (some fields are reused by trainer/energy cards)
    hp: int | None = None
    pokemon_type: str | None = None
    stage: str | None = None  # "basic", "stage_1", "stage_2"
    evolves_from: str | None = None
    is_ex: bool = False
    abilities: list[Ability] = field(default_factory=list)
    attacks: list[Attack] = field(default_factory=list)
    weakness_type: str | None = None
    weakness_value: int | None = None
    resistance_type: str | None = None
    resistance_value: int | None = None
    retreat_cost: int = 0
    flavor_text: str | None = None
    illustrator: str | None = None
    source_url: str = ""

    def to_dict(self) -> dict[str, Any]:
        """Return the card as a plain dict ready for JSON serialization.

        The common keys come first; the remaining keys depend on
        ``card_type``. An unrecognised ``card_type`` yields only the common
        keys.
        """
        payload: dict[str, Any] = dict(
            id=self.id,
            name=self.name,
            set_code=self.set_code,
            set_name=self.set_name,
            card_number=self.card_number,
            rarity=self.rarity,
            card_type=self.card_type,
            image_url=self.image_url,
            image_file=self.image_file,
            source_url=self.source_url,
        )

        kind = self.card_type
        if kind == "pokemon":
            # Weakness/resistance are emitted as nested objects, or None when
            # no type was recorded.
            weakness = None
            if self.weakness_type:
                weakness = {"type": self.weakness_type, "value": self.weakness_value}
            resistance = None
            if self.resistance_type:
                resistance = {"type": self.resistance_type, "value": self.resistance_value}

            payload["hp"] = self.hp
            payload["pokemon_type"] = self.pokemon_type
            payload["stage"] = self.stage
            payload["evolves_from"] = self.evolves_from
            payload["is_ex"] = self.is_ex
            payload["abilities"] = [ability.to_dict() for ability in self.abilities]
            payload["attacks"] = [attack.to_dict() for attack in self.attacks]
            payload["weakness"] = weakness
            payload["resistance"] = resistance
            payload["retreat_cost"] = self.retreat_cost
            payload["flavor_text"] = self.flavor_text
            payload["illustrator"] = self.illustrator
        elif kind == "trainer":
            payload["trainer_type"] = self.stage  # Reusing stage field for trainer type
            payload["effect_text"] = self.flavor_text  # Trainer effect
            payload["illustrator"] = self.illustrator
        elif kind == "energy":
            payload["energy_type"] = self.pokemon_type
            payload["illustrator"] = self.illustrator

        return payload
# =============================================================================
# Scraper Class
# =============================================================================
class PokemonPocketScraper:
    """Scraper for Pokemon TCG Pocket card data from pokemon-zone.com.

    Fetches set and card pages through one shared ``requests`` session, parses
    them with BeautifulSoup into ``Card`` objects, and writes per-card JSON
    files, a combined index, an error log and (optionally) card images.

    Attributes:
        data_dir: Directory where card JSON files, the index and the error
            log are written.
        images_dir: Directory where downloaded card images are written.
        download_images: Whether scraping should also download card images.
        session: Shared HTTP session carrying the scraper's default headers.
        errors: Collected failures as ``{"url": ..., "error": ...}`` dicts.
        image_stats: Counters for downloaded, skipped and failed images.
    """

    # Label that precedes the artist's name on card pages.
    _ILLUSTRATOR_RE = re.compile(r"Illustrated by", re.IGNORECASE)
    # "HP" followed by the hit-point number; group 1 is the number.
    _HP_RE = re.compile(r"HP\s*(\d+)", re.IGNORECASE)
    # Containers whose energy icons are costs/effect text, not the card's type.
    _SUMMARY_ROW_RE = re.compile(r"^(attack|ability)-summary-row$")

    def __init__(
        self,
        data_dir: Path = DATA_DIR,
        images_dir: Path = IMAGES_DIR,
        download_images: bool = False,
    ):
        """Initialize the scraper.

        Args:
            data_dir: Directory to save card data files.
            images_dir: Directory to save downloaded card images.
            download_images: Whether to download card images.
        """
        self.data_dir = data_dir
        self.images_dir = images_dir
        self.download_images = download_images
        self.session = requests.Session()
        self.session.headers.update(
            {
                "User-Agent": "MantimonTCG-CardScraper/1.0 (https://github.com/mantimon-tcg)",
                "Accept": "text/html,application/xhtml+xml",
            }
        )
        self.errors: list[dict[str, Any]] = []
        self.image_stats = {"downloaded": 0, "skipped": 0, "failed": 0}

    def fetch_page(self, url: str) -> BeautifulSoup | None:
        """Fetch a page with retry logic.

        Makes up to ``MAX_RETRIES`` attempts, sleeping ``RETRY_DELAY`` seconds
        between them. When every attempt fails, an entry is appended to
        ``self.errors``.

        Args:
            url: URL to fetch.

        Returns:
            BeautifulSoup object or None if all retries failed.
        """
        for attempt in range(MAX_RETRIES):
            try:
                response = self.session.get(url, timeout=30)
                response.raise_for_status()
                return BeautifulSoup(response.text, "html.parser")
            except requests.RequestException as e:
                logger.warning(f"Attempt {attempt + 1}/{MAX_RETRIES} failed for {url}: {e}")
                if attempt < MAX_RETRIES - 1:
                    time.sleep(RETRY_DELAY)

        self.errors.append({"url": url, "error": "Max retries exceeded"})
        return None

    def download_image(self, card: Card) -> str | None:
        """Download a card image and save it locally.

        Updates ``self.image_stats``: "skipped" when the file is already on
        disk, "downloaded" on success, "failed" otherwise.

        Args:
            card: Card object with image_url set.

        Returns:
            Relative path to the saved image, or None if download failed.
        """
        if not card.image_url:
            return None

        # Create directory structure: images/{set_code}/
        set_dir = self.images_dir / card.set_code
        set_dir.mkdir(parents=True, exist_ok=True)

        # Determine file extension from URL
        url_path = card.image_url.split("?")[0]  # Remove query params
        ext = Path(url_path).suffix or ".webp"

        # Generate filename: {number:03d}-{name}{ext}
        url_name = card.id.split("-", 2)[2]  # Get name part from ID
        filename = f"{card.card_number:03d}-{url_name}{ext}"
        filepath = set_dir / filename
        relative_path = f"{card.set_code}/{filename}"

        # Skip if already downloaded
        if filepath.exists():
            logger.debug(f"Image exists, skipping: {relative_path}")
            self.image_stats["skipped"] += 1
            return relative_path

        # Download the image with appropriate headers
        image_headers = {
            "Accept": "image/webp,image/apng,image/*,*/*;q=0.8",
            "Referer": "https://www.pokemon-zone.com/",
        }
        for attempt in range(MAX_RETRIES):
            try:
                response = self.session.get(card.image_url, timeout=30, headers=image_headers)
                response.raise_for_status()

                # Verify it's an image
                content_type = response.headers.get("content-type", "")
                if not content_type.startswith("image/"):
                    logger.warning(f"Not an image: {content_type} for {card.image_url}")
                    self.image_stats["failed"] += 1
                    return None

                # Save the image
                with open(filepath, "wb") as f:
                    f.write(response.content)
                logger.debug(f"Downloaded: {relative_path} ({len(response.content)} bytes)")
                self.image_stats["downloaded"] += 1
                return relative_path
            except requests.RequestException as e:
                logger.warning(f"Image download attempt {attempt + 1}/{MAX_RETRIES} failed: {e}")
                if attempt < MAX_RETRIES - 1:
                    time.sleep(RETRY_DELAY)

        logger.error(f"Failed to download image for {card.id}")
        self.image_stats["failed"] += 1
        return None

    def get_card_urls_for_set(self, set_code: str) -> list[tuple[int, str, str]]:
        """Get all card URLs for a set.

        Args:
            set_code: Set code (e.g., "a1", "a1a").

        Returns:
            List of (card_number, card_name, url) tuples sorted by card
            number, or an empty list when the set page could not be fetched.
        """
        set_url = f"{BASE_URL}/sets/{set_code}/"
        logger.info(f"Fetching set page: {set_url}")
        soup = self.fetch_page(set_url)
        if not soup:
            logger.error(f"Failed to fetch set page for {set_code}")
            return []

        cards: list[tuple[int, str, str]] = []
        seen: set[tuple[int, str, str]] = set()
        # Find all card links - they follow pattern /cards/{set}/{number}/{name}/
        # The set code is escaped so it is always matched literally.
        pattern = re.compile(rf"^/cards/{re.escape(set_code)}/(\d+)/([^/]+)/$")
        for link in soup.find_all("a", href=pattern):
            href = link.get("href", "")
            match = pattern.match(href)
            if not match:
                continue
            card_tuple = (int(match.group(1)), match.group(2), f"{BASE_URL}{href}")
            # Avoid duplicates (page may have multiple links to same card)
            if card_tuple not in seen:
                seen.add(card_tuple)
                cards.append(card_tuple)

        # Sort by card number
        cards.sort(key=lambda x: x[0])
        logger.info(f"Found {len(cards)} cards in set {set_code}")
        return cards

    def parse_energy_type(self, element: Tag | None) -> str | None:
        """Extract energy type from an element containing an energy icon.

        Args:
            element: BeautifulSoup element that may contain energy icons.

        Returns:
            Energy type of the first recognised icon, or None.
        """
        if not element:
            return None
        for icon in element.find_all("span", class_=re.compile(r"energy-icon--type-")):
            for cls in icon.get("class", []):
                if cls in ENERGY_TYPES:
                    return ENERGY_TYPES[cls]
        return None

    def parse_attack(self, attack_row: Tag) -> Attack | None:
        """Parse an attack from an attack-summary-row element.

        Args:
            attack_row: BeautifulSoup element for the attack row.

        Returns:
            Attack object or None if parsing failed.
        """
        try:
            # Get attack name
            name_elem = attack_row.find(class_="attack-summary-row__name")
            if not name_elem:
                return None
            name = name_elem.get_text(strip=True)

            # Get energy cost
            cost: list[str] = []
            costs_elem = attack_row.find(class_="attack-summary-row__costs")
            if costs_elem:
                for cost_icon in costs_elem.find_all("span", class_=re.compile(r"energy-icon")):
                    for cls in cost_icon.get("class", []):
                        if cls in ENERGY_TYPES:
                            cost.append(ENERGY_TYPES[cls])

            # Get damage
            damage: int | None = None
            damage_modifier: str | None = None
            damage_elem = attack_row.find(class_="attack-summary-row__damage")
            if damage_elem:
                damage_text = damage_elem.get_text(strip=True)
                # Parse damage like "60", "50+", "100x". The multiplication
                # sign "×" is accepted too and normalised to "x" so the stored
                # modifier is always "+", "x" or None.
                match = re.match(r"(\d+)([+x×])?", damage_text)
                if match:
                    damage = int(match.group(1))
                    modifier = match.group(2)
                    damage_modifier = "x" if modifier == "×" else modifier

            # Get effect text (empty text is stored as None)
            effect_text: str | None = None
            footer_elem = attack_row.find(class_="attack-summary-row__footer")
            if footer_elem:
                effect_text = footer_elem.get_text(strip=True) or None

            return Attack(
                name=name,
                cost=cost,
                damage=damage,
                damage_modifier=damage_modifier,
                effect_text=effect_text,
            )
        except Exception as e:
            # Best effort: one malformed attack row must not abort the card.
            logger.warning(f"Failed to parse attack: {e}")
            return None

    def parse_ability(self, ability_row: Tag) -> Ability | None:
        """Parse an ability from an ability-summary-row element.

        Args:
            ability_row: BeautifulSoup element for the ability row.

        Returns:
            Ability object or None if parsing failed.
        """
        try:
            # Get ability name (text after "Ability" badge)
            name_elem = ability_row.find(class_="ability-summary-row__name")
            if not name_elem:
                return None
            # Remove the "Ability" badge text to get just the name
            name_text = name_elem.get_text(strip=True)
            name = re.sub(r"^Ability\s*", "", name_text)

            # Get effect text
            desc_elem = ability_row.find(class_="ability-summary-row__description")
            effect_text = desc_elem.get_text(strip=True) if desc_elem else ""
            return Ability(name=name, effect_text=effect_text)
        except Exception as e:
            # Best effort: one malformed ability row must not abort the card.
            logger.warning(f"Failed to parse ability: {e}")
            return None

    def parse_card_page(self, soup: BeautifulSoup, url: str, set_code: str) -> Card | None:
        """Parse a card page into a Card object.

        Any exception raised while parsing is logged, recorded in
        ``self.errors`` and turned into a None result.

        Args:
            soup: BeautifulSoup object of the card page.
            url: URL of the card page (also the source of number and name).
            set_code: Set code for this card.

        Returns:
            Card object or None if parsing failed.
        """
        try:
            # Extract card number and name from URL
            match = re.search(rf"/cards/{re.escape(set_code)}/(\d+)/([^/]+)/", url)
            if not match:
                logger.error(f"Could not parse card URL: {url}")
                return None
            card_number = int(match.group(1))
            url_name = match.group(2)

            # Get card name from page
            name_elem = soup.find("h1")
            if not name_elem:
                logger.error(f"Could not find card name on page: {url}")
                return None
            name = name_elem.get_text(strip=True)

            # Determine card type - look for specific card type indicators
            # Trainers have "Trainer | Supporter/Item/Stadium" text
            # Energy cards have specific energy type text
            # Pokemon cards have "Pokémon | Basic/Stage 1/Stage 2" text
            card_type = "pokemon"  # Default
            # Check for Trainer first (more specific match)
            trainer_match = soup.find(
                string=re.compile(r"Trainer\s*\|\s*(Supporter|Item|Stadium)", re.IGNORECASE)
            )
            if trainer_match:
                card_type = "trainer"
            else:
                # Check for Energy
                # NOTE(review): this matches any text node ending in "Energy";
                # confirm against real pages that Pokemon cards never hit it.
                energy_match = soup.find(string=re.compile(r"Energy\s*$", re.IGNORECASE))
                if energy_match:
                    card_type = "energy"

            # Create card ID
            card_id = f"{set_code}-{card_number:03d}-{url_name}"

            # Get rarity from CSS class (rarity-icon--rarity-X)
            rarity = "Unknown"
            rarity_icon = soup.find("span", class_="rarity-icon")
            if rarity_icon:
                for cls in rarity_icon.get("class", []):
                    if "rarity-icon--rarity-" in cls:
                        rarity_code = cls.replace("rarity-icon--rarity-", "")
                        # Unknown codes are kept verbatim rather than dropped.
                        rarity = RARITY_CODES.get(rarity_code, rarity_code)
                        break

            # Get card image URL (first image in card-detail__card section)
            image_url: str | None = None
            card_section = soup.find("div", class_="card-detail__card")
            if card_section:
                img = card_section.find("img")
                if img:
                    image_url = img.get("src")
                    # Remove query params to get full resolution
                    if image_url and "?" in image_url:
                        image_url = image_url.split("?")[0]

            # is_ex: Check if name ends with " ex" (case insensitive)
            # This avoids false positives like "Exeggutor"
            is_ex = name.lower().endswith(" ex")

            card = Card(
                id=card_id,
                name=name,
                set_code=set_code,
                set_name=SETS.get(set_code, {}).get("name", set_code),
                card_number=card_number,
                rarity=rarity,
                card_type=card_type,
                image_url=image_url,
                source_url=url,
                is_ex=is_ex,
            )

            if card_type == "pokemon":
                self._parse_pokemon_details(soup, card)
            elif card_type == "trainer":
                self._parse_trainer_details(soup, card)
            elif card_type == "energy":
                self._parse_energy_details(soup, card)
            return card
        except Exception as e:
            logger.error(f"Failed to parse card page {url}: {e}")
            self.errors.append({"url": url, "error": str(e)})
            return None

    def _parse_illustrator(self, soup: BeautifulSoup) -> str | None:
        """Return the illustrator named after an "Illustrated by" label.

        The label is found and removed case-insensitively. When the matching
        text node holds only the label, the next non-blank text node is used
        instead.

        Args:
            soup: BeautifulSoup object of the card page.

        Returns:
            Illustrator name, or None when no non-empty name was found.
        """
        label_node = soup.find(string=self._ILLUSTRATOR_RE)
        if not label_node:
            return None
        name = self._ILLUSTRATOR_RE.sub("", str(label_node)).strip()
        if not name:
            # presumably the name sits in a sibling/child element such as a
            # link right after the label - TODO confirm against real pages
            following = label_node.find_next(string=re.compile(r"\S"))
            if following:
                name = str(following).strip()
        return name or None

    def _parse_pokemon_details(self, soup: BeautifulSoup, card: Card) -> None:
        """Parse Pokemon-specific details from the page.

        Args:
            soup: BeautifulSoup object of the card page.
            card: Card object to populate.
        """
        # Get HP - use the number captured right after "HP", not simply the
        # first number that happens to appear in the same text node.
        hp_node = soup.find(string=self._HP_RE)
        if hp_node:
            hp_found = self._HP_RE.search(str(hp_node))
            if hp_found:
                card.hp = int(hp_found.group(1))
        else:
            # Try finding HP in the stat display
            hp_elem = soup.find("span", string="HP")
            if hp_elem:
                hp_value = hp_elem.find_next("span")
                if hp_value:
                    hp_num = re.search(r"(\d+)", hp_value.get_text(strip=True))
                    if hp_num:
                        card.hp = int(hp_num.group(1))

        # Get Pokemon type from the first energy icon that is NOT inside an
        # attack or ability row. Checking every ancestor (instead of only the
        # direct parent) also skips cost icons nested under
        # attack-summary-row__costs and icons embedded in effect text.
        for icon in soup.find_all("span", class_=re.compile(r"energy-icon--type-")):
            if icon.find_parent(class_=self._SUMMARY_ROW_RE) is not None:
                continue
            for cls in icon.get("class", []):
                if cls in ENERGY_TYPES:
                    card.pokemon_type = ENERGY_TYPES[cls]
                    break
            if card.pokemon_type:
                break

        # Get stage and evolution info. Prefer the "Pokémon | <stage>" marker;
        # fall back to the first text mentioning a stage.
        stage_text = soup.find(
            string=re.compile(r"Pok[eé]mon\s*\|\s*(Basic|Stage 1|Stage 2)", re.IGNORECASE)
        ) or soup.find(string=re.compile(r"Basic|Stage 1|Stage 2", re.IGNORECASE))
        if stage_text:
            stage_lower = str(stage_text).lower()
            if "stage 2" in stage_lower:
                card.stage = "stage_2"
            elif "stage 1" in stage_lower:
                card.stage = "stage_1"
            elif "basic" in stage_lower:
                card.stage = "basic"

        # Get evolves_from
        evolves_match = soup.find(string=re.compile(r"Evolves from", re.IGNORECASE))
        if evolves_match:
            # Try to find the Pokemon name link nearby
            parent = evolves_match.parent if hasattr(evolves_match, "parent") else None
            if parent:
                link = parent.find("a")
                if link:
                    card.evolves_from = link.get_text(strip=True)

        # Get abilities
        for ability_row in soup.find_all(class_="ability-summary-row"):
            ability = self.parse_ability(ability_row)
            if ability:
                card.abilities.append(ability)

        # Get attacks
        for attack_row in soup.find_all(class_="attack-summary-row"):
            attack = self.parse_attack(attack_row)
            if attack:
                card.attacks.append(attack)

        # Get weakness
        weakness_section = soup.find(string=re.compile(r"Weakness", re.IGNORECASE))
        if weakness_section:
            parent = weakness_section.parent
            if parent:
                container = parent.parent
                card.weakness_type = self.parse_energy_type(container)
                # Look for +20 pattern
                value_match = re.search(r"\+(\d+)", container.get_text() if container else "")
                if value_match:
                    card.weakness_value = int(value_match.group(1))

        # Get retreat cost (count colorless energy icons in retreat section)
        retreat_section = soup.find(string=re.compile(r"Retreat", re.IGNORECASE))
        if retreat_section:
            parent = retreat_section.parent
            if parent and parent.parent:
                retreat_icons = parent.parent.find_all(
                    "span", class_=re.compile(r"energy-icon--type-colorless")
                )
                card.retreat_cost = len(retreat_icons)

        # Get illustrator
        card.illustrator = self._parse_illustrator(soup)

        # Get flavor text (Pokemon description): the first reasonably long
        # paragraph that is not an artwork credit.
        for p in soup.find_all("p"):
            text = p.get_text(strip=True)
            if text and len(text) > 50 and "Illustrated" not in text and "Artwork" not in text:
                card.flavor_text = text
                break

    def _parse_trainer_details(self, soup: BeautifulSoup, card: Card) -> None:
        """Parse Trainer-specific details from the page.

        Stores the trainer type in ``card.stage`` and the effect text in
        ``card.flavor_text``.

        Args:
            soup: BeautifulSoup object of the card page.
            card: Card object to populate.
        """
        # Get trainer type (Item, Supporter, Stadium) from "Trainer | Type" text
        type_match = soup.find(
            string=re.compile(r"Trainer\s*\|\s*(Supporter|Item|Stadium)", re.IGNORECASE)
        )
        if type_match:
            match = re.search(r"(Supporter|Item|Stadium)", str(type_match), re.IGNORECASE)
            if match:
                card.stage = match.group(1).lower()

        # Get effect text - look for card-detail__content-body
        content_body = soup.find("div", class_="card-detail__content-body")
        if content_body:
            # Get full text from content body
            full_text = content_body.get_text(strip=True)
            # Remove illustrator info at the end
            if "Illustrated by" in full_text:
                full_text = full_text.split("Illustrated by")[0].strip()
            if full_text:
                card.flavor_text = full_text
        else:
            # Fallback: look for any paragraph with effect-like text
            for elem in soup.find_all("p"):
                text = elem.get_text(strip=True)
                if (
                    text
                    and len(text) > 20
                    and "Illustrated" not in text
                    and "Artwork" not in text
                    and "Pokemon Zone" not in text
                    and "unofficial" not in text.lower()
                ):
                    card.flavor_text = text
                    break

        # Get illustrator
        card.illustrator = self._parse_illustrator(soup)

    def _parse_energy_details(self, soup: BeautifulSoup, card: Card) -> None:
        """Parse Energy-specific details from the page.

        Stores the energy type in ``card.pokemon_type``.

        Args:
            soup: BeautifulSoup object of the card page.
            card: Card object to populate.
        """
        # Get energy type from the page
        card.pokemon_type = self.parse_energy_type(soup)
        # Get illustrator
        card.illustrator = self._parse_illustrator(soup)

    def scrape_card(self, card_id: str) -> Card | None:
        """Scrape a single card by ID.

        Args:
            card_id: Card ID in format "{set}-{number}-{name}" (e.g., "a1-132-gardevoir").

        Returns:
            Card object or None if scraping failed.
        """
        # Parse card ID
        match = re.match(r"([a-z0-9]+)-(\d+)-(.+)", card_id)
        if not match:
            logger.error(f"Invalid card ID format: {card_id}")
            return None
        set_code = match.group(1)
        card_number = int(match.group(2))  # int() drops zero padding ("001" -> 1)
        card_name = match.group(3)

        url = f"{BASE_URL}/cards/{set_code}/{card_number}/{card_name}/"
        logger.info(f"Scraping card: {url}")
        soup = self.fetch_page(url)
        if not soup:
            return None

        card = self.parse_card_page(soup, url, set_code)
        # Download image if enabled
        if card and self.download_images and card.image_url:
            card.image_file = self.download_image(card)
        return card

    def scrape_set(self, set_code: str, limit: int | None = None) -> list[Card]:
        """Scrape all cards from a set.

        Every successfully parsed card is saved to disk as soon as it has
        been scraped.

        Args:
            set_code: Set code (e.g., "a1", "a1a").
            limit: Maximum number of cards to scrape (for testing). None
                means no limit; zero or a negative value scrapes nothing.

        Returns:
            List of Card objects.
        """
        if set_code not in SETS:
            logger.error(f"Unknown set code: {set_code}")
            return []

        card_urls = self.get_card_urls_for_set(set_code)
        # Compare against None so that limit=0 is an actual limit rather than
        # "no limit"; clamp negatives so they cannot slice from the end.
        if limit is not None:
            card_urls = card_urls[: max(limit, 0)]

        cards: list[Card] = []
        total = len(card_urls)
        for i, (card_number, card_name, url) in enumerate(card_urls, 1):
            logger.info(f"[{i}/{total}] Scraping: {card_name} (#{card_number})")
            soup = self.fetch_page(url)
            if soup:
                card = self.parse_card_page(soup, url, set_code)
                if card:
                    # Download image if enabled
                    if self.download_images and card.image_url:
                        card.image_file = self.download_image(card)
                        time.sleep(IMAGE_REQUEST_DELAY)
                    cards.append(card)
                    self.save_card(card)
            # Rate limiting
            if i < total:
                time.sleep(REQUEST_DELAY)
        return cards

    def save_card(self, card: Card) -> Path:
        """Save a card to a JSON file.

        Args:
            card: Card object to save.

        Returns:
            Path to the saved file.
        """
        set_dir = self.data_dir / card.set_code
        set_dir.mkdir(parents=True, exist_ok=True)

        # Generate filename: {number:03d}-{name}.json
        url_name = card.id.split("-", 2)[2]  # Get name part from ID
        filename = f"{card.card_number:03d}-{url_name}.json"
        filepath = set_dir / filename
        with open(filepath, "w", encoding="utf-8") as f:
            json.dump(card.to_dict(), f, indent=2, ensure_ascii=False)
        logger.debug(f"Saved: {filepath}")
        return filepath

    def generate_index(self) -> Path:
        """Generate the combined index file from existing card files.

        Only sets listed in ``SETS`` are indexed.

        Returns:
            Path to the index file.
        """
        logger.info("Generating index...")
        index: dict[str, Any] = {
            "generated_at": datetime.now(UTC).isoformat(),
            "schema_version": "1.0",
            "sets": {},
            "cards": [],
            "total_cards": 0,
        }

        for set_code in SETS:
            set_dir = self.data_dir / set_code
            if not set_dir.exists():
                continue
            card_files = sorted(set_dir.glob("*.json"))
            index["sets"][set_code] = {
                "name": SETS[set_code]["name"],
                "card_count": len(card_files),
            }
            for card_file in card_files:
                relative_path = f"{set_code}/{card_file.name}"
                with open(card_file, encoding="utf-8") as f:
                    card_data = json.load(f)
                index["cards"].append(
                    {
                        "id": card_data["id"],
                        "name": card_data["name"],
                        "set_code": set_code,
                        "card_number": card_data["card_number"],
                        "file": relative_path,
                    }
                )

        index["total_cards"] = len(index["cards"])
        # Sort cards by set and number
        index["cards"].sort(key=lambda x: (x["set_code"], x["card_number"]))

        # The data directory does not exist yet when nothing has been saved
        # (e.g. the very first run failed), so create it before writing.
        self.data_dir.mkdir(parents=True, exist_ok=True)
        index_path = self.data_dir / "_index.json"
        with open(index_path, "w", encoding="utf-8") as f:
            json.dump(index, f, indent=2, ensure_ascii=False)
        logger.info(f"Index generated: {index_path} ({index['total_cards']} cards)")
        return index_path

    def save_errors(self) -> Path | None:
        """Save error log if there were any errors.

        Returns:
            Path to the error log file, or None if no errors.
        """
        if not self.errors:
            return None

        # Errors are most likely precisely when no card was saved, in which
        # case the data directory may not exist yet.
        self.data_dir.mkdir(parents=True, exist_ok=True)
        error_log = self.data_dir / "_errors.log"
        with open(error_log, "w", encoding="utf-8") as f:
            f.write(f"Scraping errors - {datetime.now(UTC).isoformat()}\n")
            f.write("=" * 60 + "\n\n")
            for error in self.errors:
                f.write(f"URL: {error['url']}\n")
                f.write(f"Error: {error['error']}\n\n")
        logger.warning(f"Errors logged to: {error_log}")
        return error_log

    def download_images_for_existing_cards(self, set_code: str | None = None) -> int:
        """Download images for cards that already have JSON files.

        This is useful for downloading images separately from scraping,
        or for retrying failed image downloads. Each card's JSON file is
        updated with the image path once the image is available locally.

        Args:
            set_code: Optional set code to limit downloads to a specific set.

        Returns:
            Number of images actually downloaded; images that were already
            on disk are counted as skipped, not downloaded.
        """
        sets_to_process = [set_code] if set_code else list(SETS.keys())
        total_downloaded = 0

        for sc in sets_to_process:
            set_dir = self.data_dir / sc
            if not set_dir.exists():
                logger.warning(f"No card data found for set {sc}")
                continue
            card_files = sorted(set_dir.glob("*.json"))
            logger.info(f"Processing {len(card_files)} cards from set {sc}")

            for i, card_file in enumerate(card_files, 1):
                with open(card_file, encoding="utf-8") as f:
                    card_data = json.load(f)
                image_url = card_data.get("image_url")
                existing_file = card_data.get("image_file")
                if not image_url:
                    continue

                # Create a minimal Card object for download
                card = Card(
                    id=card_data["id"],
                    name=card_data["name"],
                    set_code=card_data["set_code"],
                    set_name=card_data["set_name"],
                    card_number=card_data["card_number"],
                    rarity=card_data["rarity"],
                    card_type=card_data["card_type"],
                    image_url=image_url,
                )

                # Check if image already exists
                if existing_file:
                    image_path = self.images_dir / existing_file
                    if image_path.exists():
                        self.image_stats["skipped"] += 1
                        continue

                logger.info(f"[{i}/{len(card_files)}] Downloading: {card.name}")
                downloaded_before = self.image_stats["downloaded"]
                image_file = self.download_image(card)
                # download_image also returns a path when the file was already
                # on disk; only a bumped counter means a real download.
                fetched = self.image_stats["downloaded"] > downloaded_before

                if image_file:
                    # Update the JSON file with the image path
                    card_data["image_file"] = image_file
                    with open(card_file, "w", encoding="utf-8") as f:
                        json.dump(card_data, f, indent=2, ensure_ascii=False)
                if fetched:
                    total_downloaded += 1
                    # Rate-limit only after an actual download.
                    time.sleep(IMAGE_REQUEST_DELAY)

        return total_downloaded

    def log_image_stats(self) -> None:
        """Log image download statistics (nothing is logged when all are zero)."""
        stats = self.image_stats
        total = stats["downloaded"] + stats["skipped"] + stats["failed"]
        if total > 0:
            logger.info(
                f"Images: {stats['downloaded']} downloaded, "
                f"{stats['skipped']} skipped, {stats['failed']} failed"
            )
# =============================================================================
# CLI
# =============================================================================
def main() -> int:
    """Main entry point for the scraper CLI.

    Exactly one action must be given: ``--set``, ``--card``, ``--reindex`` or
    ``--download-images``. ``--limit`` and ``--images`` modify scraping.

    Returns:
        Process exit code: 0 on success, 1 when a card could not be scraped
        or when scraping a set recorded any errors.
    """
    parser = argparse.ArgumentParser(
        description="Scrape Pokemon TCG Pocket card data from pokemon-zone.com",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        # --download-images takes the set code as its own optional value; it
        # cannot be combined with --set because both are in the same mutually
        # exclusive group.
        epilog="""
Examples:
    # Scrape entire set
    uv run python scripts/scrape_pokemon_pocket.py --set a1

    # Scrape with limit (for testing)
    uv run python scripts/scrape_pokemon_pocket.py --set a1 --limit 5

    # Scrape set with images
    uv run python scripts/scrape_pokemon_pocket.py --set a1 --images

    # Download images for existing card data (all sets, or one set)
    uv run python scripts/scrape_pokemon_pocket.py --download-images
    uv run python scripts/scrape_pokemon_pocket.py --download-images a1

    # Scrape single card by ID
    uv run python scripts/scrape_pokemon_pocket.py --card a1-132-gardevoir

    # Regenerate index from existing card files
    uv run python scripts/scrape_pokemon_pocket.py --reindex
""",
    )

    # Main action group - what operation to perform
    action_group = parser.add_mutually_exclusive_group(required=True)
    action_group.add_argument(
        "--set",
        choices=list(SETS.keys()),
        help="Scrape all cards from a set",
    )
    action_group.add_argument(
        "--card",
        type=str,
        help="Scrape a single card by ID (e.g., a1-132-gardevoir)",
    )
    action_group.add_argument(
        "--reindex",
        action="store_true",
        help="Regenerate index from existing card files",
    )
    action_group.add_argument(
        "--download-images",
        choices=list(SETS.keys()) + ["all"],
        nargs="?",
        const="all",
        help="Download images for existing card data (specify set or 'all')",
    )
    parser.add_argument(
        "--limit",
        type=int,
        help="Maximum number of cards to scrape (for testing)",
    )
    parser.add_argument(
        "--images",
        action="store_true",
        help="Download card images while scraping",
    )
    args = parser.parse_args()

    # Reject a non-positive limit up front instead of letting it be
    # interpreted further down in a surprising way.
    if args.limit is not None and args.limit < 1:
        parser.error("--limit must be a positive integer")

    scraper = PokemonPocketScraper(download_images=args.images)

    if args.reindex:
        scraper.generate_index()
        return 0

    if args.download_images:
        # Download images for existing cards
        set_code = None if args.download_images == "all" else args.download_images
        set_info = f"set {set_code}" if set_code else "all sets"
        logger.info(f"Downloading images for existing card data ({set_info})...")
        scraper.download_images = True  # Enable downloads
        downloaded = scraper.download_images_for_existing_cards(set_code)
        scraper.log_image_stats()
        logger.info(f"Image download complete: {downloaded} new images")
        return 0

    if args.card:
        card = scraper.scrape_card(args.card)
        if not card:
            logger.error(f"Failed to scrape card: {args.card}")
            return 1
        scraper.save_card(card)
        scraper.generate_index()
        scraper.log_image_stats()
        logger.info(f"Successfully scraped: {card.name}")
        return 0

    if args.set:
        cards = scraper.scrape_set(args.set, limit=args.limit)
        scraper.generate_index()
        scraper.save_errors()
        scraper.log_image_stats()

        success_count = len(cards)
        error_count = len(scraper.errors)
        total = success_count + error_count
        logger.info(f"Scraping complete: {success_count}/{total} cards succeeded")
        if error_count > 0:
            logger.warning(f"{error_count} errors occurred (see _errors.log)")
            return 1
        return 0

    return 1


if __name__ == "__main__":
    sys.exit(main())