The source website uses <span class='energy-text energy-text--type-fire'> to
render inline energy icons. BeautifulSoup's get_text() was stripping these
spans, losing the energy type information and causing merged text like
'Discard aEnergy' instead of 'Discard a Fire Energy'.

Changes:
- Add ENERGY_TEXT_TYPES mapping for inline energy references
- Add replace_energy_text_spans() to convert spans to text before extraction
- Add extract_effect_text() helper with proper text joining (separator=' ')
- Update parse_attack(), parse_ability(), _parse_trainer_details() to use it
- Fix JSON encoding in convert_cards.py to use UTF-8 (ensure_ascii=False)

Before: 'Discard an Energy from this Pokémon'
After:  'Discard a Fire Energy from this Pokémon'

Re-scraped all 372 cards and regenerated 382 definitions.
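A minimal sketch of the failure mode and of the span-replacement fix (the HTML
fragment is illustrative, not taken from the site):

    from bs4 import BeautifulSoup, NavigableString

    html = '<div>Discard a <span class="energy-text energy-text--type-fire"></span> Energy</div>'
    div = BeautifulSoup(html, "html.parser").find("div")

    # Before: the empty span contributes no text, so the neighbouring words merge.
    print(div.get_text(strip=True))  # 'Discard aEnergy'

    # After: replace the span with its text equivalent, then extract.
    for span in div.find_all("span", class_="energy-text--type-fire"):
        span.replace_with(NavigableString("Fire"))
    print(div.get_text(separator=" ", strip=True))  # 'Discard a Fire Energy'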
#!/usr/bin/env python
"""Scrape Pokemon TCG Pocket card data from pokemon-zone.com.

This script fetches card data from the Genetic Apex (A1) and Mythical Island (A1a)
sets and saves them as individual JSON files for use in the Mantimon TCG game engine.

Usage:
    # Scrape entire set
    uv run python scripts/scrape_pokemon_pocket.py --set a1

    # Scrape with limit (for testing)
    uv run python scripts/scrape_pokemon_pocket.py --set a1 --limit 5

    # Scrape single card by ID
    uv run python scripts/scrape_pokemon_pocket.py --card a1-132-gardevoir

    # Regenerate index from existing card files
    uv run python scripts/scrape_pokemon_pocket.py --reindex

Output:
    - Individual card files: data/raw/{set}/{number}-{name}.json
    - Combined index: data/raw/_index.json
    - Error log: data/raw/_errors.log
"""

import argparse
import json
import logging
import re
import sys
import time
from dataclasses import dataclass, field
from datetime import UTC, datetime
from pathlib import Path
from typing import Any

import requests
from bs4 import BeautifulSoup, Tag

# =============================================================================
# Configuration
# =============================================================================

BASE_URL = "https://www.pokemon-zone.com"
DATA_DIR = Path(__file__).parent.parent / "data" / "raw"
IMAGES_DIR = Path(__file__).parent.parent / "data" / "images"
REQUEST_DELAY = 1.5  # seconds between requests
IMAGE_REQUEST_DELAY = 0.5  # faster for images (different server)
MAX_RETRIES = 3
RETRY_DELAY = 5  # seconds

# Set info for validation and metadata
SETS = {
    "a1": {"name": "Genetic Apex", "expected_cards": 286},
    "a1a": {"name": "Mythical Island", "expected_cards": 86},
}

# Energy type mapping from CSS classes (for attack cost icons)
ENERGY_TYPES = {
    "energy-icon--type-grass": "grass",
    "energy-icon--type-fire": "fire",
    "energy-icon--type-water": "water",
    "energy-icon--type-lightning": "lightning",
    "energy-icon--type-psychic": "psychic",
    "energy-icon--type-fighting": "fighting",
    "energy-icon--type-darkness": "darkness",
    "energy-icon--type-metal": "metal",
    "energy-icon--type-colorless": "colorless",
    "energy-icon--type-dragon": "dragon",
}

# Energy text mapping from CSS classes (for inline text references)
# These appear in effect text like "Discard a <span class="energy-text--type-fire"></span> Energy"
ENERGY_TEXT_TYPES = {
    "energy-text--type-grass": "Grass",
    "energy-text--type-fire": "Fire",
    "energy-text--type-water": "Water",
    "energy-text--type-lightning": "Lightning",
    "energy-text--type-psychic": "Psychic",
    "energy-text--type-fighting": "Fighting",
    "energy-text--type-darkness": "Darkness",
    "energy-text--type-metal": "Metal",
    "energy-text--type-colorless": "Colorless",
    "energy-text--type-dragon": "Dragon",
}

# Rarity code mapping from CSS classes (rarity-icon--rarity-X)
RARITY_CODES = {
    "C": "Common",
    "U": "Uncommon",
    "R": "Rare",
    "RR": "Double Rare",
    "AR": "Art Rare",
    "SAR": "Special Art Rare",
    "UR": "Ultra Rare",
    "IM": "Immersive",
    "S": "Shiny",
    "CR": "Crown Rare",
}

# Fossil cards that are Trainer/Item cards, not Pokemon
# These are often misclassified because they have Pokemon-like layouts on the source site
FOSSIL_CARDS = {
    "Helix Fossil",
    "Dome Fossil",
    "Old Amber",
}

# Text artifact patterns to fix (caused by stripped energy icons)
# Format: (pattern, replacement)
# Energy icons render as empty strings, merging adjacent text
TEXT_ARTIFACT_FIXES = [
    # Energy-related artifacts
    (r"\baEnergy\b", "an Energy"),
    (r"\bofEnergy\b", "of Energy"),
    (r"\bextraEnergy\b", "extra Energy"),
    (r"\battachedEnergy\b", "attached Energy"),
    (r"\banyEnergy\b", "any Energy"),
    (r"(\d+)Energy\b", r"\1 Energy"),  # "2Energy" -> "2 Energy"
    (r"(\d+)-HP\b", r"\1 HP"),  # "40-HP" -> "40 HP"
    # Pokemon-related artifacts
    (r"\bBasicPokémon\b", "Basic Pokémon"),
    (r"\bBenchedPokémon\b", "Benched Pokémon"),
    (r"\bthePokémon\b", "the Pokémon"),
    (r"\bthisPokémon\b", "this Pokémon"),
    (r"\byourPokémon\b", "your Pokémon"),
    (r"\bActivePokémon\b", "Active Pokémon"),
    (r"\bDefendingPokémon\b", "Defending Pokémon"),
    (r"\bopponent'sPokémon\b", "opponent's Pokémon"),
    (r"\bOpponent'sPokémon\b", "Opponent's Pokémon"),
    # Other common artifacts
    (r"\bthatPokémon\b", "that Pokémon"),
    (r"\beachPokémon\b", "each Pokémon"),
    (r"\baPokémon\b", "a Pokémon"),
]
def replace_energy_text_spans(element: Tag) -> None:
    """Replace energy-text spans with their text representation in-place.

    The source website uses <span class="energy-text energy-text--type-fire"></span>
    to render inline energy type icons in effect text. When BeautifulSoup extracts
    text with get_text(), these empty spans disappear, merging adjacent words.

    This function finds all such spans and replaces them with their text equivalent
    (e.g., "Fire") BEFORE text extraction, preserving the energy type information.

    Args:
        element: BeautifulSoup element to process in-place. The element is modified
            directly - energy-text spans are replaced with NavigableString text.

    Example:
        >>> soup = BeautifulSoup(
        ...     '<div>Discard a <span class="energy-text--type-fire"></span> Energy</div>',
        ...     "html.parser",
        ... )
        >>> div = soup.find('div')
        >>> replace_energy_text_spans(div)
        >>> div.get_text()
        'Discard a Fire Energy'
    """
    from bs4 import NavigableString

    # Find all energy-text spans (they may have multiple classes)
    for span in element.find_all("span", class_=re.compile(r"energy-text--type-")):
        energy_type = None
        for cls in span.get("class", []):
            if cls in ENERGY_TEXT_TYPES:
                energy_type = ENERGY_TEXT_TYPES[cls]
                break

        if energy_type:
            # Replace the span with a text node
            span.replace_with(NavigableString(energy_type))
def extract_effect_text(element: Tag) -> str | None:
    """Extract effect text from an element, properly handling energy-text spans.

    This is the correct way to extract effect text that may contain energy type
    icons. It replaces the energy-text spans with readable text before extraction.

    Args:
        element: BeautifulSoup element containing effect text.

    Returns:
        Cleaned effect text, or None if empty.
    """
    # Work on a deep copy to avoid modifying the original soup
    # BeautifulSoup's copy.copy is shallow - we need to re-parse
    from bs4 import BeautifulSoup as BS

    element_copy = BS(str(element), "html.parser")
    replace_energy_text_spans(element_copy)
    # Use separator=" " to properly join text nodes with spaces
    # strip=True removes leading/trailing whitespace from each text node
    raw_text = element_copy.get_text(separator=" ", strip=True)
    # Normalize multiple spaces to single spaces
    raw_text = re.sub(r"\s+", " ", raw_text).strip()
    # Still apply artifact fixes as fallback for any edge cases
    return clean_effect_text(raw_text)


def clean_effect_text(text: str | None) -> str | None:
    """Clean scraped effect text by fixing common artifacts.

    This is a fallback for any text artifacts not caught by energy-text span
    replacement. It handles edge cases like merged words.

    Args:
        text: The raw effect text from scraping.

    Returns:
        Cleaned text with artifacts fixed, or None if input was None/empty.
    """
    if not text:
        return None

    result = text
    for pattern, replacement in TEXT_ARTIFACT_FIXES:
        result = re.sub(pattern, replacement, result)

    return result if result else None
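# Illustrative example of the fallback (input is made up, not scraped): with the
# TEXT_ARTIFACT_FIXES patterns above, clean_effect_text("Discard aEnergy from 1
# of your Pokémon") returns "Discard an Energy from 1 of your Pokémon", and
# clean_effect_text("") returns None.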
# =============================================================================
# Logging Setup
# =============================================================================

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    datefmt="%H:%M:%S",
)
logger = logging.getLogger(__name__)


# =============================================================================
# Data Classes
# =============================================================================


@dataclass
class Attack:
    """A Pokemon's attack."""

    name: str
    cost: list[str]
    damage: int | None
    damage_modifier: str | None  # "+", "x", or None
    effect_text: str | None
    effect_id: str | None = None  # To be mapped later

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary for JSON serialization."""
        return {
            "name": self.name,
            "cost": self.cost,
            "damage": self.damage,
            "damage_modifier": self.damage_modifier,
            "effect_text": self.effect_text,
            "effect_id": self.effect_id,
        }


@dataclass
class Ability:
    """A Pokemon's ability."""

    name: str
    effect_text: str
    effect_id: str | None = None  # To be mapped later

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary for JSON serialization."""
        return {
            "name": self.name,
            "effect_text": self.effect_text,
            "effect_id": self.effect_id,
        }


@dataclass
class Card:
    """Complete card data."""

    id: str
    name: str
    set_code: str
    set_name: str
    card_number: int
    rarity: str
    card_type: str  # "pokemon", "trainer", "energy"
    image_url: str | None = None  # URL to card image for offline caching
    image_file: str | None = None  # Local path to downloaded image (relative to images dir)
    hp: int | None = None
    pokemon_type: str | None = None
    stage: str | None = None  # "basic", "stage_1", "stage_2"
    evolves_from: str | None = None
    is_ex: bool = False
    abilities: list[Ability] = field(default_factory=list)
    attacks: list[Attack] = field(default_factory=list)
    weakness_type: str | None = None
    weakness_value: int | None = None
    resistance_type: str | None = None
    resistance_value: int | None = None
    retreat_cost: int = 0
    flavor_text: str | None = None
    illustrator: str | None = None
    source_url: str = ""

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary for JSON serialization."""
        data: dict[str, Any] = {
            "id": self.id,
            "name": self.name,
            "set_code": self.set_code,
            "set_name": self.set_name,
            "card_number": self.card_number,
            "rarity": self.rarity,
            "card_type": self.card_type,
            "image_url": self.image_url,
            "image_file": self.image_file,
            "source_url": self.source_url,
        }

        if self.card_type == "pokemon":
            data.update(
                {
                    "hp": self.hp,
                    "pokemon_type": self.pokemon_type,
                    "stage": self.stage,
                    "evolves_from": self.evolves_from,
                    "is_ex": self.is_ex,
                    "abilities": [a.to_dict() for a in self.abilities],
                    "attacks": [a.to_dict() for a in self.attacks],
                    "weakness": (
                        {"type": self.weakness_type, "value": self.weakness_value}
                        if self.weakness_type
                        else None
                    ),
                    "resistance": (
                        {"type": self.resistance_type, "value": self.resistance_value}
                        if self.resistance_type
                        else None
                    ),
                    "retreat_cost": self.retreat_cost,
                    "flavor_text": self.flavor_text,
                    "illustrator": self.illustrator,
                }
            )
        elif self.card_type == "trainer":
            data.update(
                {
                    "trainer_type": self.stage,  # Reusing stage field for trainer type
                    "effect_text": self.flavor_text,  # Trainer effect
                    "illustrator": self.illustrator,
                }
            )
        elif self.card_type == "energy":
            data.update(
                {
                    "energy_type": self.pokemon_type,
                    "illustrator": self.illustrator,
                }
            )

        return data
# =============================================================================
# Scraper Class
# =============================================================================


class PokemonPocketScraper:
    """Scraper for Pokemon TCG Pocket card data from pokemon-zone.com."""

    def __init__(
        self,
        data_dir: Path = DATA_DIR,
        images_dir: Path = IMAGES_DIR,
        download_images: bool = False,
    ):
        """Initialize the scraper.

        Args:
            data_dir: Directory to save card data files.
            images_dir: Directory to save downloaded card images.
            download_images: Whether to download card images.
        """
        self.data_dir = data_dir
        self.images_dir = images_dir
        self.download_images = download_images
        self.session = requests.Session()
        self.session.headers.update(
            {
                "User-Agent": "MantimonTCG-CardScraper/1.0 (https://github.com/mantimon-tcg)",
                "Accept": "text/html,application/xhtml+xml",
            }
        )
        self.errors: list[dict[str, Any]] = []
        self.image_stats = {"downloaded": 0, "skipped": 0, "failed": 0}

    def fetch_page(self, url: str) -> BeautifulSoup | None:
        """Fetch a page with retry logic.

        Args:
            url: URL to fetch.

        Returns:
            BeautifulSoup object or None if all retries failed.
        """
        for attempt in range(MAX_RETRIES):
            try:
                response = self.session.get(url, timeout=30)
                response.raise_for_status()
                return BeautifulSoup(response.text, "html.parser")
            except requests.RequestException as e:
                logger.warning(f"Attempt {attempt + 1}/{MAX_RETRIES} failed for {url}: {e}")
                if attempt < MAX_RETRIES - 1:
                    time.sleep(RETRY_DELAY)

        self.errors.append({"url": url, "error": "Max retries exceeded"})
        return None

    def download_image(self, card: Card) -> str | None:
        """Download a card image and save it locally.

        Args:
            card: Card object with image_url set.

        Returns:
            Relative path to the saved image, or None if download failed.
        """
        if not card.image_url:
            return None

        # Create directory structure: images/{set_code}/
        set_dir = self.images_dir / card.set_code
        set_dir.mkdir(parents=True, exist_ok=True)

        # Determine file extension from URL
        url_path = card.image_url.split("?")[0]  # Remove query params
        ext = Path(url_path).suffix or ".webp"

        # Generate filename: {number:03d}-{name}{ext}
        url_name = card.id.split("-", 2)[2]  # Get name part from ID
        filename = f"{card.card_number:03d}-{url_name}{ext}"
        filepath = set_dir / filename
        relative_path = f"{card.set_code}/{filename}"

        # Skip if already downloaded
        if filepath.exists():
            logger.debug(f"Image exists, skipping: {relative_path}")
            self.image_stats["skipped"] += 1
            return relative_path

        # Download the image with appropriate headers
        image_headers = {
            "Accept": "image/webp,image/apng,image/*,*/*;q=0.8",
            "Referer": "https://www.pokemon-zone.com/",
        }
        for attempt in range(MAX_RETRIES):
            try:
                response = self.session.get(card.image_url, timeout=30, headers=image_headers)
                response.raise_for_status()

                # Verify it's an image
                content_type = response.headers.get("content-type", "")
                if not content_type.startswith("image/"):
                    logger.warning(f"Not an image: {content_type} for {card.image_url}")
                    self.image_stats["failed"] += 1
                    return None

                # Save the image
                with open(filepath, "wb") as f:
                    f.write(response.content)

                logger.debug(f"Downloaded: {relative_path} ({len(response.content)} bytes)")
                self.image_stats["downloaded"] += 1
                return relative_path

            except requests.RequestException as e:
                logger.warning(f"Image download attempt {attempt + 1}/{MAX_RETRIES} failed: {e}")
                if attempt < MAX_RETRIES - 1:
                    time.sleep(RETRY_DELAY)

        logger.error(f"Failed to download image for {card.id}")
        self.image_stats["failed"] += 1
        return None

    def get_card_urls_for_set(self, set_code: str) -> list[tuple[int, str, str]]:
        """Get all card URLs for a set.

        Args:
            set_code: Set code (e.g., "a1", "a1a").

        Returns:
            List of (card_number, card_name, url) tuples.
        """
        set_url = f"{BASE_URL}/sets/{set_code}/"
        logger.info(f"Fetching set page: {set_url}")

        soup = self.fetch_page(set_url)
        if not soup:
            logger.error(f"Failed to fetch set page for {set_code}")
            return []

        cards: list[tuple[int, str, str]] = []
        # Find all card links - they follow pattern /cards/{set}/{number}/{name}/
        pattern = re.compile(rf"^/cards/{set_code}/(\d+)/([^/]+)/$")

        for link in soup.find_all("a", href=pattern):
            href = link.get("href", "")
            match = pattern.match(href)
            if match:
                card_number = int(match.group(1))
                card_name = match.group(2)
                # Avoid duplicates (page may have multiple links to same card)
                card_tuple = (card_number, card_name, f"{BASE_URL}{href}")
                if card_tuple not in cards:
                    cards.append(card_tuple)

        # Sort by card number
        cards.sort(key=lambda x: x[0])
        logger.info(f"Found {len(cards)} cards in set {set_code}")

        return cards

    def parse_energy_type(self, element: Tag | None) -> str | None:
        """Extract energy type from an element containing an energy icon.

        Args:
            element: BeautifulSoup element that may contain energy icons.

        Returns:
            Energy type string or None.
        """
        if not element:
            return None

        for icon in element.find_all("span", class_=re.compile(r"energy-icon--type-")):
            for cls in icon.get("class", []):
                if cls in ENERGY_TYPES:
                    return ENERGY_TYPES[cls]

        return None

    def parse_attack(self, attack_row: Tag) -> Attack | None:
        """Parse an attack from an attack-summary-row element.

        Args:
            attack_row: BeautifulSoup element for the attack row.

        Returns:
            Attack object or None if parsing failed.
        """
        try:
            # Get attack name
            name_elem = attack_row.find(class_="attack-summary-row__name")
            if not name_elem:
                return None
            name = name_elem.get_text(strip=True)

            # Get energy cost
            cost: list[str] = []
            costs_elem = attack_row.find(class_="attack-summary-row__costs")
            if costs_elem:
                for cost_icon in costs_elem.find_all("span", class_=re.compile(r"energy-icon")):
                    for cls in cost_icon.get("class", []):
                        if cls in ENERGY_TYPES:
                            cost.append(ENERGY_TYPES[cls])

            # Get damage
            damage: int | None = None
            damage_modifier: str | None = None
            damage_elem = attack_row.find(class_="attack-summary-row__damage")
            if damage_elem:
                damage_text = damage_elem.get_text(strip=True)
                # Parse damage like "60", "50+", "100x"
                match = re.match(r"(\d+)([+x])?", damage_text)
                if match:
                    damage = int(match.group(1))
                    damage_modifier = match.group(2)

            # Get effect text (using extract_effect_text to preserve energy types)
            effect_text: str | None = None
            footer_elem = attack_row.find(class_="attack-summary-row__footer")
            if footer_elem:
                effect_text = extract_effect_text(footer_elem)

            return Attack(
                name=name,
                cost=cost,
                damage=damage,
                damage_modifier=damage_modifier,
                effect_text=effect_text,
            )

        except Exception as e:
            logger.warning(f"Failed to parse attack: {e}")
            return None
    def parse_ability(self, ability_row: Tag) -> Ability | None:
        """Parse an ability from an ability-summary-row element.

        Args:
            ability_row: BeautifulSoup element for the ability row.

        Returns:
            Ability object or None if parsing failed.
        """
        try:
            # Get ability name (text after "Ability" badge)
            name_elem = ability_row.find(class_="ability-summary-row__name")
            if not name_elem:
                return None

            # Remove the "Ability" badge text to get just the name
            name_text = name_elem.get_text(strip=True)
            name = re.sub(r"^Ability\s*", "", name_text)

            # Get effect text (using extract_effect_text to preserve energy types).
            # extract_effect_text may return None for an empty description, so fall
            # back to "" to satisfy Ability.effect_text's str type.
            desc_elem = ability_row.find(class_="ability-summary-row__description")
            effect_text = (extract_effect_text(desc_elem) if desc_elem else None) or ""

            return Ability(name=name, effect_text=effect_text)

        except Exception as e:
            logger.warning(f"Failed to parse ability: {e}")
            return None
    def parse_card_page(self, soup: BeautifulSoup, url: str, set_code: str) -> Card | None:
        """Parse a card page into a Card object.

        Args:
            soup: BeautifulSoup object of the card page.
            url: URL of the card page (for error logging).
            set_code: Set code for this card.

        Returns:
            Card object or None if parsing failed.
        """
        try:
            # Extract card number and name from URL
            match = re.search(rf"/cards/{set_code}/(\d+)/([^/]+)/", url)
            if not match:
                logger.error(f"Could not parse card URL: {url}")
                return None

            card_number = int(match.group(1))
            url_name = match.group(2)

            # Get card name from page
            name_elem = soup.find("h1")
            if not name_elem:
                logger.error(f"Could not find card name on page: {url}")
                return None
            name = name_elem.get_text(strip=True)

            # Determine card type - look for specific card type indicators
            # Trainers have "Trainer | Supporter/Item/Stadium" text
            # Energy cards have specific energy type text
            # Pokemon cards have "Pokémon | Basic/Stage 1/Stage 2" text
            card_type = "pokemon"  # Default
            trainer_type: str | None = None

            # Check for Fossil cards first (they appear as Pokemon on the site but are Items)
            if name in FOSSIL_CARDS:
                card_type = "trainer"
                trainer_type = "item"
                logger.info(f"Detected fossil card as Trainer/Item: {name}")
            # Check for Trainer (more specific match)
            elif soup.find(
                string=re.compile(r"Trainer\s*\|\s*(Supporter|Item|Stadium)", re.IGNORECASE)
            ):
                card_type = "trainer"
            # Check for Energy
            elif soup.find(string=re.compile(r"Energy\s*$", re.IGNORECASE)):
                card_type = "energy"

            # Create card ID
            card_id = f"{set_code}-{card_number:03d}-{url_name}"

            # Get rarity from CSS class (rarity-icon--rarity-X)
            rarity = "Unknown"
            rarity_icon = soup.find("span", class_="rarity-icon")
            if rarity_icon:
                for cls in rarity_icon.get("class", []):
                    if "rarity-icon--rarity-" in cls:
                        rarity_code = cls.replace("rarity-icon--rarity-", "")
                        rarity = RARITY_CODES.get(rarity_code, rarity_code)
                        break

            # Get card image URL (first image in card-detail__card section)
            image_url: str | None = None
            card_section = soup.find("div", class_="card-detail__card")
            if card_section:
                img = card_section.find("img")
                if img:
                    image_url = img.get("src")
                    # Remove query params to get full resolution
                    if image_url and "?" in image_url:
                        image_url = image_url.split("?")[0]

            # Initialize card
            # is_ex: Check if name ends with " ex" (case insensitive)
            # This avoids false positives like "Exeggutor"
            is_ex = name.lower().endswith(" ex")

            card = Card(
                id=card_id,
                name=name,
                set_code=set_code,
                set_name=SETS.get(set_code, {}).get("name", set_code),
                card_number=card_number,
                rarity=rarity,
                card_type=card_type,
                image_url=image_url,
                source_url=url,
                is_ex=is_ex,
            )

            if card_type == "pokemon":
                self._parse_pokemon_details(soup, card)
            elif card_type == "trainer":
                # For fossil cards, we already know the trainer_type
                if trainer_type:
                    card.stage = trainer_type
                self._parse_trainer_details(soup, card)
            elif card_type == "energy":
                self._parse_energy_details(soup, card)

            return card

        except Exception as e:
            logger.error(f"Failed to parse card page {url}: {e}")
            self.errors.append({"url": url, "error": str(e)})
            return None

    def _parse_pokemon_details(self, soup: BeautifulSoup, card: Card) -> None:
        """Parse Pokemon-specific details from the page.

        Args:
            soup: BeautifulSoup object of the card page.
            card: Card object to populate.
        """
        # Get HP
        hp_match = soup.find(string=re.compile(r"HP\s*(\d+)", re.IGNORECASE))
        if hp_match:
            hp_num = re.search(r"(\d+)", str(hp_match))
            if hp_num:
                card.hp = int(hp_num.group(1))
        else:
            # Try finding HP in the stat display
            hp_elem = soup.find("span", string="HP")
            if hp_elem:
                hp_value = hp_elem.find_next("span")
                if hp_value:
                    hp_text = hp_value.get_text(strip=True)
                    hp_num = re.search(r"(\d+)", hp_text)
                    if hp_num:
                        card.hp = int(hp_num.group(1))

        # Get Pokemon type from first energy icon NOT in an attack row
        # The card's type icon is in the header area, not in attack-summary-row__cost
        for icon in soup.find_all("span", class_=re.compile(r"energy-icon--type-")):
            parent = icon.parent
            parent_classes = parent.get("class", []) if parent else []
            # Skip if this is an attack cost icon
            if "attack-summary-row__cost" not in parent_classes:
                for cls in icon.get("class", []):
                    if cls in ENERGY_TYPES:
                        card.pokemon_type = ENERGY_TYPES[cls]
                        break
            if card.pokemon_type:
                break

        # Get stage and evolution info
        stage_text = soup.find(string=re.compile(r"Basic|Stage 1|Stage 2", re.IGNORECASE))
        if stage_text:
            stage_lower = str(stage_text).lower()
            if "stage 2" in stage_lower:
                card.stage = "stage_2"
            elif "stage 1" in stage_lower:
                card.stage = "stage_1"
            elif "basic" in stage_lower:
                card.stage = "basic"

        # Get evolves_from
        evolves_match = soup.find(string=re.compile(r"Evolves from", re.IGNORECASE))
        if evolves_match:
            # Try to find the Pokemon name link nearby
            parent = evolves_match.parent if hasattr(evolves_match, "parent") else None
            if parent:
                link = parent.find("a")
                if link:
                    card.evolves_from = link.get_text(strip=True)

        # Get abilities
        for ability_row in soup.find_all(class_="ability-summary-row"):
            ability = self.parse_ability(ability_row)
            if ability:
                card.abilities.append(ability)

        # Get attacks
        for attack_row in soup.find_all(class_="attack-summary-row"):
            attack = self.parse_attack(attack_row)
            if attack:
                card.attacks.append(attack)

        # Get weakness
        weakness_section = soup.find(string=re.compile(r"Weakness", re.IGNORECASE))
        if weakness_section:
            parent = weakness_section.parent
            if parent:
                card.weakness_type = self.parse_energy_type(parent.parent)
                # Look for +20 pattern
                value_match = re.search(
                    r"\+(\d+)", parent.parent.get_text() if parent.parent else ""
                )
                if value_match:
                    card.weakness_value = int(value_match.group(1))

        # Get retreat cost (count colorless energy icons in retreat section)
        retreat_section = soup.find(string=re.compile(r"Retreat", re.IGNORECASE))
        if retreat_section:
            parent = retreat_section.parent
            if parent and parent.parent:
                retreat_icons = parent.parent.find_all(
                    "span", class_=re.compile(r"energy-icon--type-colorless")
                )
                card.retreat_cost = len(retreat_icons)

        # Get illustrator
        illustrator_match = soup.find(string=re.compile(r"Illustrated by", re.IGNORECASE))
        if illustrator_match:
            card.illustrator = str(illustrator_match).replace("Illustrated by", "").strip()

        # Get flavor text (Pokemon description)
        # This is usually in a paragraph after the attacks section
        for p in soup.find_all("p"):
            text = p.get_text(strip=True)
            if text and len(text) > 50 and "Illustrated" not in text and "Artwork" not in text:
                card.flavor_text = text
                break
    def _parse_trainer_details(self, soup: BeautifulSoup, card: Card) -> None:
        """Parse Trainer-specific details from the page.

        Args:
            soup: BeautifulSoup object of the card page.
            card: Card object to populate.
        """
        # Get trainer type (Item, Supporter, Stadium) from "Trainer | Type" text
        type_match = soup.find(
            string=re.compile(r"Trainer\s*\|\s*(Supporter|Item|Stadium)", re.IGNORECASE)
        )
        if type_match:
            match = re.search(r"(Supporter|Item|Stadium)", str(type_match), re.IGNORECASE)
            if match:
                card.stage = match.group(1).lower()

        # Get effect text - look for card-detail__content-body
        # Using extract_effect_text to preserve energy type references
        content_body = soup.find("div", class_="card-detail__content-body")
        if content_body:
            # Extract effect text with proper energy type handling
            effect_text = extract_effect_text(content_body)
            # Remove illustrator info at the end
            if effect_text and "Illustrated by" in effect_text:
                effect_text = effect_text.split("Illustrated by")[0].strip()
            if effect_text:
                card.flavor_text = effect_text
        else:
            # Fallback: look for any paragraph with effect-like text
            for elem in soup.find_all("p"):
                effect_text = extract_effect_text(elem)
                if (
                    effect_text
                    and len(effect_text) > 20
                    and "Illustrated" not in effect_text
                    and "Artwork" not in effect_text
                    and "Pokemon Zone" not in effect_text
                    and "unofficial" not in effect_text.lower()
                ):
                    card.flavor_text = effect_text
                    break

        # Get illustrator
        illustrator_match = soup.find(string=re.compile(r"Illustrated by", re.IGNORECASE))
        if illustrator_match:
            card.illustrator = str(illustrator_match).replace("Illustrated by", "").strip()
    def _parse_energy_details(self, soup: BeautifulSoup, card: Card) -> None:
        """Parse Energy-specific details from the page.

        Args:
            soup: BeautifulSoup object of the card page.
            card: Card object to populate.
        """
        # Get energy type from the page
        card.pokemon_type = self.parse_energy_type(soup)

        # Get illustrator
        illustrator_match = soup.find(string=re.compile(r"Illustrated by", re.IGNORECASE))
        if illustrator_match:
            card.illustrator = str(illustrator_match).replace("Illustrated by", "").strip()

    def scrape_card(self, card_id: str) -> Card | None:
        """Scrape a single card by ID.

        Args:
            card_id: Card ID in format "{set}-{number}-{name}" (e.g., "a1-132-gardevoir").

        Returns:
            Card object or None if scraping failed.
        """
        # Parse card ID
        match = re.match(r"([a-z0-9]+)-(\d+)-(.+)", card_id)
        if not match:
            logger.error(f"Invalid card ID format: {card_id}")
            return None

        set_code = match.group(1)
        card_number = int(match.group(2))
        card_name = match.group(3)

        url = f"{BASE_URL}/cards/{set_code}/{card_number}/{card_name}/"
        logger.info(f"Scraping card: {url}")

        soup = self.fetch_page(url)
        if not soup:
            return None

        card = self.parse_card_page(soup, url, set_code)

        # Download image if enabled
        if card and self.download_images and card.image_url:
            card.image_file = self.download_image(card)

        return card

    def scrape_set(self, set_code: str, limit: int | None = None) -> list[Card]:
        """Scrape all cards from a set.

        Args:
            set_code: Set code (e.g., "a1", "a1a").
            limit: Maximum number of cards to scrape (for testing).

        Returns:
            List of Card objects.
        """
        if set_code not in SETS:
            logger.error(f"Unknown set code: {set_code}")
            return []

        card_urls = self.get_card_urls_for_set(set_code)
        if limit:
            card_urls = card_urls[:limit]

        cards: list[Card] = []
        total = len(card_urls)

        for i, (card_number, card_name, url) in enumerate(card_urls, 1):
            logger.info(f"[{i}/{total}] Scraping: {card_name} (#{card_number})")

            soup = self.fetch_page(url)
            if soup:
                card = self.parse_card_page(soup, url, set_code)
                if card:
                    # Download image if enabled
                    if self.download_images and card.image_url:
                        card.image_file = self.download_image(card)
                        time.sleep(IMAGE_REQUEST_DELAY)

                    cards.append(card)
                    self.save_card(card)

            # Rate limiting
            if i < total:
                time.sleep(REQUEST_DELAY)

        return cards

    def save_card(self, card: Card) -> Path:
        """Save a card to a JSON file.

        Args:
            card: Card object to save.

        Returns:
            Path to the saved file.
        """
        set_dir = self.data_dir / card.set_code
        set_dir.mkdir(parents=True, exist_ok=True)

        # Generate filename: {number:03d}-{name}.json
        url_name = card.id.split("-", 2)[2]  # Get name part from ID
        filename = f"{card.card_number:03d}-{url_name}.json"
        filepath = set_dir / filename

        with open(filepath, "w", encoding="utf-8") as f:
            json.dump(card.to_dict(), f, indent=2, ensure_ascii=False)

        logger.debug(f"Saved: {filepath}")
        return filepath

    def generate_index(self) -> Path:
        """Generate the combined index file from existing card files.

        Returns:
            Path to the index file.
        """
        logger.info("Generating index...")

        index: dict[str, Any] = {
            "generated_at": datetime.now(UTC).isoformat(),
            "schema_version": "1.0",
            "sets": {},
            "cards": [],
            "total_cards": 0,
        }

        for set_code in SETS:
            set_dir = self.data_dir / set_code
            if not set_dir.exists():
                continue

            card_files = sorted(set_dir.glob("*.json"))
            index["sets"][set_code] = {
                "name": SETS[set_code]["name"],
                "card_count": len(card_files),
            }

            for card_file in card_files:
                relative_path = f"{set_code}/{card_file.name}"
                with open(card_file, encoding="utf-8") as f:
                    card_data = json.load(f)

                index["cards"].append(
                    {
                        "id": card_data["id"],
                        "name": card_data["name"],
                        "set_code": set_code,
                        "card_number": card_data["card_number"],
                        "file": relative_path,
                    }
                )

        index["total_cards"] = len(index["cards"])

        # Sort cards by set and number
        index["cards"].sort(key=lambda x: (x["set_code"], x["card_number"]))

        index_path = self.data_dir / "_index.json"
        with open(index_path, "w", encoding="utf-8") as f:
            json.dump(index, f, indent=2, ensure_ascii=False)

        logger.info(f"Index generated: {index_path} ({index['total_cards']} cards)")
        return index_path

    def save_errors(self) -> Path | None:
        """Save error log if there were any errors.

        Returns:
            Path to the error log file, or None if no errors.
        """
        if not self.errors:
            return None

        error_log = self.data_dir / "_errors.log"
        with open(error_log, "w", encoding="utf-8") as f:
            f.write(f"Scraping errors - {datetime.now(UTC).isoformat()}\n")
            f.write("=" * 60 + "\n\n")
            for error in self.errors:
                f.write(f"URL: {error['url']}\n")
                f.write(f"Error: {error['error']}\n\n")

        logger.warning(f"Errors logged to: {error_log}")
        return error_log

    def download_images_for_existing_cards(self, set_code: str | None = None) -> int:
        """Download images for cards that already have JSON files.

        This is useful for downloading images separately from scraping,
        or for retrying failed image downloads.

        Args:
            set_code: Optional set code to limit downloads to a specific set.

        Returns:
            Number of images downloaded.
        """
        sets_to_process = [set_code] if set_code else list(SETS.keys())
        total_downloaded = 0

        for sc in sets_to_process:
            set_dir = self.data_dir / sc
            if not set_dir.exists():
                logger.warning(f"No card data found for set {sc}")
                continue

            card_files = sorted(set_dir.glob("*.json"))
            logger.info(f"Processing {len(card_files)} cards from set {sc}")

            for i, card_file in enumerate(card_files, 1):
                with open(card_file, encoding="utf-8") as f:
                    card_data = json.load(f)

                image_url = card_data.get("image_url")
                existing_file = card_data.get("image_file")

                if not image_url:
                    continue

                # Create a minimal Card object for download
                card = Card(
                    id=card_data["id"],
                    name=card_data["name"],
                    set_code=card_data["set_code"],
                    set_name=card_data["set_name"],
                    card_number=card_data["card_number"],
                    rarity=card_data["rarity"],
                    card_type=card_data["card_type"],
                    image_url=image_url,
                )

                # Check if image already exists
                if existing_file:
                    image_path = self.images_dir / existing_file
                    if image_path.exists():
                        self.image_stats["skipped"] += 1
                        continue

                logger.info(f"[{i}/{len(card_files)}] Downloading: {card.name}")
                image_file = self.download_image(card)

                if image_file:
                    # Update the JSON file with the image path
                    card_data["image_file"] = image_file
                    with open(card_file, "w", encoding="utf-8") as f:
                        json.dump(card_data, f, indent=2, ensure_ascii=False)
                    total_downloaded += 1

                time.sleep(IMAGE_REQUEST_DELAY)

        return total_downloaded

    def log_image_stats(self) -> None:
        """Log image download statistics."""
        stats = self.image_stats
        total = stats["downloaded"] + stats["skipped"] + stats["failed"]
        if total > 0:
            logger.info(
                f"Images: {stats['downloaded']} downloaded, "
                f"{stats['skipped']} skipped, {stats['failed']} failed"
            )
# =============================================================================
# CLI
# =============================================================================


def main() -> int:
    """Main entry point for the scraper CLI."""
    parser = argparse.ArgumentParser(
        description="Scrape Pokemon TCG Pocket card data from pokemon-zone.com",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Scrape entire set
  uv run python scripts/scrape_pokemon_pocket.py --set a1

  # Scrape with limit (for testing)
  uv run python scripts/scrape_pokemon_pocket.py --set a1 --limit 5

  # Scrape set with images
  uv run python scripts/scrape_pokemon_pocket.py --set a1 --images

  # Download images for existing card data (all sets, or one set)
  uv run python scripts/scrape_pokemon_pocket.py --download-images
  uv run python scripts/scrape_pokemon_pocket.py --download-images a1

  # Scrape single card by ID
  uv run python scripts/scrape_pokemon_pocket.py --card a1-132-gardevoir

  # Regenerate index from existing card files
  uv run python scripts/scrape_pokemon_pocket.py --reindex
""",
    )

    # Main action group - what operation to perform
    action_group = parser.add_mutually_exclusive_group(required=True)
    action_group.add_argument(
        "--set",
        choices=list(SETS.keys()),
        help="Scrape all cards from a set",
    )
    action_group.add_argument(
        "--card",
        type=str,
        help="Scrape a single card by ID (e.g., a1-132-gardevoir)",
    )
    action_group.add_argument(
        "--reindex",
        action="store_true",
        help="Regenerate index from existing card files",
    )
    action_group.add_argument(
        "--download-images",
        choices=list(SETS.keys()) + ["all"],
        nargs="?",
        const="all",
        help="Download images for existing card data (specify set or 'all')",
    )

    parser.add_argument(
        "--limit",
        type=int,
        help="Maximum number of cards to scrape (for testing)",
    )
    parser.add_argument(
        "--images",
        action="store_true",
        help="Download card images while scraping",
    )

    args = parser.parse_args()

    scraper = PokemonPocketScraper(download_images=args.images)

    if args.reindex:
        scraper.generate_index()
        return 0

    if args.download_images:
        # Download images for existing cards
        set_code = None if args.download_images == "all" else args.download_images
        set_info = f"set {set_code}" if set_code else "all sets"
        logger.info(f"Downloading images for existing card data ({set_info})...")
        scraper.download_images = True  # Enable downloads
        downloaded = scraper.download_images_for_existing_cards(set_code)
        scraper.log_image_stats()
        logger.info(f"Image download complete: {downloaded} new images")
        return 0

    if args.card:
        card = scraper.scrape_card(args.card)
        if card:
            scraper.save_card(card)
            scraper.generate_index()
            scraper.log_image_stats()
            logger.info(f"Successfully scraped: {card.name}")
            return 0
        else:
            logger.error(f"Failed to scrape card: {args.card}")
            return 1

    if args.set:
        cards = scraper.scrape_set(args.set, limit=args.limit)
        scraper.generate_index()
        scraper.save_errors()
        scraper.log_image_stats()

        success_count = len(cards)
        error_count = len(scraper.errors)
        total = success_count + error_count

        logger.info(f"Scraping complete: {success_count}/{total} cards succeeded")
        if error_count > 0:
            logger.warning(f"{error_count} errors occurred (see _errors.log)")
            return 1
        return 0

    return 1


if __name__ == "__main__":
    sys.exit(main())