Add image download support to card scraper
- Add --images flag to download images during scraping
- Add --download-images command to fetch images for existing card data
- Images saved to data/images/{set}/ directory
- Card JSON updated with image_file field (relative path)
- Uses Referer header for asset server compatibility
- Supports skip-if-exists for incremental downloads
This commit is contained in:
parent
2517d241ac
commit
5e1229aa7c
@ -1,5 +1,5 @@
|
||||
{
|
||||
"generated_at": "2026-01-27T04:45:04.962697+00:00",
|
||||
"generated_at": "2026-01-27T04:54:59.854435+00:00",
|
||||
"schema_version": "1.0",
|
||||
"sets": {
|
||||
"a1": {
|
||||
|
||||
@ -34,5 +34,6 @@
|
||||
"resistance": null,
|
||||
"retreat_cost": 1,
|
||||
"flavor_text": null,
|
||||
"illustrator": "Narumi Sato"
|
||||
"illustrator": "Narumi Sato",
|
||||
"image_file": "a1/001-bulbasaur.webp"
|
||||
}
|
||||
@ -35,5 +35,6 @@
|
||||
"resistance": null,
|
||||
"retreat_cost": 2,
|
||||
"flavor_text": null,
|
||||
"illustrator": "Kurata So"
|
||||
"illustrator": "Kurata So",
|
||||
"image_file": "a1/002-ivysaur.webp"
|
||||
}
|
||||
@ -36,5 +36,6 @@
|
||||
"resistance": null,
|
||||
"retreat_cost": 3,
|
||||
"flavor_text": null,
|
||||
"illustrator": "Ryota Murayama"
|
||||
"illustrator": "Ryota Murayama",
|
||||
"image_file": "a1/003-venusaur.webp"
|
||||
}
|
||||
@ -48,5 +48,6 @@
|
||||
"resistance": null,
|
||||
"retreat_cost": 3,
|
||||
"flavor_text": null,
|
||||
"illustrator": "PLANETA CG Works"
|
||||
"illustrator": "PLANETA CG Works",
|
||||
"image_file": "a1/004-venusaur-ex.webp"
|
||||
}
|
||||
@ -7,6 +7,7 @@
|
||||
"rarity": "Common",
|
||||
"card_type": "pokemon",
|
||||
"image_url": "https://assets.pokemon-zone.com/game-assets/CardPreviews/cPK_10_000050_00_CATERPIE_C.webp",
|
||||
"image_file": "a1/005-caterpie.webp",
|
||||
"source_url": "https://www.pokemon-zone.com/cards/a1/5/caterpie/",
|
||||
"hp": 50,
|
||||
"pokemon_type": "grass",
|
||||
|
||||
BIN
backend/data/images/a1/001-bulbasaur.webp
Normal file
BIN
backend/data/images/a1/001-bulbasaur.webp
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 43 KiB |
BIN
backend/data/images/a1/002-ivysaur.webp
Normal file
BIN
backend/data/images/a1/002-ivysaur.webp
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 38 KiB |
BIN
backend/data/images/a1/003-venusaur.webp
Normal file
BIN
backend/data/images/a1/003-venusaur.webp
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 46 KiB |
BIN
backend/data/images/a1/004-venusaur-ex.webp
Normal file
BIN
backend/data/images/a1/004-venusaur-ex.webp
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 54 KiB |
BIN
backend/data/images/a1/005-caterpie.webp
Normal file
BIN
backend/data/images/a1/005-caterpie.webp
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 38 KiB |
@ -43,7 +43,9 @@ from bs4 import BeautifulSoup, Tag
|
||||
|
||||
BASE_URL = "https://www.pokemon-zone.com"
|
||||
DATA_DIR = Path(__file__).parent.parent / "data" / "cards"
|
||||
IMAGES_DIR = Path(__file__).parent.parent / "data" / "images"
|
||||
REQUEST_DELAY = 1.5 # seconds between requests
|
||||
IMAGE_REQUEST_DELAY = 0.5 # faster for images (different server)
|
||||
MAX_RETRIES = 3
|
||||
RETRY_DELAY = 5 # seconds
|
||||
|
||||
@ -150,6 +152,7 @@ class Card:
|
||||
rarity: str
|
||||
card_type: str # "pokemon", "trainer", "energy"
|
||||
image_url: str | None = None # URL to card image for offline caching
|
||||
image_file: str | None = None # Local path to downloaded image (relative to images dir)
|
||||
hp: int | None = None
|
||||
pokemon_type: str | None = None
|
||||
stage: str | None = None # "basic", "stage_1", "stage_2"
|
||||
@ -177,6 +180,7 @@ class Card:
|
||||
"rarity": self.rarity,
|
||||
"card_type": self.card_type,
|
||||
"image_url": self.image_url,
|
||||
"image_file": self.image_file,
|
||||
"source_url": self.source_url,
|
||||
}
|
||||
|
||||
@ -232,13 +236,22 @@ class Card:
|
||||
class PokemonPocketScraper:
|
||||
"""Scraper for Pokemon TCG Pocket card data from pokemon-zone.com."""
|
||||
|
||||
def __init__(self, data_dir: Path = DATA_DIR):
|
||||
def __init__(
|
||||
self,
|
||||
data_dir: Path = DATA_DIR,
|
||||
images_dir: Path = IMAGES_DIR,
|
||||
download_images: bool = False,
|
||||
):
|
||||
"""Initialize the scraper.
|
||||
|
||||
Args:
|
||||
data_dir: Directory to save card data files.
|
||||
images_dir: Directory to save downloaded card images.
|
||||
download_images: Whether to download card images.
|
||||
"""
|
||||
self.data_dir = data_dir
|
||||
self.images_dir = images_dir
|
||||
self.download_images = download_images
|
||||
self.session = requests.Session()
|
||||
self.session.headers.update(
|
||||
{
|
||||
@ -247,6 +260,7 @@ class PokemonPocketScraper:
|
||||
}
|
||||
)
|
||||
self.errors: list[dict[str, Any]] = []
|
||||
self.image_stats = {"downloaded": 0, "skipped": 0, "failed": 0}
|
||||
|
||||
def fetch_page(self, url: str) -> BeautifulSoup | None:
|
||||
"""Fetch a page with retry logic.
|
||||
@ -270,6 +284,72 @@ class PokemonPocketScraper:
|
||||
self.errors.append({"url": url, "error": "Max retries exceeded"})
|
||||
return None
|
||||
|
||||
def download_image(self, card: Card) -> str | None:
    """Download a card image and save it locally.

    The image is written to ``{images_dir}/{set_code}/{number:03d}-{name}{ext}``
    and the method is idempotent: an already-present file is skipped and its
    relative path returned, so incremental re-runs are cheap.

    Args:
        card: Card object with image_url set.

    Returns:
        Relative path to the saved image (e.g. ``"a1/001-bulbasaur.webp"``),
        or None if the card has no image_url or the download failed.
    """
    if not card.image_url:
        return None

    # Create directory structure: images/{set_code}/
    set_dir = self.images_dir / card.set_code
    set_dir.mkdir(parents=True, exist_ok=True)

    # Determine file extension from URL (strip query params first)
    url_path = card.image_url.split("?")[0]
    ext = Path(url_path).suffix or ".webp"

    # Generate filename: {number:03d}-{name}{ext}
    # Card IDs look like "a1-1-bulbasaur": the name is the third segment.
    url_name = card.id.split("-", 2)[2]
    filename = f"{card.card_number:03d}-{url_name}{ext}"
    filepath = set_dir / filename
    # BUG FIX: this previously interpolated a literal placeholder instead of
    # the computed filename, so the path stored in card JSON (and returned on
    # the skip path) never matched the file actually written to filepath.
    relative_path = f"{card.set_code}/{filename}"

    # Skip if already downloaded (incremental support)
    if filepath.exists():
        logger.debug(f"Image exists, skipping: {relative_path}")
        self.image_stats["skipped"] += 1
        return relative_path

    # Download the image with appropriate headers; the asset server
    # requires a Referer from the main site.
    image_headers = {
        "Accept": "image/webp,image/apng,image/*,*/*;q=0.8",
        "Referer": "https://www.pokemon-zone.com/",
    }
    for attempt in range(MAX_RETRIES):
        try:
            response = self.session.get(card.image_url, timeout=30, headers=image_headers)
            response.raise_for_status()

            # Verify it's an image (guards against HTML error pages served with 200)
            content_type = response.headers.get("content-type", "")
            if not content_type.startswith("image/"):
                logger.warning(f"Not an image: {content_type} for {card.image_url}")
                self.image_stats["failed"] += 1
                return None

            # Save the image
            with open(filepath, "wb") as f:
                f.write(response.content)

            logger.debug(f"Downloaded: {relative_path} ({len(response.content)} bytes)")
            self.image_stats["downloaded"] += 1
            return relative_path

        except requests.RequestException as e:
            logger.warning(f"Image download attempt {attempt + 1}/{MAX_RETRIES} failed: {e}")
            if attempt < MAX_RETRIES - 1:
                time.sleep(RETRY_DELAY)

    logger.error(f"Failed to download image for {card.id}")
    self.image_stats["failed"] += 1
    return None
|
||||
|
||||
def get_card_urls_for_set(self, set_code: str) -> list[tuple[int, str, str]]:
|
||||
"""Get all card URLs for a set.
|
||||
|
||||
@ -676,7 +756,13 @@ class PokemonPocketScraper:
|
||||
if not soup:
|
||||
return None
|
||||
|
||||
return self.parse_card_page(soup, url, set_code)
|
||||
card = self.parse_card_page(soup, url, set_code)
|
||||
|
||||
# Download image if enabled
|
||||
if card and self.download_images and card.image_url:
|
||||
card.image_file = self.download_image(card)
|
||||
|
||||
return card
|
||||
|
||||
def scrape_set(self, set_code: str, limit: int | None = None) -> list[Card]:
|
||||
"""Scrape all cards from a set.
|
||||
@ -706,6 +792,11 @@ class PokemonPocketScraper:
|
||||
if soup:
|
||||
card = self.parse_card_page(soup, url, set_code)
|
||||
if card:
|
||||
# Download image if enabled
|
||||
if self.download_images and card.image_url:
|
||||
card.image_file = self.download_image(card)
|
||||
time.sleep(IMAGE_REQUEST_DELAY)
|
||||
|
||||
cards.append(card)
|
||||
self.save_card(card)
|
||||
|
||||
@ -812,6 +903,83 @@ class PokemonPocketScraper:
|
||||
logger.warning(f"Errors logged to: {error_log}")
|
||||
return error_log
|
||||
|
||||
def download_images_for_existing_cards(self, set_code: str | None = None) -> int:
    """Download images for cards that already have JSON files.

    This is useful for downloading images separately from scraping,
    or for retrying failed image downloads. For each downloaded image
    the card's JSON file is rewritten in place with an ``image_file``
    field holding the image path relative to the images directory.

    Args:
        set_code: Optional set code to limit downloads to a specific set.
            When None, every set in the module-level SETS mapping is processed.

    Returns:
        Number of images downloaded (skipped and failed downloads are not
        counted here; they are tracked in ``self.image_stats``).
    """
    sets_to_process = [set_code] if set_code else list(SETS.keys())
    total_downloaded = 0

    for sc in sets_to_process:
        # Card JSON lives under data_dir/{set_code}/*.json
        set_dir = self.data_dir / sc
        if not set_dir.exists():
            logger.warning(f"No card data found for set {sc}")
            continue

        card_files = sorted(set_dir.glob("*.json"))
        logger.info(f"Processing {len(card_files)} cards from set {sc}")

        for i, card_file in enumerate(card_files, 1):
            with open(card_file, encoding="utf-8") as f:
                card_data = json.load(f)

            image_url = card_data.get("image_url")
            existing_file = card_data.get("image_file")

            # Nothing to fetch for cards scraped without an image URL.
            if not image_url:
                continue

            # Create a minimal Card object for download
            card = Card(
                id=card_data["id"],
                name=card_data["name"],
                set_code=card_data["set_code"],
                set_name=card_data["set_name"],
                card_number=card_data["card_number"],
                rarity=card_data["rarity"],
                card_type=card_data["card_type"],
                image_url=image_url,
            )

            # Check if image already exists
            if existing_file:
                image_path = self.images_dir / existing_file
                if image_path.exists():
                    self.image_stats["skipped"] += 1
                    continue

            logger.info(f"[{i}/{len(card_files)}] Downloading: {card.name}")
            image_file = self.download_image(card)

            if image_file:
                # Update the JSON file with the image path
                card_data["image_file"] = image_file
                with open(card_file, "w", encoding="utf-8") as f:
                    json.dump(card_data, f, indent=2, ensure_ascii=False)
                total_downloaded += 1

            # Throttle even after skips/failures to be polite to the asset server.
            time.sleep(IMAGE_REQUEST_DELAY)

    return total_downloaded
|
||||
|
||||
def log_image_stats(self) -> None:
    """Log a one-line summary of image download counts; silent when no
    image activity occurred this run."""
    counts = self.image_stats
    activity = counts["downloaded"] + counts["skipped"] + counts["failed"]
    if not activity:
        return
    logger.info(
        f"Images: {counts['downloaded']} downloaded, "
        f"{counts['skipped']} skipped, {counts['failed']} failed"
    )
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# CLI
|
||||
@ -831,6 +999,13 @@ Examples:
|
||||
# Scrape with limit (for testing)
|
||||
uv run python scripts/scrape_pokemon_pocket.py --set a1 --limit 5
|
||||
|
||||
# Scrape set with images
|
||||
uv run python scripts/scrape_pokemon_pocket.py --set a1 --images
|
||||
|
||||
# Download images for existing card data
|
||||
uv run python scripts/scrape_pokemon_pocket.py --download-images
|
||||
uv run python scripts/scrape_pokemon_pocket.py --download-images --set a1
|
||||
|
||||
# Scrape single card by ID
|
||||
uv run python scripts/scrape_pokemon_pocket.py --card a1-132-gardevoir
|
||||
|
||||
@ -839,42 +1014,67 @@ Examples:
|
||||
""",
|
||||
)
|
||||
|
||||
group = parser.add_mutually_exclusive_group(required=True)
|
||||
group.add_argument(
|
||||
# Main action group - what operation to perform
|
||||
action_group = parser.add_mutually_exclusive_group(required=True)
|
||||
action_group.add_argument(
|
||||
"--set",
|
||||
choices=list(SETS.keys()),
|
||||
help="Scrape all cards from a set",
|
||||
)
|
||||
group.add_argument(
|
||||
action_group.add_argument(
|
||||
"--card",
|
||||
type=str,
|
||||
help="Scrape a single card by ID (e.g., a1-132-gardevoir)",
|
||||
)
|
||||
group.add_argument(
|
||||
action_group.add_argument(
|
||||
"--reindex",
|
||||
action="store_true",
|
||||
help="Regenerate index from existing card files",
|
||||
)
|
||||
action_group.add_argument(
|
||||
"--download-images",
|
||||
choices=list(SETS.keys()) + ["all"],
|
||||
nargs="?",
|
||||
const="all",
|
||||
help="Download images for existing card data (specify set or 'all')",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--limit",
|
||||
type=int,
|
||||
help="Maximum number of cards to scrape (for testing)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--images",
|
||||
action="store_true",
|
||||
help="Download card images while scraping",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
scraper = PokemonPocketScraper()
|
||||
scraper = PokemonPocketScraper(download_images=args.images)
|
||||
|
||||
if args.reindex:
|
||||
scraper.generate_index()
|
||||
return 0
|
||||
|
||||
if args.download_images:
|
||||
# Download images for existing cards
|
||||
set_code = None if args.download_images == "all" else args.download_images
|
||||
set_info = f"set {set_code}" if set_code else "all sets"
|
||||
logger.info(f"Downloading images for existing card data ({set_info})...")
|
||||
scraper.download_images = True # Enable downloads
|
||||
downloaded = scraper.download_images_for_existing_cards(set_code)
|
||||
scraper.log_image_stats()
|
||||
logger.info(f"Image download complete: {downloaded} new images")
|
||||
return 0
|
||||
|
||||
if args.card:
|
||||
card = scraper.scrape_card(args.card)
|
||||
if card:
|
||||
scraper.save_card(card)
|
||||
scraper.generate_index()
|
||||
scraper.log_image_stats()
|
||||
logger.info(f"Successfully scraped: {card.name}")
|
||||
return 0
|
||||
else:
|
||||
@ -885,6 +1085,7 @@ Examples:
|
||||
cards = scraper.scrape_set(args.set, limit=args.limit)
|
||||
scraper.generate_index()
|
||||
scraper.save_errors()
|
||||
scraper.log_image_stats()
|
||||
|
||||
success_count = len(cards)
|
||||
error_count = len(scraper.errors)
|
||||
|
||||
Loading…
Reference in New Issue
Block a user