diff --git a/backend/data/cards/_index.json b/backend/data/cards/_index.json index 37d1061..b5a2ede 100644 --- a/backend/data/cards/_index.json +++ b/backend/data/cards/_index.json @@ -1,5 +1,5 @@ { - "generated_at": "2026-01-27T04:45:04.962697+00:00", + "generated_at": "2026-01-27T04:54:59.854435+00:00", "schema_version": "1.0", "sets": { "a1": { diff --git a/backend/data/cards/a1/001-bulbasaur.json b/backend/data/cards/a1/001-bulbasaur.json index 99b302c..ec2b393 100644 --- a/backend/data/cards/a1/001-bulbasaur.json +++ b/backend/data/cards/a1/001-bulbasaur.json @@ -34,5 +34,6 @@ "resistance": null, "retreat_cost": 1, "flavor_text": null, - "illustrator": "Narumi Sato" + "illustrator": "Narumi Sato", + "image_file": "a1/001-bulbasaur.webp" } \ No newline at end of file diff --git a/backend/data/cards/a1/002-ivysaur.json b/backend/data/cards/a1/002-ivysaur.json index 1f0b774..ec8c680 100644 --- a/backend/data/cards/a1/002-ivysaur.json +++ b/backend/data/cards/a1/002-ivysaur.json @@ -35,5 +35,6 @@ "resistance": null, "retreat_cost": 2, "flavor_text": null, - "illustrator": "Kurata So" + "illustrator": "Kurata So", + "image_file": "a1/002-ivysaur.webp" } \ No newline at end of file diff --git a/backend/data/cards/a1/003-venusaur.json b/backend/data/cards/a1/003-venusaur.json index 5fae561..b360936 100644 --- a/backend/data/cards/a1/003-venusaur.json +++ b/backend/data/cards/a1/003-venusaur.json @@ -36,5 +36,6 @@ "resistance": null, "retreat_cost": 3, "flavor_text": null, - "illustrator": "Ryota Murayama" + "illustrator": "Ryota Murayama", + "image_file": "a1/003-venusaur.webp" } \ No newline at end of file diff --git a/backend/data/cards/a1/004-venusaur-ex.json b/backend/data/cards/a1/004-venusaur-ex.json index 0bf7359..500ee2d 100644 --- a/backend/data/cards/a1/004-venusaur-ex.json +++ b/backend/data/cards/a1/004-venusaur-ex.json @@ -48,5 +48,6 @@ "resistance": null, "retreat_cost": 3, "flavor_text": null, - "illustrator": "PLANETA CG Works" + "illustrator": 
"PLANETA CG Works", + "image_file": "a1/004-venusaur-ex.webp" } \ No newline at end of file diff --git a/backend/data/cards/a1/005-caterpie.json b/backend/data/cards/a1/005-caterpie.json index 8e1466d..5485a54 100644 --- a/backend/data/cards/a1/005-caterpie.json +++ b/backend/data/cards/a1/005-caterpie.json @@ -7,6 +7,7 @@ "rarity": "Common", "card_type": "pokemon", "image_url": "https://assets.pokemon-zone.com/game-assets/CardPreviews/cPK_10_000050_00_CATERPIE_C.webp", + "image_file": "a1/005-caterpie.webp", "source_url": "https://www.pokemon-zone.com/cards/a1/5/caterpie/", "hp": 50, "pokemon_type": "grass", diff --git a/backend/data/images/a1/001-bulbasaur.webp b/backend/data/images/a1/001-bulbasaur.webp new file mode 100644 index 0000000..edf1b02 Binary files /dev/null and b/backend/data/images/a1/001-bulbasaur.webp differ diff --git a/backend/data/images/a1/002-ivysaur.webp b/backend/data/images/a1/002-ivysaur.webp new file mode 100644 index 0000000..1cabff1 Binary files /dev/null and b/backend/data/images/a1/002-ivysaur.webp differ diff --git a/backend/data/images/a1/003-venusaur.webp b/backend/data/images/a1/003-venusaur.webp new file mode 100644 index 0000000..b15441c Binary files /dev/null and b/backend/data/images/a1/003-venusaur.webp differ diff --git a/backend/data/images/a1/004-venusaur-ex.webp b/backend/data/images/a1/004-venusaur-ex.webp new file mode 100644 index 0000000..263a856 Binary files /dev/null and b/backend/data/images/a1/004-venusaur-ex.webp differ diff --git a/backend/data/images/a1/005-caterpie.webp b/backend/data/images/a1/005-caterpie.webp new file mode 100644 index 0000000..16deb75 Binary files /dev/null and b/backend/data/images/a1/005-caterpie.webp differ diff --git a/backend/scripts/scrape_pokemon_pocket.py b/backend/scripts/scrape_pokemon_pocket.py index d244c5c..cbb735c 100644 --- a/backend/scripts/scrape_pokemon_pocket.py +++ b/backend/scripts/scrape_pokemon_pocket.py @@ -43,7 +43,9 @@ from bs4 import BeautifulSoup, Tag BASE_URL 
= "https://www.pokemon-zone.com" DATA_DIR = Path(__file__).parent.parent / "data" / "cards" +IMAGES_DIR = Path(__file__).parent.parent / "data" / "images" REQUEST_DELAY = 1.5 # seconds between requests +IMAGE_REQUEST_DELAY = 0.5 # faster for images (different server) MAX_RETRIES = 3 RETRY_DELAY = 5 # seconds @@ -150,6 +152,7 @@ class Card: rarity: str card_type: str # "pokemon", "trainer", "energy" image_url: str | None = None # URL to card image for offline caching + image_file: str | None = None # Local path to downloaded image (relative to images dir) hp: int | None = None pokemon_type: str | None = None stage: str | None = None # "basic", "stage_1", "stage_2" @@ -177,6 +180,7 @@ class Card: "rarity": self.rarity, "card_type": self.card_type, "image_url": self.image_url, + "image_file": self.image_file, "source_url": self.source_url, } @@ -232,13 +236,22 @@ class Card: class PokemonPocketScraper: """Scraper for Pokemon TCG Pocket card data from pokemon-zone.com.""" - def __init__(self, data_dir: Path = DATA_DIR): + def __init__( + self, + data_dir: Path = DATA_DIR, + images_dir: Path = IMAGES_DIR, + download_images: bool = False, + ): """Initialize the scraper. Args: data_dir: Directory to save card data files. + images_dir: Directory to save downloaded card images. + download_images: Whether to download card images. """ self.data_dir = data_dir + self.images_dir = images_dir + self.download_images = download_images self.session = requests.Session() self.session.headers.update( { @@ -247,6 +260,7 @@ class PokemonPocketScraper: } ) self.errors: list[dict[str, Any]] = [] + self.image_stats = {"downloaded": 0, "skipped": 0, "failed": 0} def fetch_page(self, url: str) -> BeautifulSoup | None: """Fetch a page with retry logic. @@ -270,6 +284,72 @@ class PokemonPocketScraper: self.errors.append({"url": url, "error": "Max retries exceeded"}) return None + def download_image(self, card: Card) -> str | None: + """Download a card image and save it locally. 
+ + Args: + card: Card object with image_url set. + + Returns: + Relative path to the saved image, or None if download failed. + """ + if not card.image_url: + return None + + # Create directory structure: images/{set_code}/ + set_dir = self.images_dir / card.set_code + set_dir.mkdir(parents=True, exist_ok=True) + + # Determine file extension from URL + url_path = card.image_url.split("?")[0] # Remove query params + ext = Path(url_path).suffix or ".webp" + + # Generate filename: {number:03d}-{name}{ext} + url_name = card.id.split("-", 2)[2] # Get name part from ID + filename = f"{card.card_number:03d}-{url_name}{ext}" + filepath = set_dir / filename + relative_path = f"{card.set_code}/{filename}" + + # Skip if already downloaded + if filepath.exists(): + logger.debug(f"Image exists, skipping: {relative_path}") + self.image_stats["skipped"] += 1 + return relative_path + + # Download the image with appropriate headers + image_headers = { + "Accept": "image/webp,image/apng,image/*,*/*;q=0.8", + "Referer": "https://www.pokemon-zone.com/", + } + for attempt in range(MAX_RETRIES): + try: + response = self.session.get(card.image_url, timeout=30, headers=image_headers) + response.raise_for_status() + + # Verify it's an image + content_type = response.headers.get("content-type", "") + if not content_type.startswith("image/"): + logger.warning(f"Not an image: {content_type} for {card.image_url}") + self.image_stats["failed"] += 1 + return None + + # Save the image + with open(filepath, "wb") as f: + f.write(response.content) + + logger.debug(f"Downloaded: {relative_path} ({len(response.content)} bytes)") + self.image_stats["downloaded"] += 1 + return relative_path + + except requests.RequestException as e: + logger.warning(f"Image download attempt {attempt + 1}/{MAX_RETRIES} failed: {e}") + if attempt < MAX_RETRIES - 1: + time.sleep(RETRY_DELAY) + + logger.error(f"Failed to download image for {card.id}") + self.image_stats["failed"] += 1 + return None + + def 
get_card_urls_for_set(self, set_code: str) -> list[tuple[int, str, str]]: """Get all card URLs for a set. @@ -676,7 +756,13 @@ class PokemonPocketScraper: if not soup: return None - return self.parse_card_page(soup, url, set_code) + card = self.parse_card_page(soup, url, set_code) + + # Download image if enabled + if card and self.download_images and card.image_url: + card.image_file = self.download_image(card) + + return card def scrape_set(self, set_code: str, limit: int | None = None) -> list[Card]: """Scrape all cards from a set. @@ -706,6 +792,11 @@ class PokemonPocketScraper: if soup: card = self.parse_card_page(soup, url, set_code) if card: + # Download image if enabled + if self.download_images and card.image_url: + card.image_file = self.download_image(card) + time.sleep(IMAGE_REQUEST_DELAY) + cards.append(card) self.save_card(card) @@ -812,6 +903,83 @@ class PokemonPocketScraper: logger.warning(f"Errors logged to: {error_log}") return error_log + def download_images_for_existing_cards(self, set_code: str | None = None) -> int: + """Download images for cards that already have JSON files. + + This is useful for downloading images separately from scraping, + or for retrying failed image downloads. + + Args: + set_code: Optional set code to limit downloads to a specific set. + + Returns: + Number of images downloaded. 
+ """ + sets_to_process = [set_code] if set_code else list(SETS.keys()) + total_downloaded = 0 + + for sc in sets_to_process: + set_dir = self.data_dir / sc + if not set_dir.exists(): + logger.warning(f"No card data found for set {sc}") + continue + + card_files = sorted(set_dir.glob("*.json")) + logger.info(f"Processing {len(card_files)} cards from set {sc}") + + for i, card_file in enumerate(card_files, 1): + with open(card_file, encoding="utf-8") as f: + card_data = json.load(f) + + image_url = card_data.get("image_url") + existing_file = card_data.get("image_file") + + if not image_url: + continue + + # Create a minimal Card object for download + card = Card( + id=card_data["id"], + name=card_data["name"], + set_code=card_data["set_code"], + set_name=card_data["set_name"], + card_number=card_data["card_number"], + rarity=card_data["rarity"], + card_type=card_data["card_type"], + image_url=image_url, + ) + + # Check if image already exists + if existing_file: + image_path = self.images_dir / existing_file + if image_path.exists(): + self.image_stats["skipped"] += 1 + continue + + logger.info(f"[{i}/{len(card_files)}] Downloading: {card.name}") + image_file = self.download_image(card) + + if image_file: + # Update the JSON file with the image path + card_data["image_file"] = image_file + with open(card_file, "w", encoding="utf-8") as f: + json.dump(card_data, f, indent=2, ensure_ascii=False) + total_downloaded += 1 + + time.sleep(IMAGE_REQUEST_DELAY) + + return total_downloaded + + def log_image_stats(self) -> None: + """Log image download statistics.""" + stats = self.image_stats + total = stats["downloaded"] + stats["skipped"] + stats["failed"] + if total > 0: + logger.info( + f"Images: {stats['downloaded']} downloaded, " + f"{stats['skipped']} skipped, {stats['failed']} failed" + ) + # ============================================================================= # CLI @@ -831,6 +999,13 @@ Examples: # Scrape with limit (for testing) uv run python 
scripts/scrape_pokemon_pocket.py --set a1 --limit 5 + # Scrape set with images + uv run python scripts/scrape_pokemon_pocket.py --set a1 --images + + # Download images for existing card data + uv run python scripts/scrape_pokemon_pocket.py --download-images + uv run python scripts/scrape_pokemon_pocket.py --download-images a1 + # Scrape single card by ID uv run python scripts/scrape_pokemon_pocket.py --card a1-132-gardevoir @@ -839,42 +1014,67 @@ """, ) - group = parser.add_mutually_exclusive_group(required=True) - group.add_argument( + # Main action group - what operation to perform + action_group = parser.add_mutually_exclusive_group(required=True) + action_group.add_argument( "--set", choices=list(SETS.keys()), help="Scrape all cards from a set", ) - group.add_argument( + action_group.add_argument( "--card", type=str, help="Scrape a single card by ID (e.g., a1-132-gardevoir)", ) - group.add_argument( + action_group.add_argument( "--reindex", action="store_true", help="Regenerate index from existing card files", ) + action_group.add_argument( + "--download-images", + choices=list(SETS.keys()) + ["all"], + nargs="?", + const="all", + help="Download images for existing card data (specify set or 'all')", + ) parser.add_argument( "--limit", type=int, help="Maximum number of cards to scrape (for testing)", ) + parser.add_argument( + "--images", + action="store_true", + help="Download card images while scraping", + ) args = parser.parse_args() - scraper = PokemonPocketScraper() + scraper = PokemonPocketScraper(download_images=args.images) if args.reindex: scraper.generate_index() return 0 + if args.download_images: + # Download images for existing cards + set_code = None if args.download_images == "all" else args.download_images + set_info = f"set {set_code}" if set_code else "all sets" + logger.info(f"Downloading images for existing card data ({set_info})...") + scraper.download_images = True # Enable downloads + downloaded = 
scraper.download_images_for_existing_cards(set_code) + scraper.log_image_stats() + logger.info(f"Image download complete: {downloaded} new images") + return 0 + if args.card: card = scraper.scrape_card(args.card) if card: scraper.save_card(card) scraper.generate_index() + scraper.log_image_stats() logger.info(f"Successfully scraped: {card.name}") return 0 else: @@ -885,6 +1085,7 @@ Examples: cards = scraper.scrape_set(args.set, limit=args.limit) scraper.generate_index() scraper.save_errors() + scraper.log_image_stats() success_count = len(cards) error_count = len(scraper.errors)