Add image download support to card scraper

- Add --images flag to download images during scraping
- Add --download-images command to fetch images for existing card data
- Images saved to data/images/{set}/ directory
- Card JSON updated with image_file field (relative path)
- Uses Referer header for asset server compatibility
- Supports skip-if-exists for incremental downloads
This commit is contained in:
Cal Corum 2026-01-26 22:55:13 -06:00
parent 2517d241ac
commit 5e1229aa7c
12 changed files with 218 additions and 12 deletions

View File

@ -1,5 +1,5 @@
{
"generated_at": "2026-01-27T04:45:04.962697+00:00",
"generated_at": "2026-01-27T04:54:59.854435+00:00",
"schema_version": "1.0",
"sets": {
"a1": {

View File

@ -34,5 +34,6 @@
"resistance": null,
"retreat_cost": 1,
"flavor_text": null,
"illustrator": "Narumi Sato"
"illustrator": "Narumi Sato",
"image_file": "a1/001-bulbasaur.webp"
}

View File

@ -35,5 +35,6 @@
"resistance": null,
"retreat_cost": 2,
"flavor_text": null,
"illustrator": "Kurata So"
"illustrator": "Kurata So",
"image_file": "a1/002-ivysaur.webp"
}

View File

@ -36,5 +36,6 @@
"resistance": null,
"retreat_cost": 3,
"flavor_text": null,
"illustrator": "Ryota Murayama"
"illustrator": "Ryota Murayama",
"image_file": "a1/003-venusaur.webp"
}

View File

@ -48,5 +48,6 @@
"resistance": null,
"retreat_cost": 3,
"flavor_text": null,
"illustrator": "PLANETA CG Works"
"illustrator": "PLANETA CG Works",
"image_file": "a1/004-venusaur-ex.webp"
}

View File

@ -7,6 +7,7 @@
"rarity": "Common",
"card_type": "pokemon",
"image_url": "https://assets.pokemon-zone.com/game-assets/CardPreviews/cPK_10_000050_00_CATERPIE_C.webp",
"image_file": "a1/005-caterpie.webp",
"source_url": "https://www.pokemon-zone.com/cards/a1/5/caterpie/",
"hp": 50,
"pokemon_type": "grass",

Binary file not shown.

After

Width:  |  Height:  |  Size: 43 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 38 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 46 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 54 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 38 KiB

View File

@ -43,7 +43,9 @@ from bs4 import BeautifulSoup, Tag
BASE_URL = "https://www.pokemon-zone.com"
DATA_DIR = Path(__file__).parent.parent / "data" / "cards"
IMAGES_DIR = Path(__file__).parent.parent / "data" / "images"
REQUEST_DELAY = 1.5 # seconds between requests
IMAGE_REQUEST_DELAY = 0.5 # faster for images (different server)
MAX_RETRIES = 3
RETRY_DELAY = 5 # seconds
@ -150,6 +152,7 @@ class Card:
rarity: str
card_type: str # "pokemon", "trainer", "energy"
image_url: str | None = None # URL to card image for offline caching
image_file: str | None = None # Local path to downloaded image (relative to images dir)
hp: int | None = None
pokemon_type: str | None = None
stage: str | None = None # "basic", "stage_1", "stage_2"
@ -177,6 +180,7 @@ class Card:
"rarity": self.rarity,
"card_type": self.card_type,
"image_url": self.image_url,
"image_file": self.image_file,
"source_url": self.source_url,
}
@ -232,13 +236,22 @@ class Card:
class PokemonPocketScraper:
"""Scraper for Pokemon TCG Pocket card data from pokemon-zone.com."""
def __init__(self, data_dir: Path = DATA_DIR):
def __init__(
self,
data_dir: Path = DATA_DIR,
images_dir: Path = IMAGES_DIR,
download_images: bool = False,
):
"""Initialize the scraper.
Args:
data_dir: Directory to save card data files.
images_dir: Directory to save downloaded card images.
download_images: Whether to download card images.
"""
self.data_dir = data_dir
self.images_dir = images_dir
self.download_images = download_images
self.session = requests.Session()
self.session.headers.update(
{
@ -247,6 +260,7 @@ class PokemonPocketScraper:
}
)
self.errors: list[dict[str, Any]] = []
self.image_stats = {"downloaded": 0, "skipped": 0, "failed": 0}
def fetch_page(self, url: str) -> BeautifulSoup | None:
"""Fetch a page with retry logic.
@ -270,6 +284,72 @@ class PokemonPocketScraper:
self.errors.append({"url": url, "error": "Max retries exceeded"})
return None
def download_image(self, card: Card) -> str | None:
    """Download a card image and save it under images/{set_code}/.

    Args:
        card: Card object; ``card.image_url`` must be set.

    Returns:
        Path of the saved image relative to the images dir
        (e.g. ``"a1/001-bulbasaur.webp"``), or None on failure.
    """
    if not card.image_url:
        return None

    # Create directory structure: images/{set_code}/
    set_dir = self.images_dir / card.set_code
    set_dir.mkdir(parents=True, exist_ok=True)

    # Determine file extension from URL (strip query params first)
    url_path = card.image_url.split("?")[0]
    ext = Path(url_path).suffix or ".webp"

    # Filename: {number:03d}-{name}{ext}. The name is the third segment of
    # the card ID (IDs look like "a1-001-bulbasaur" elsewhere in this commit).
    url_name = card.id.split("-", 2)[2]
    filename = f"{card.card_number:03d}-{url_name}{ext}"
    filepath = set_dir / filename
    # FIX: was a garbled placeholder ("{set_code}/(unknown)"); the card JSON
    # in this commit stores image_file as "{set_code}/{filename}".
    relative_path = f"{card.set_code}/{filename}"

    # Skip if already downloaded (supports incremental re-runs)
    if filepath.exists():
        logger.debug(f"Image exists, skipping: {relative_path}")
        self.image_stats["skipped"] += 1
        return relative_path

    # The asset server requires a Referer header from the main site
    image_headers = {
        "Accept": "image/webp,image/apng,image/*,*/*;q=0.8",
        "Referer": "https://www.pokemon-zone.com/",
    }
    for attempt in range(MAX_RETRIES):
        try:
            response = self.session.get(card.image_url, timeout=30, headers=image_headers)
            response.raise_for_status()

            # Guard against HTML error pages served with a 200 status
            content_type = response.headers.get("content-type", "")
            if not content_type.startswith("image/"):
                logger.warning(f"Not an image: {content_type} for {card.image_url}")
                self.image_stats["failed"] += 1
                return None

            # Save the image bytes
            filepath.write_bytes(response.content)
            logger.debug(f"Downloaded: {relative_path} ({len(response.content)} bytes)")
            self.image_stats["downloaded"] += 1
            return relative_path
        except requests.RequestException as e:
            logger.warning(f"Image download attempt {attempt + 1}/{MAX_RETRIES} failed: {e}")
            if attempt < MAX_RETRIES - 1:
                time.sleep(RETRY_DELAY)

    logger.error(f"Failed to download image for {card.id}")
    self.image_stats["failed"] += 1
    return None
def get_card_urls_for_set(self, set_code: str) -> list[tuple[int, str, str]]:
"""Get all card URLs for a set.
@ -676,7 +756,13 @@ class PokemonPocketScraper:
if not soup:
return None
return self.parse_card_page(soup, url, set_code)
card = self.parse_card_page(soup, url, set_code)
# Download image if enabled
if card and self.download_images and card.image_url:
card.image_file = self.download_image(card)
return card
def scrape_set(self, set_code: str, limit: int | None = None) -> list[Card]:
"""Scrape all cards from a set.
@ -706,6 +792,11 @@ class PokemonPocketScraper:
if soup:
card = self.parse_card_page(soup, url, set_code)
if card:
# Download image if enabled
if self.download_images and card.image_url:
card.image_file = self.download_image(card)
time.sleep(IMAGE_REQUEST_DELAY)
cards.append(card)
self.save_card(card)
@ -812,6 +903,83 @@ class PokemonPocketScraper:
logger.warning(f"Errors logged to: {error_log}")
return error_log
def download_images_for_existing_cards(self, set_code: str | None = None) -> int:
    """Download images for cards that already have JSON files.

    This is useful for downloading images separately from scraping,
    or for retrying failed image downloads. Card JSON files that gain an
    image are rewritten in place with the new ``image_file`` field.

    Args:
        set_code: Optional set code to limit downloads to a specific set;
            None processes every set in SETS.

    Returns:
        Number of images downloaded.
    """
    # None means "all known sets"
    sets_to_process = [set_code] if set_code else list(SETS.keys())
    total_downloaded = 0
    for sc in sets_to_process:
        set_dir = self.data_dir / sc
        if not set_dir.exists():
            logger.warning(f"No card data found for set {sc}")
            continue
        # Sorted so progress logging follows card-number order
        card_files = sorted(set_dir.glob("*.json"))
        logger.info(f"Processing {len(card_files)} cards from set {sc}")
        for i, card_file in enumerate(card_files, 1):
            with open(card_file, encoding="utf-8") as f:
                card_data = json.load(f)
            image_url = card_data.get("image_url")
            existing_file = card_data.get("image_file")
            # Cards without a source URL cannot be downloaded
            if not image_url:
                continue
            # Create a minimal Card object for download (only the fields
            # download_image needs; combat/flavor fields are left unset)
            card = Card(
                id=card_data["id"],
                name=card_data["name"],
                set_code=card_data["set_code"],
                set_name=card_data["set_name"],
                card_number=card_data["card_number"],
                rarity=card_data["rarity"],
                card_type=card_data["card_type"],
                image_url=image_url,
            )
            # Skip when the JSON already points at an image that exists on disk
            if existing_file:
                image_path = self.images_dir / existing_file
                if image_path.exists():
                    self.image_stats["skipped"] += 1
                    continue
            logger.info(f"[{i}/{len(card_files)}] Downloading: {card.name}")
            image_file = self.download_image(card)
            if image_file:
                # Update the JSON file with the image path
                card_data["image_file"] = image_file
                with open(card_file, "w", encoding="utf-8") as f:
                    json.dump(card_data, f, indent=2, ensure_ascii=False)
                total_downloaded += 1
            # Throttle image requests; NOTE(review): original indentation was
            # lost in the diff — this may have been inside the success branch
            # only. Confirm against the repository source.
            time.sleep(IMAGE_REQUEST_DELAY)
    return total_downloaded
def log_image_stats(self) -> None:
"""Log image download statistics."""
stats = self.image_stats
total = stats["downloaded"] + stats["skipped"] + stats["failed"]
if total > 0:
logger.info(
f"Images: {stats['downloaded']} downloaded, "
f"{stats['skipped']} skipped, {stats['failed']} failed"
)
# =============================================================================
# CLI
@ -831,6 +999,13 @@ Examples:
# Scrape with limit (for testing)
uv run python scripts/scrape_pokemon_pocket.py --set a1 --limit 5
# Scrape set with images
uv run python scripts/scrape_pokemon_pocket.py --set a1 --images
# Download images for existing card data
uv run python scripts/scrape_pokemon_pocket.py --download-images
uv run python scripts/scrape_pokemon_pocket.py --download-images --set a1
# Scrape single card by ID
uv run python scripts/scrape_pokemon_pocket.py --card a1-132-gardevoir
@ -839,42 +1014,67 @@ Examples:
""",
)
group = parser.add_mutually_exclusive_group(required=True)
group.add_argument(
# Main action group - what operation to perform
action_group = parser.add_mutually_exclusive_group(required=True)
action_group.add_argument(
"--set",
choices=list(SETS.keys()),
help="Scrape all cards from a set",
)
group.add_argument(
action_group.add_argument(
"--card",
type=str,
help="Scrape a single card by ID (e.g., a1-132-gardevoir)",
)
group.add_argument(
action_group.add_argument(
"--reindex",
action="store_true",
help="Regenerate index from existing card files",
)
action_group.add_argument(
"--download-images",
choices=list(SETS.keys()) + ["all"],
nargs="?",
const="all",
help="Download images for existing card data (specify set or 'all')",
)
parser.add_argument(
"--limit",
type=int,
help="Maximum number of cards to scrape (for testing)",
)
parser.add_argument(
"--images",
action="store_true",
help="Download card images while scraping",
)
args = parser.parse_args()
scraper = PokemonPocketScraper()
scraper = PokemonPocketScraper(download_images=args.images)
if args.reindex:
scraper.generate_index()
return 0
if args.download_images:
# Download images for existing cards
set_code = None if args.download_images == "all" else args.download_images
set_info = f"set {set_code}" if set_code else "all sets"
logger.info(f"Downloading images for existing card data ({set_info})...")
scraper.download_images = True # Enable downloads
downloaded = scraper.download_images_for_existing_cards(set_code)
scraper.log_image_stats()
logger.info(f"Image download complete: {downloaded} new images")
return 0
if args.card:
card = scraper.scrape_card(args.card)
if card:
scraper.save_card(card)
scraper.generate_index()
scraper.log_image_stats()
logger.info(f"Successfully scraped: {card.name}")
return 0
else:
@ -885,6 +1085,7 @@ Examples:
cards = scraper.scrape_set(args.set, limit=args.limit)
scraper.generate_index()
scraper.save_errors()
scraper.log_image_stats()
success_count = len(cards)
error_count = len(scraper.errors)