Add image download support to card scraper
- Add --images flag to download images during scraping
- Add --download-images command to fetch images for existing card data
- Images saved to data/images/{set}/ directory
- Card JSON updated with image_file field (relative path)
- Uses Referer header for asset server compatibility
- Supports skip-if-exists for incremental downloads
This commit is contained in:
parent
2517d241ac
commit
5e1229aa7c
@ -1,5 +1,5 @@
|
||||
{
|
||||
"generated_at": "2026-01-27T04:45:04.962697+00:00",
|
||||
"generated_at": "2026-01-27T04:54:59.854435+00:00",
|
||||
"schema_version": "1.0",
|
||||
"sets": {
|
||||
"a1": {
|
||||
|
||||
@ -34,5 +34,6 @@
|
||||
"resistance": null,
|
||||
"retreat_cost": 1,
|
||||
"flavor_text": null,
|
||||
"illustrator": "Narumi Sato"
|
||||
"illustrator": "Narumi Sato",
|
||||
"image_file": "a1/001-bulbasaur.webp"
|
||||
}
|
||||
@ -35,5 +35,6 @@
|
||||
"resistance": null,
|
||||
"retreat_cost": 2,
|
||||
"flavor_text": null,
|
||||
"illustrator": "Kurata So"
|
||||
"illustrator": "Kurata So",
|
||||
"image_file": "a1/002-ivysaur.webp"
|
||||
}
|
||||
@ -36,5 +36,6 @@
|
||||
"resistance": null,
|
||||
"retreat_cost": 3,
|
||||
"flavor_text": null,
|
||||
"illustrator": "Ryota Murayama"
|
||||
"illustrator": "Ryota Murayama",
|
||||
"image_file": "a1/003-venusaur.webp"
|
||||
}
|
||||
@ -48,5 +48,6 @@
|
||||
"resistance": null,
|
||||
"retreat_cost": 3,
|
||||
"flavor_text": null,
|
||||
"illustrator": "PLANETA CG Works"
|
||||
"illustrator": "PLANETA CG Works",
|
||||
"image_file": "a1/004-venusaur-ex.webp"
|
||||
}
|
||||
@ -7,6 +7,7 @@
|
||||
"rarity": "Common",
|
||||
"card_type": "pokemon",
|
||||
"image_url": "https://assets.pokemon-zone.com/game-assets/CardPreviews/cPK_10_000050_00_CATERPIE_C.webp",
|
||||
"image_file": "a1/005-caterpie.webp",
|
||||
"source_url": "https://www.pokemon-zone.com/cards/a1/5/caterpie/",
|
||||
"hp": 50,
|
||||
"pokemon_type": "grass",
|
||||
|
||||
BIN
backend/data/images/a1/001-bulbasaur.webp
Normal file
BIN
backend/data/images/a1/001-bulbasaur.webp
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 43 KiB |
BIN
backend/data/images/a1/002-ivysaur.webp
Normal file
BIN
backend/data/images/a1/002-ivysaur.webp
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 38 KiB |
BIN
backend/data/images/a1/003-venusaur.webp
Normal file
BIN
backend/data/images/a1/003-venusaur.webp
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 46 KiB |
BIN
backend/data/images/a1/004-venusaur-ex.webp
Normal file
BIN
backend/data/images/a1/004-venusaur-ex.webp
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 54 KiB |
BIN
backend/data/images/a1/005-caterpie.webp
Normal file
BIN
backend/data/images/a1/005-caterpie.webp
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 38 KiB |
@ -43,7 +43,9 @@ from bs4 import BeautifulSoup, Tag
|
||||
|
||||
BASE_URL = "https://www.pokemon-zone.com"
|
||||
DATA_DIR = Path(__file__).parent.parent / "data" / "cards"
|
||||
IMAGES_DIR = Path(__file__).parent.parent / "data" / "images"
|
||||
REQUEST_DELAY = 1.5 # seconds between requests
|
||||
IMAGE_REQUEST_DELAY = 0.5 # faster for images (different server)
|
||||
MAX_RETRIES = 3
|
||||
RETRY_DELAY = 5 # seconds
|
||||
|
||||
@ -150,6 +152,7 @@ class Card:
|
||||
rarity: str
|
||||
card_type: str # "pokemon", "trainer", "energy"
|
||||
image_url: str | None = None # URL to card image for offline caching
|
||||
image_file: str | None = None # Local path to downloaded image (relative to images dir)
|
||||
hp: int | None = None
|
||||
pokemon_type: str | None = None
|
||||
stage: str | None = None # "basic", "stage_1", "stage_2"
|
||||
@ -177,6 +180,7 @@ class Card:
|
||||
"rarity": self.rarity,
|
||||
"card_type": self.card_type,
|
||||
"image_url": self.image_url,
|
||||
"image_file": self.image_file,
|
||||
"source_url": self.source_url,
|
||||
}
|
||||
|
||||
@ -232,13 +236,22 @@ class Card:
|
||||
class PokemonPocketScraper:
|
||||
"""Scraper for Pokemon TCG Pocket card data from pokemon-zone.com."""
|
||||
|
||||
def __init__(self, data_dir: Path = DATA_DIR):
|
||||
def __init__(
|
||||
self,
|
||||
data_dir: Path = DATA_DIR,
|
||||
images_dir: Path = IMAGES_DIR,
|
||||
download_images: bool = False,
|
||||
):
|
||||
"""Initialize the scraper.
|
||||
|
||||
Args:
|
||||
data_dir: Directory to save card data files.
|
||||
images_dir: Directory to save downloaded card images.
|
||||
download_images: Whether to download card images.
|
||||
"""
|
||||
self.data_dir = data_dir
|
||||
self.images_dir = images_dir
|
||||
self.download_images = download_images
|
||||
self.session = requests.Session()
|
||||
self.session.headers.update(
|
||||
{
|
||||
@ -247,6 +260,7 @@ class PokemonPocketScraper:
|
||||
}
|
||||
)
|
||||
self.errors: list[dict[str, Any]] = []
|
||||
self.image_stats = {"downloaded": 0, "skipped": 0, "failed": 0}
|
||||
|
||||
def fetch_page(self, url: str) -> BeautifulSoup | None:
|
||||
"""Fetch a page with retry logic.
|
||||
@ -270,6 +284,72 @@ class PokemonPocketScraper:
|
||||
self.errors.append({"url": url, "error": "Max retries exceeded"})
|
||||
return None
|
||||
|
||||
def download_image(self, card: Card) -> str | None:
    """Download a card image and save it locally.

    The image is written to ``{images_dir}/{set_code}/{number:03d}-{name}{ext}``
    and the method is idempotent: an already-present file is skipped and its
    relative path returned, so incremental re-runs are cheap.

    Args:
        card: Card object with image_url set.

    Returns:
        Relative path to the saved image (e.g. ``"a1/001-bulbasaur.webp"``),
        or None if the card has no image_url or the download failed.
    """
    if not card.image_url:
        return None

    # Create directory structure: images/{set_code}/
    set_dir = self.images_dir / card.set_code
    set_dir.mkdir(parents=True, exist_ok=True)

    # Determine file extension from URL (strip query params first)
    url_path = card.image_url.split("?")[0]
    ext = Path(url_path).suffix or ".webp"

    # Generate filename: {number:03d}-{name}{ext}
    # Card IDs look like "a1-1-bulbasaur": the name is the third segment.
    url_name = card.id.split("-", 2)[2]
    filename = f"{card.card_number:03d}-{url_name}{ext}"
    filepath = set_dir / filename
    # BUG FIX: this previously interpolated a literal placeholder instead of
    # the computed filename, so the path stored in card JSON (and returned on
    # the skip path) never matched the file actually written to filepath.
    relative_path = f"{card.set_code}/{filename}"

    # Skip if already downloaded (incremental support)
    if filepath.exists():
        logger.debug(f"Image exists, skipping: {relative_path}")
        self.image_stats["skipped"] += 1
        return relative_path

    # Download the image with appropriate headers; the asset server
    # requires a Referer from the main site.
    image_headers = {
        "Accept": "image/webp,image/apng,image/*,*/*;q=0.8",
        "Referer": "https://www.pokemon-zone.com/",
    }
    for attempt in range(MAX_RETRIES):
        try:
            response = self.session.get(card.image_url, timeout=30, headers=image_headers)
            response.raise_for_status()

            # Verify it's an image (guards against HTML error pages served with 200)
            content_type = response.headers.get("content-type", "")
            if not content_type.startswith("image/"):
                logger.warning(f"Not an image: {content_type} for {card.image_url}")
                self.image_stats["failed"] += 1
                return None

            # Save the image
            with open(filepath, "wb") as f:
                f.write(response.content)

            logger.debug(f"Downloaded: {relative_path} ({len(response.content)} bytes)")
            self.image_stats["downloaded"] += 1
            return relative_path

        except requests.RequestException as e:
            logger.warning(f"Image download attempt {attempt + 1}/{MAX_RETRIES} failed: {e}")
            if attempt < MAX_RETRIES - 1:
                time.sleep(RETRY_DELAY)

    logger.error(f"Failed to download image for {card.id}")
    self.image_stats["failed"] += 1
    return None
|
||||
|
||||
def get_card_urls_for_set(self, set_code: str) -> list[tuple[int, str, str]]:
|
||||
"""Get all card URLs for a set.
|
||||
|
||||
@ -676,7 +756,13 @@ class PokemonPocketScraper:
|
||||
if not soup:
|
||||
return None
|
||||
|
||||
return self.parse_card_page(soup, url, set_code)
|
||||
card = self.parse_card_page(soup, url, set_code)
|
||||
|
||||
# Download image if enabled
|
||||
if card and self.download_images and card.image_url:
|
||||
card.image_file = self.download_image(card)
|
||||
|
||||
return card
|
||||
|
||||
def scrape_set(self, set_code: str, limit: int | None = None) -> list[Card]:
|
||||
"""Scrape all cards from a set.
|
||||
@ -706,6 +792,11 @@ class PokemonPocketScraper:
|
||||
if soup:
|
||||
card = self.parse_card_page(soup, url, set_code)
|
||||
if card:
|
||||
# Download image if enabled
|
||||
if self.download_images and card.image_url:
|
||||
card.image_file = self.download_image(card)
|
||||
time.sleep(IMAGE_REQUEST_DELAY)
|
||||
|
||||
cards.append(card)
|
||||
self.save_card(card)
|
||||
|
||||
@ -812,6 +903,83 @@ class PokemonPocketScraper:
|
||||
logger.warning(f"Errors logged to: {error_log}")
|
||||
return error_log
|
||||
|
||||
def download_images_for_existing_cards(self, set_code: str | None = None) -> int:
    """Download images for cards that already have JSON files.

    This is useful for downloading images separately from scraping,
    or for retrying failed image downloads. For each downloaded image
    the card's JSON file is rewritten in place with an ``image_file``
    field holding the image path relative to the images directory.

    Args:
        set_code: Optional set code to limit downloads to a specific set.
            When None, every set in the module-level SETS mapping is processed.

    Returns:
        Number of images downloaded (skipped and failed downloads are not
        counted here; they are tracked in ``self.image_stats``).
    """
    sets_to_process = [set_code] if set_code else list(SETS.keys())
    total_downloaded = 0

    for sc in sets_to_process:
        # Card JSON lives under data_dir/{set_code}/*.json
        set_dir = self.data_dir / sc
        if not set_dir.exists():
            logger.warning(f"No card data found for set {sc}")
            continue

        card_files = sorted(set_dir.glob("*.json"))
        logger.info(f"Processing {len(card_files)} cards from set {sc}")

        for i, card_file in enumerate(card_files, 1):
            with open(card_file, encoding="utf-8") as f:
                card_data = json.load(f)

            image_url = card_data.get("image_url")
            existing_file = card_data.get("image_file")

            # Nothing to fetch for cards scraped without an image URL.
            if not image_url:
                continue

            # Create a minimal Card object for download
            card = Card(
                id=card_data["id"],
                name=card_data["name"],
                set_code=card_data["set_code"],
                set_name=card_data["set_name"],
                card_number=card_data["card_number"],
                rarity=card_data["rarity"],
                card_type=card_data["card_type"],
                image_url=image_url,
            )

            # Check if image already exists
            if existing_file:
                image_path = self.images_dir / existing_file
                if image_path.exists():
                    self.image_stats["skipped"] += 1
                    continue

            logger.info(f"[{i}/{len(card_files)}] Downloading: {card.name}")
            image_file = self.download_image(card)

            if image_file:
                # Update the JSON file with the image path
                card_data["image_file"] = image_file
                with open(card_file, "w", encoding="utf-8") as f:
                    json.dump(card_data, f, indent=2, ensure_ascii=False)
                total_downloaded += 1

            # Throttle even after skips/failures to be polite to the asset server.
            time.sleep(IMAGE_REQUEST_DELAY)

    return total_downloaded
|
||||
|
||||
def log_image_stats(self) -> None:
    """Log a one-line summary of image download counts; silent when no
    image activity occurred this run."""
    counts = self.image_stats
    activity = counts["downloaded"] + counts["skipped"] + counts["failed"]
    if not activity:
        return
    logger.info(
        f"Images: {counts['downloaded']} downloaded, "
        f"{counts['skipped']} skipped, {counts['failed']} failed"
    )
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# CLI
|
||||
@ -831,6 +999,13 @@ Examples:
|
||||
# Scrape with limit (for testing)
|
||||
uv run python scripts/scrape_pokemon_pocket.py --set a1 --limit 5
|
||||
|
||||
# Scrape set with images
|
||||
uv run python scripts/scrape_pokemon_pocket.py --set a1 --images
|
||||
|
||||
# Download images for existing card data
|
||||
uv run python scripts/scrape_pokemon_pocket.py --download-images
|
||||
uv run python scripts/scrape_pokemon_pocket.py --download-images --set a1
|
||||
|
||||
# Scrape single card by ID
|
||||
uv run python scripts/scrape_pokemon_pocket.py --card a1-132-gardevoir
|
||||
|
||||
@ -839,42 +1014,67 @@ Examples:
|
||||
""",
|
||||
)
|
||||
|
||||
group = parser.add_mutually_exclusive_group(required=True)
|
||||
group.add_argument(
|
||||
# Main action group - what operation to perform
|
||||
action_group = parser.add_mutually_exclusive_group(required=True)
|
||||
action_group.add_argument(
|
||||
"--set",
|
||||
choices=list(SETS.keys()),
|
||||
help="Scrape all cards from a set",
|
||||
)
|
||||
group.add_argument(
|
||||
action_group.add_argument(
|
||||
"--card",
|
||||
type=str,
|
||||
help="Scrape a single card by ID (e.g., a1-132-gardevoir)",
|
||||
)
|
||||
group.add_argument(
|
||||
action_group.add_argument(
|
||||
"--reindex",
|
||||
action="store_true",
|
||||
help="Regenerate index from existing card files",
|
||||
)
|
||||
action_group.add_argument(
|
||||
"--download-images",
|
||||
choices=list(SETS.keys()) + ["all"],
|
||||
nargs="?",
|
||||
const="all",
|
||||
help="Download images for existing card data (specify set or 'all')",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--limit",
|
||||
type=int,
|
||||
help="Maximum number of cards to scrape (for testing)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--images",
|
||||
action="store_true",
|
||||
help="Download card images while scraping",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
scraper = PokemonPocketScraper()
|
||||
scraper = PokemonPocketScraper(download_images=args.images)
|
||||
|
||||
if args.reindex:
|
||||
scraper.generate_index()
|
||||
return 0
|
||||
|
||||
if args.download_images:
|
||||
# Download images for existing cards
|
||||
set_code = None if args.download_images == "all" else args.download_images
|
||||
set_info = f"set {set_code}" if set_code else "all sets"
|
||||
logger.info(f"Downloading images for existing card data ({set_info})...")
|
||||
scraper.download_images = True # Enable downloads
|
||||
downloaded = scraper.download_images_for_existing_cards(set_code)
|
||||
scraper.log_image_stats()
|
||||
logger.info(f"Image download complete: {downloaded} new images")
|
||||
return 0
|
||||
|
||||
if args.card:
|
||||
card = scraper.scrape_card(args.card)
|
||||
if card:
|
||||
scraper.save_card(card)
|
||||
scraper.generate_index()
|
||||
scraper.log_image_stats()
|
||||
logger.info(f"Successfully scraped: {card.name}")
|
||||
return 0
|
||||
else:
|
||||
@ -885,6 +1085,7 @@ Examples:
|
||||
cards = scraper.scrape_set(args.set, limit=args.limit)
|
||||
scraper.generate_index()
|
||||
scraper.save_errors()
|
||||
scraper.log_image_stats()
|
||||
|
||||
success_count = len(cards)
|
||||
error_count = len(scraper.errors)
|
||||
|
||||
Loading…
Reference in New Issue
Block a user