paper-dynasty-card-creation/migrate_all_cards_to_s3.py
Cal Corum 0a17745389 Run black and ruff across entire codebase
Standardize formatting with black and apply ruff auto-fixes.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-08 14:24:33 -05:00

377 lines
12 KiB
Python

import asyncio
import datetime
import sys
import boto3
from db_calls import db_get, db_patch
from exceptions import logger
# Configuration
# --- Migration run controls (module-level constants read by every function below) ---
DRY_RUN = False # Set to False to actually upload and update
SKIP_AWS_URLS = True # Skip URLs already pointing to S3
START_CARDSET_ID = 21 # Optional: Start from specific cardset ID
END_CARDSET_ID = 29 # Optional: End at specific cardset ID
EXCLUDE_CARDSET_IDS = [] # List of cardset IDs to skip (e.g., [1, 2, 3])
MAX_PLAYERS_PER_CARDSET = None # Optional: Limit for testing (e.g., 10)
# AWS Configuration
AWS_BUCKET_NAME = "paper-dynasty" # Change to your bucket name
AWS_REGION = "us-east-1" # Change to your region
# Public virtual-hosted-style base URL for objects in the bucket.
S3_BASE_URL = f"https://{AWS_BUCKET_NAME}.s3.{AWS_REGION}.amazonaws.com"
# Initialize S3 client (only if not dry run)
# NOTE(review): credentials/region resolve via the standard boto3 chain — confirm
# the runtime environment provides them before a live run.
s3_client = boto3.client("s3", region_name=AWS_REGION) if not DRY_RUN else None
def is_aws_url(url: str) -> bool:
    """Return True when the given URL already points at AWS S3.

    Args:
        url: URL to check (empty/None values are treated as not-S3).

    Returns:
        True if URL is already on S3, False otherwise.
    """
    if not url:
        return False
    lowered = url.lower()
    # Common S3 URL markers: legacy path-style, regional endpoints,
    # virtual-hosted bucket addressing, and the s3:// scheme.
    markers = (
        "s3.amazonaws.com",
        "s3-",  # Regional S3 URLs like s3-us-east-1
        f"{AWS_BUCKET_NAME}.s3",
        f"s3://{AWS_BUCKET_NAME}",
    )
    for marker in markers:
        if marker in lowered:
            return True
    return False
async def fetch_card_image(session, card_url: str, timeout: int = 6) -> bytes:
    """Download a card image over HTTP and return its raw bytes.

    Args:
        session: aiohttp ClientSession to use for the request.
        card_url: URL to fetch the card from.
        timeout: Total request timeout in seconds.

    Returns:
        Raw PNG image bytes.

    Raises:
        ValueError: when the server responds with a non-200 status.
    """
    import aiohttp

    request_timeout = aiohttp.ClientTimeout(total=timeout)
    async with session.get(card_url, timeout=request_timeout) as resp:
        # Anything other than 200 is treated as a fetch failure.
        if resp.status != 200:
            error_text = await resp.text()
            logger.error(f"Failed to fetch card: {error_text}")
            raise ValueError(f"Card fetch error: {error_text}")
        logger.info(f"Fetched card image from {card_url}")
        return await resp.read()
def upload_card_to_s3(
    image_data: bytes,
    player_id: int,
    card_type: str,
    release_date: str,
    cardset_id: int,
) -> str:
    """
    Upload card image to S3 and return the S3 URL with cache-busting param.

    Args:
        image_data: Raw PNG image bytes (ignored in dry-run mode; may be None)
        player_id: Player ID
        card_type: 'batting' or 'pitching'
        release_date: Date string for cache busting (e.g., '2025-11-8')
        cardset_id: Cardset ID (will be zero-padded to 3 digits)

    Returns:
        Full S3 URL with ?d= parameter

    Raises:
        Exception: re-raises any boto3 upload failure after logging it.
    """
    # Key/URL construction is identical for the dry-run and live paths,
    # so build it once up front (previously duplicated in both branches).
    cardset_str = f"{cardset_id:03d}"  # zero-pad, e.g. 7 -> "007"
    s3_key = f"cards/cardset-{cardset_str}/player-{player_id}/{card_type}card.png"
    s3_url = f"{S3_BASE_URL}/{s3_key}?d={release_date}"

    if DRY_RUN:
        # In dry run, just report what the URL would be -- nothing is uploaded.
        logger.info(
            f"[DRY RUN] Would upload {card_type} card for player {player_id} to: {s3_url}"
        )
        return s3_url

    try:
        s3_client.put_object(
            Bucket=AWS_BUCKET_NAME,
            Key=s3_key,
            Body=image_data,
            ContentType="image/png",
            CacheControl="public, max-age=300",  # 5 minute cache
            Metadata={
                "player-id": str(player_id),
                "card-type": card_type,
                "upload-date": datetime.datetime.now().isoformat(),
            },
        )
        # Return URL with cache-busting parameter
        logger.info(f"Uploaded {card_type} card for player {player_id} to S3: {s3_url}")
        return s3_url
    except Exception as e:
        logger.error(f"Failed to upload {card_type} card for player {player_id}: {e}")
        raise
async def _migrate_player_card(
    player: dict,
    field: str,
    session,
    release_date: str,
    cardset_id: int,
    cardset_stats: dict,
):
    """Migrate one image field ('image' or 'image2') for a single player.

    Fetches the card, uploads it to S3, patches the player record, and
    updates cardset_stats in place. Errors are logged and counted rather
    than raised so one bad card does not abort the whole cardset.

    Args:
        player: Player dict (must contain 'player_id', 'p_name', and *field*)
        field: Name of the image field to migrate ('image' or 'image2')
        session: aiohttp ClientSession
        release_date: Release date for cache busting
        cardset_id: Cardset ID the player belongs to
        cardset_stats: Per-cardset counters, mutated in place
    """
    player_id = player["player_id"]
    player_name = player["p_name"]
    url = player[field]
    try:
        if SKIP_AWS_URLS and is_aws_url(url):
            logger.debug(f"Skipping player {player_id} {field} - already on AWS")
            cardset_stats["skipped_aws"] += 1
            return
        # Card type is inferred from the source URL's filename.
        card_type = "pitching" if "pitching" in url else "batting"
        if not DRY_RUN:
            # Fetch and upload
            image_bytes = await fetch_card_image(session, url, timeout=6)
            s3_url = upload_card_to_s3(
                image_bytes, player_id, card_type, release_date, cardset_id
            )
            # Update player record
            await db_patch("players", object_id=player_id, params=[(field, s3_url)])
            cardset_stats["url_updates"] += 1
        else:
            # Dry run - just log what would happen
            s3_url = upload_card_to_s3(
                None, player_id, card_type, release_date, cardset_id
            )
            logger.info(
                f"[DRY RUN] Would update player {player_id} {field} to: {s3_url}"
            )
        cardset_stats["uploaded"] += 1
    except Exception as e:
        logger.error(
            f"Error processing player {player_id} ({player_name}) {field}: {e}"
        )
        cardset_stats["errors"] += 1


async def process_cardset(cardset: dict, session, release_date: str, stats: dict):
    """
    Process all players in a single cardset.

    Args:
        cardset: Cardset dictionary from API
        session: aiohttp ClientSession
        release_date: Release date for cache busting
        stats: Stats dictionary to update
    """
    cardset_id = cardset["id"]
    cardset_name = cardset["name"]
    print(f'\n{"="*60}')
    print(f"Processing Cardset: {cardset_name} (ID: {cardset_id})")
    print(f'{"="*60}')
    # Get all players for this cardset
    p_query = await db_get(
        "players",
        params=[("inc_dex", False), ("cardset_id", cardset_id), ("short_output", True)],
    )
    if not p_query or p_query["count"] == 0:
        print(f"No players found for cardset {cardset_name}")
        return
    all_players = p_query["players"]
    # Apply max players limit if set
    if MAX_PLAYERS_PER_CARDSET:
        all_players = all_players[:MAX_PLAYERS_PER_CARDSET]
        print(f"Limited to first {MAX_PLAYERS_PER_CARDSET} players for testing")
    print(f"Found {len(all_players)} players")
    cardset_stats = {
        "total": len(all_players),
        "skipped_aws": 0,
        "uploaded": 0,
        "errors": 0,
        "url_updates": 0,
    }
    for idx, player in enumerate(all_players):
        if idx % 50 == 0 and idx > 0:
            print(f" Progress: {idx}/{len(all_players)} players processed...")
        # Primary card, then the secondary card for dual-position players --
        # the migration logic is identical except for the field name.
        for field in ("image", "image2"):
            if player[field]:
                await _migrate_player_card(
                    player, field, session, release_date, cardset_id, cardset_stats
                )
    # Print cardset summary
    print(f"\nCardset {cardset_name} Summary:")
    print(f' Total players: {cardset_stats["total"]}')
    print(f' Skipped (already AWS): {cardset_stats["skipped_aws"]}')
    print(f' Uploaded: {cardset_stats["uploaded"]}')
    print(f' URL updates: {cardset_stats["url_updates"]}')
    print(f' Errors: {cardset_stats["errors"]}')
    # Update global stats
    for key in cardset_stats:
        stats[key] = stats.get(key, 0) + cardset_stats[key]
async def main(args):
    """Drive the full migration: fetch cardsets, filter, process, summarize.

    Args:
        args: CLI arguments after the script name (currently unused).
    """
    import aiohttp

    banner = "=" * 60
    print(f"\n{banner}")
    print("PAPER DYNASTY - BATCH CARD MIGRATION TO AWS S3")
    print(banner)
    mode_desc = (
        "DRY RUN (no changes will be made)" if DRY_RUN else "LIVE (will upload and update)"
    )
    print(f"Mode: {mode_desc}")
    print(f"Skip AWS URLs: {SKIP_AWS_URLS}")
    if START_CARDSET_ID:
        print(f"Start Cardset ID: {START_CARDSET_ID}")
    if END_CARDSET_ID:
        print(f"End Cardset ID: {END_CARDSET_ID}")
    if EXCLUDE_CARDSET_IDS:
        print(f"Excluded Cardset IDs: {EXCLUDE_CARDSET_IDS}")
    print(f"{banner}\n")

    # Get all cardsets
    print("Fetching all cardsets...")
    c_query = await db_get("cardsets")
    if not c_query or c_query["count"] == 0:
        print("No cardsets found!")
        return
    all_cardsets = c_query["cardsets"]
    print(f"Found {len(all_cardsets)} total cardsets")

    # Keep only cardsets inside the configured ID window and not excluded.
    filtered_cardsets = [
        cs
        for cs in all_cardsets
        if not (START_CARDSET_ID and cs["id"] < START_CARDSET_ID)
        and not (END_CARDSET_ID and cs["id"] > END_CARDSET_ID)
        and cs["id"] not in EXCLUDE_CARDSET_IDS
    ]
    print(f"Processing {len(filtered_cardsets)} cardsets after filters\n")

    # Generate release date for cache busting (non-zero-padded, e.g. 2025-11-8)
    now = datetime.datetime.now()
    release_date = f"{now.year}-{now.month}-{now.day}"

    # Global statistics accumulated across all cardsets
    stats = {
        "cardsets_processed": 0,
        "total": 0,
        "skipped_aws": 0,
        "uploaded": 0,
        "errors": 0,
        "url_updates": 0,
    }
    start_time = datetime.datetime.now()

    # Create persistent aiohttp session for all card fetches
    async with aiohttp.ClientSession() as session:
        for cardset in filtered_cardsets:
            try:
                await process_cardset(cardset, session, release_date, stats)
                stats["cardsets_processed"] += 1
            except Exception as e:
                # One failed cardset should not stop the rest of the run.
                logger.error(f'Failed to process cardset {cardset["name"]}: {e}')
                continue

    # Print final summary
    runtime = datetime.datetime.now() - start_time
    print(f"\n{banner}")
    print("FINAL SUMMARY")
    print(banner)
    print(f'Mode: {"DRY RUN" if DRY_RUN else "LIVE"}')
    print(f'Cardsets processed: {stats["cardsets_processed"]}')
    print(f'Total player cards: {stats["total"]}')
    print(f'Skipped (already AWS): {stats["skipped_aws"]}')
    print(f'Uploaded to S3: {stats["uploaded"]}')
    print(f'URL updates: {stats["url_updates"]}')
    print(f'Errors: {stats["errors"]}')
    print(f"Runtime: {runtime.total_seconds():.2f} seconds")
    print(banner)
    if DRY_RUN:
        print("\n*** THIS WAS A DRY RUN - NO CHANGES WERE MADE ***")
        print("Set DRY_RUN = False to actually upload and update")
if __name__ == "__main__":
asyncio.run(main(sys.argv[1:]))