Add batch migration script for uploading all card images to AWS S3

Creates migrate_all_cards_to_s3.py to migrate historical card images from
the Paper Dynasty API to an S3 bucket. Key features:

- Processes all cardsets automatically (12,966 player cards across 29 cardsets)
- Detects and skips URLs already pointing to AWS S3
- Dry-run mode for previewing changes before execution (see the workflow sketch below)
- Flexible filtering by cardset ID ranges or exclusion lists
- Per-cardset and global statistics tracking
- Updates player records with new S3 URLs after upload
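
Typical workflow (a sketch; the script is configured via module-level
constants rather than CLI flags):

    # in migrate_all_cards_to_s3.py
    DRY_RUN = True                 # preview only, no uploads or DB writes
    MAX_PLAYERS_PER_CARDSET = 10   # optional: small sample per cardset

    $ python migrate_all_cards_to_s3.py

Review the [DRY RUN] log output, then set DRY_RUN = False and rerun to
perform the uploads and record updates.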

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
Cal Corum 2025-11-16 08:42:40 -06:00
parent 2081a8f0ac
commit de253eb561

migrate_all_cards_to_s3.py (new file, 339 lines)

@@ -0,0 +1,339 @@
import asyncio
import datetime
import sys

import boto3
from io import BytesIO
from creation_helpers import get_args
from db_calls import db_get, db_patch, url_get
from exceptions import logger

# Configuration
DRY_RUN = False  # Set to False to actually upload and update
SKIP_AWS_URLS = True  # Skip URLs already pointing to S3
START_CARDSET_ID = 21  # Optional: Start from specific cardset ID
END_CARDSET_ID = 29  # Optional: End at specific cardset ID
EXCLUDE_CARDSET_IDS = []  # List of cardset IDs to skip (e.g., [1, 2, 3])
MAX_PLAYERS_PER_CARDSET = None  # Optional: Limit for testing (e.g., 10)

# AWS Configuration
AWS_BUCKET_NAME = 'paper-dynasty'  # Change to your bucket name
AWS_REGION = 'us-east-1'  # Change to your region
S3_BASE_URL = f'https://{AWS_BUCKET_NAME}.s3.{AWS_REGION}.amazonaws.com'

# Initialize S3 client (only if not dry run)
s3_client = boto3.client('s3', region_name=AWS_REGION) if not DRY_RUN else None
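
# Note (assumption about the deployment environment): boto3 resolves
# credentials through its default provider chain (environment variables,
# ~/.aws/credentials, or an instance/role profile); no keys are read
# from this script itself.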

def is_aws_url(url: str) -> bool:
    """
    Check if a URL is already pointing to AWS S3.

    Args:
        url: URL to check

    Returns:
        True if URL is already on S3, False otherwise
    """
    if not url:
        return False

    # Check for common S3 URL patterns
    s3_patterns = [
        's3.amazonaws.com',
        's3-',  # Regional S3 URLs like s3-us-east-1
        f'{AWS_BUCKET_NAME}.s3',
        f's3://{AWS_BUCKET_NAME}',
    ]
    return any(pattern in url.lower() for pattern in s3_patterns)
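
# Illustrative checks (hypothetical URLs, assuming the bucket configured above):
#   is_aws_url('https://paper-dynasty.s3.us-east-1.amazonaws.com/cards/x.png')  -> True
#   is_aws_url('https://cdn.example.com/cards/x.png')                           -> False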

async def fetch_card_image(session, card_url: str, timeout: int = 6) -> bytes:
    """
    Fetch card image from URL and return raw bytes.

    Args:
        session: aiohttp ClientSession to use for the request
        card_url: URL to fetch the card from
        timeout: Request timeout in seconds

    Returns:
        Raw PNG image bytes
    """
    import aiohttp

    async with session.get(card_url, timeout=aiohttp.ClientTimeout(total=timeout)) as resp:
        if resp.status == 200:
            logger.info(f'Fetched card image from {card_url}')
            return await resp.read()
        else:
            error_text = await resp.text()
            logger.error(f'Failed to fetch card: {error_text}')
            raise ValueError(f'Card fetch error: {error_text}')

def upload_card_to_s3(image_data: bytes, player_id: int, card_type: str, release_date: str, cardset_id: int) -> str:
    """
    Upload card image to S3 and return the S3 URL with cache-busting param.

    Args:
        image_data: Raw PNG image bytes
        player_id: Player ID
        card_type: 'batting' or 'pitching'
        release_date: Date string for cache busting (e.g., '2025-11-8')
        cardset_id: Cardset ID (will be zero-padded to 3 digits)

    Returns:
        Full S3 URL with ?d= parameter
    """
    if DRY_RUN:
        # In dry run, just return what the URL would be
        cardset_str = f'{cardset_id:03d}'
        s3_key = f'cards/cardset-{cardset_str}/player-{player_id}/{card_type}card.png'
        s3_url = f'{S3_BASE_URL}/{s3_key}?d={release_date}'
        logger.info(f'[DRY RUN] Would upload {card_type} card for player {player_id} to: {s3_url}')
        return s3_url

    # Format cardset_id with 3 digits and leading zeros
    cardset_str = f'{cardset_id:03d}'
    s3_key = f'cards/cardset-{cardset_str}/player-{player_id}/{card_type}card.png'

    try:
        s3_client.put_object(
            Bucket=AWS_BUCKET_NAME,
            Key=s3_key,
            Body=image_data,
            ContentType='image/png',
            CacheControl='public, max-age=300',  # 5 minute cache
            Metadata={
                'player-id': str(player_id),
                'card-type': card_type,
                'upload-date': datetime.datetime.now().isoformat()
            }
        )
        # Return URL with cache-busting parameter
        s3_url = f'{S3_BASE_URL}/{s3_key}?d={release_date}'
        logger.info(f'Uploaded {card_type} card for player {player_id} to S3: {s3_url}')
        return s3_url
    except Exception as e:
        logger.error(f'Failed to upload {card_type} card for player {player_id}: {e}')
        raise
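
# Resulting object layout (illustrative; player ID 123 is hypothetical):
#   cards/cardset-021/player-123/battingcard.png
# served at {S3_BASE_URL}/cards/cardset-021/player-123/battingcard.png?d=2025-11-8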

async def process_cardset(cardset: dict, session, release_date: str, stats: dict):
    """
    Process all players in a single cardset.

    Args:
        cardset: Cardset dictionary from API
        session: aiohttp ClientSession
        release_date: Release date for cache busting
        stats: Stats dictionary to update
    """
    cardset_id = cardset['id']
    cardset_name = cardset['name']

    print(f'\n{"="*60}')
    print(f'Processing Cardset: {cardset_name} (ID: {cardset_id})')
    print(f'{"="*60}')

    # Get all players for this cardset
    p_query = await db_get(
        'players',
        params=[('inc_dex', False), ('cardset_id', cardset_id), ('short_output', True)]
    )
    if not p_query or p_query['count'] == 0:
        print(f'No players found for cardset {cardset_name}')
        return

    all_players = p_query['players']

    # Apply max players limit if set
    if MAX_PLAYERS_PER_CARDSET:
        all_players = all_players[:MAX_PLAYERS_PER_CARDSET]
        print(f'Limited to first {MAX_PLAYERS_PER_CARDSET} players for testing')

    print(f'Found {len(all_players)} players')

    cardset_stats = {
        'total': len(all_players),
        'skipped_aws': 0,
        'uploaded': 0,
        'errors': 0,
        'url_updates': 0
    }

    for idx, player in enumerate(all_players):
        player_id = player['player_id']
        player_name = player['p_name']

        if idx % 50 == 0 and idx > 0:
            print(f' Progress: {idx}/{len(all_players)} players processed...')

        # Process primary image
        if player['image']:
            try:
                if SKIP_AWS_URLS and is_aws_url(player['image']):
                    logger.debug(f'Skipping player {player_id} image - already on AWS')
                    cardset_stats['skipped_aws'] += 1
                else:
                    card_type = 'pitching' if 'pitching' in player['image'] else 'batting'
                    if not DRY_RUN:
                        # Fetch and upload
                        image_bytes = await fetch_card_image(session, player['image'], timeout=6)
                        s3_url = upload_card_to_s3(image_bytes, player_id, card_type, release_date, cardset_id)
                        # Update player record
                        await db_patch('players', object_id=player_id, params=[('image', s3_url)])
                        cardset_stats['url_updates'] += 1
                    else:
                        # Dry run - just log what would happen
                        s3_url = upload_card_to_s3(None, player_id, card_type, release_date, cardset_id)
                        logger.info(f'[DRY RUN] Would update player {player_id} image to: {s3_url}')
                    cardset_stats['uploaded'] += 1
            except Exception as e:
                logger.error(f'Error processing player {player_id} ({player_name}) image: {e}')
                cardset_stats['errors'] += 1

        # Process secondary image (dual-position players)
        if player['image2']:
            try:
                if SKIP_AWS_URLS and is_aws_url(player['image2']):
                    logger.debug(f'Skipping player {player_id} image2 - already on AWS')
                    cardset_stats['skipped_aws'] += 1
                else:
                    card_type = 'pitching' if 'pitching' in player['image2'] else 'batting'
                    if not DRY_RUN:
                        # Fetch and upload
                        image_bytes = await fetch_card_image(session, player['image2'], timeout=6)
                        s3_url = upload_card_to_s3(image_bytes, player_id, card_type, release_date, cardset_id)
                        # Update player record
                        await db_patch('players', object_id=player_id, params=[('image2', s3_url)])
                        cardset_stats['url_updates'] += 1
                    else:
                        # Dry run - just log what would happen
                        s3_url = upload_card_to_s3(None, player_id, card_type, release_date, cardset_id)
                        logger.info(f'[DRY RUN] Would update player {player_id} image2 to: {s3_url}')
                    cardset_stats['uploaded'] += 1
            except Exception as e:
                logger.error(f'Error processing player {player_id} ({player_name}) image2: {e}')
                cardset_stats['errors'] += 1

    # Print cardset summary
    print(f'\nCardset {cardset_name} Summary:')
    print(f' Total players: {cardset_stats["total"]}')
    print(f' Skipped (already AWS): {cardset_stats["skipped_aws"]}')
    print(f' Uploaded: {cardset_stats["uploaded"]}')
    print(f' URL updates: {cardset_stats["url_updates"]}')
    print(f' Errors: {cardset_stats["errors"]}')

    # Update global stats
    for key in cardset_stats:
        stats[key] = stats.get(key, 0) + cardset_stats[key]
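
# Aggregation note: stats.get(key, 0) lets the same keys accumulate across
# cardsets, e.g. two cardsets of 100 and 50 players leave stats['total'] == 150.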

async def main(args):
    import aiohttp

    print(f'\n{"="*60}')
    print(f'PAPER DYNASTY - BATCH CARD MIGRATION TO AWS S3')
    print(f'{"="*60}')
    print(f'Mode: {"DRY RUN (no changes will be made)" if DRY_RUN else "LIVE (will upload and update)"}')
    print(f'Skip AWS URLs: {SKIP_AWS_URLS}')
    if START_CARDSET_ID:
        print(f'Start Cardset ID: {START_CARDSET_ID}')
    if END_CARDSET_ID:
        print(f'End Cardset ID: {END_CARDSET_ID}')
    if EXCLUDE_CARDSET_IDS:
        print(f'Excluded Cardset IDs: {EXCLUDE_CARDSET_IDS}')
    print(f'{"="*60}\n')

    # Get all cardsets
    print('Fetching all cardsets...')
    c_query = await db_get('cardsets')
    if not c_query or c_query['count'] == 0:
        print('No cardsets found!')
        return

    all_cardsets = c_query['cardsets']
    print(f'Found {len(all_cardsets)} total cardsets')

    # Filter cardsets based on configuration
    filtered_cardsets = []
    for cardset in all_cardsets:
        cardset_id = cardset['id']
        # Apply filters
        if START_CARDSET_ID and cardset_id < START_CARDSET_ID:
            continue
        if END_CARDSET_ID and cardset_id > END_CARDSET_ID:
            continue
        if cardset_id in EXCLUDE_CARDSET_IDS:
            continue
        filtered_cardsets.append(cardset)

    print(f'Processing {len(filtered_cardsets)} cardsets after filters\n')

    # Generate release date for cache busting
    now = datetime.datetime.now()
    release_date = f'{now.year}-{now.month}-{now.day}'

    # Global statistics
    stats = {
        'cardsets_processed': 0,
        'total': 0,
        'skipped_aws': 0,
        'uploaded': 0,
        'errors': 0,
        'url_updates': 0
    }

    start_time = datetime.datetime.now()

    # Create persistent aiohttp session for all card fetches
    async with aiohttp.ClientSession() as session:
        for cardset in filtered_cardsets:
            try:
                await process_cardset(cardset, session, release_date, stats)
                stats['cardsets_processed'] += 1
            except Exception as e:
                logger.error(f'Failed to process cardset {cardset["name"]}: {e}')
                continue

    # Print final summary
    runtime = datetime.datetime.now() - start_time
    print(f'\n{"="*60}')
    print(f'FINAL SUMMARY')
    print(f'{"="*60}')
    print(f'Mode: {"DRY RUN" if DRY_RUN else "LIVE"}')
    print(f'Cardsets processed: {stats["cardsets_processed"]}')
    print(f'Total player cards: {stats["total"]}')
    print(f'Skipped (already AWS): {stats["skipped_aws"]}')
    print(f'Uploaded to S3: {stats["uploaded"]}')
    print(f'URL updates: {stats["url_updates"]}')
    print(f'Errors: {stats["errors"]}')
    print(f'Runtime: {runtime.total_seconds():.2f} seconds')
    print(f'{"="*60}')

    if DRY_RUN:
        print('\n*** THIS WAS A DRY RUN - NO CHANGES WERE MADE ***')
        print('Set DRY_RUN = False to actually upload and update')


if __name__ == '__main__':
    asyncio.run(main(sys.argv[1:]))