Add batch migration script for uploading all card images to AWS S3
Creates migrate_all_cards_to_s3.py to migrate historical card images from Paper Dynasty API to S3 bucket. Key features: - Processes all cardsets automatically (12,966 player cards across 29 cardsets) - Detects and skips URLs already pointing to AWS S3 - Dry-run mode for previewing changes before execution - Flexible filtering by cardset ID ranges or exclusion lists - Per-cardset and global statistics tracking - Updates player records with new S3 URLs after upload 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
parent
2081a8f0ac
commit
de253eb561
339
migrate_all_cards_to_s3.py
Normal file
339
migrate_all_cards_to_s3.py
Normal file
@ -0,0 +1,339 @@
|
|||||||
|
import asyncio
|
||||||
|
import datetime
|
||||||
|
import sys
|
||||||
|
import boto3
|
||||||
|
from io import BytesIO
|
||||||
|
|
||||||
|
from creation_helpers import get_args
|
||||||
|
from db_calls import db_get, db_patch, url_get
|
||||||
|
from exceptions import logger
|
||||||
|
|
||||||
|
|
||||||
|
# Configuration
|
||||||
|
DRY_RUN = False # Set to False to actually upload and update
|
||||||
|
SKIP_AWS_URLS = True # Skip URLs already pointing to S3
|
||||||
|
START_CARDSET_ID = 21 # Optional: Start from specific cardset ID
|
||||||
|
END_CARDSET_ID = 29 # Optional: End at specific cardset ID
|
||||||
|
EXCLUDE_CARDSET_IDS = [] # List of cardset IDs to skip (e.g., [1, 2, 3])
|
||||||
|
MAX_PLAYERS_PER_CARDSET = None # Optional: Limit for testing (e.g., 10)
|
||||||
|
|
||||||
|
# AWS Configuration
|
||||||
|
AWS_BUCKET_NAME = 'paper-dynasty' # Change to your bucket name
|
||||||
|
AWS_REGION = 'us-east-1' # Change to your region
|
||||||
|
S3_BASE_URL = f'https://{AWS_BUCKET_NAME}.s3.{AWS_REGION}.amazonaws.com'
|
||||||
|
|
||||||
|
# Initialize S3 client (only if not dry run)
|
||||||
|
s3_client = boto3.client('s3', region_name=AWS_REGION) if not DRY_RUN else None
|
||||||
|
|
||||||
|
|
||||||
|
def is_aws_url(url: str) -> bool:
|
||||||
|
"""
|
||||||
|
Check if a URL is already pointing to AWS S3.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
url: URL to check
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
True if URL is already on S3, False otherwise
|
||||||
|
"""
|
||||||
|
if not url:
|
||||||
|
return False
|
||||||
|
|
||||||
|
# Check for common S3 URL patterns
|
||||||
|
s3_patterns = [
|
||||||
|
's3.amazonaws.com',
|
||||||
|
's3-', # Regional S3 URLs like s3-us-east-1
|
||||||
|
f'{AWS_BUCKET_NAME}.s3',
|
||||||
|
f's3://{AWS_BUCKET_NAME}',
|
||||||
|
]
|
||||||
|
|
||||||
|
return any(pattern in url.lower() for pattern in s3_patterns)
|
||||||
|
|
||||||
|
|
||||||
|
async def fetch_card_image(session, card_url: str, timeout: int = 6) -> bytes:
|
||||||
|
"""
|
||||||
|
Fetch card image from URL and return raw bytes.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
session: aiohttp ClientSession to use for the request
|
||||||
|
card_url: URL to fetch the card from
|
||||||
|
timeout: Request timeout in seconds
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Raw PNG image bytes
|
||||||
|
"""
|
||||||
|
import aiohttp
|
||||||
|
|
||||||
|
async with session.get(card_url, timeout=aiohttp.ClientTimeout(total=timeout)) as resp:
|
||||||
|
if resp.status == 200:
|
||||||
|
logger.info(f'Fetched card image from {card_url}')
|
||||||
|
return await resp.read()
|
||||||
|
else:
|
||||||
|
error_text = await resp.text()
|
||||||
|
logger.error(f'Failed to fetch card: {error_text}')
|
||||||
|
raise ValueError(f'Card fetch error: {error_text}')
|
||||||
|
|
||||||
|
|
||||||
|
def upload_card_to_s3(image_data: bytes, player_id: int, card_type: str, release_date: str, cardset_id: int) -> str:
|
||||||
|
"""
|
||||||
|
Upload card image to S3 and return the S3 URL with cache-busting param.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
image_data: Raw PNG image bytes
|
||||||
|
player_id: Player ID
|
||||||
|
card_type: 'batting' or 'pitching'
|
||||||
|
release_date: Date string for cache busting (e.g., '2025-11-8')
|
||||||
|
cardset_id: Cardset ID (will be zero-padded to 3 digits)
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Full S3 URL with ?d= parameter
|
||||||
|
"""
|
||||||
|
if DRY_RUN:
|
||||||
|
# In dry run, just return what the URL would be
|
||||||
|
cardset_str = f'{cardset_id:03d}'
|
||||||
|
s3_key = f'cards/cardset-{cardset_str}/player-{player_id}/{card_type}card.png'
|
||||||
|
s3_url = f'{S3_BASE_URL}/{s3_key}?d={release_date}'
|
||||||
|
logger.info(f'[DRY RUN] Would upload {card_type} card for player {player_id} to: {s3_url}')
|
||||||
|
return s3_url
|
||||||
|
|
||||||
|
# Format cardset_id with 3 digits and leading zeros
|
||||||
|
cardset_str = f'{cardset_id:03d}'
|
||||||
|
s3_key = f'cards/cardset-{cardset_str}/player-{player_id}/{card_type}card.png'
|
||||||
|
|
||||||
|
try:
|
||||||
|
s3_client.put_object(
|
||||||
|
Bucket=AWS_BUCKET_NAME,
|
||||||
|
Key=s3_key,
|
||||||
|
Body=image_data,
|
||||||
|
ContentType='image/png',
|
||||||
|
CacheControl='public, max-age=300', # 5 minute cache
|
||||||
|
Metadata={
|
||||||
|
'player-id': str(player_id),
|
||||||
|
'card-type': card_type,
|
||||||
|
'upload-date': datetime.datetime.now().isoformat()
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
# Return URL with cache-busting parameter
|
||||||
|
s3_url = f'{S3_BASE_URL}/{s3_key}?d={release_date}'
|
||||||
|
logger.info(f'Uploaded {card_type} card for player {player_id} to S3: {s3_url}')
|
||||||
|
return s3_url
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f'Failed to upload {card_type} card for player {player_id}: {e}')
|
||||||
|
raise
|
||||||
|
|
||||||
|
|
||||||
|
async def process_cardset(cardset: dict, session, release_date: str, stats: dict):
|
||||||
|
"""
|
||||||
|
Process all players in a single cardset.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
cardset: Cardset dictionary from API
|
||||||
|
session: aiohttp ClientSession
|
||||||
|
release_date: Release date for cache busting
|
||||||
|
stats: Stats dictionary to update
|
||||||
|
"""
|
||||||
|
cardset_id = cardset['id']
|
||||||
|
cardset_name = cardset['name']
|
||||||
|
|
||||||
|
print(f'\n{"="*60}')
|
||||||
|
print(f'Processing Cardset: {cardset_name} (ID: {cardset_id})')
|
||||||
|
print(f'{"="*60}')
|
||||||
|
|
||||||
|
# Get all players for this cardset
|
||||||
|
p_query = await db_get(
|
||||||
|
'players',
|
||||||
|
params=[('inc_dex', False), ('cardset_id', cardset_id), ('short_output', True)]
|
||||||
|
)
|
||||||
|
|
||||||
|
if not p_query or p_query['count'] == 0:
|
||||||
|
print(f'No players found for cardset {cardset_name}')
|
||||||
|
return
|
||||||
|
|
||||||
|
all_players = p_query['players']
|
||||||
|
|
||||||
|
# Apply max players limit if set
|
||||||
|
if MAX_PLAYERS_PER_CARDSET:
|
||||||
|
all_players = all_players[:MAX_PLAYERS_PER_CARDSET]
|
||||||
|
print(f'Limited to first {MAX_PLAYERS_PER_CARDSET} players for testing')
|
||||||
|
|
||||||
|
print(f'Found {len(all_players)} players')
|
||||||
|
|
||||||
|
cardset_stats = {
|
||||||
|
'total': len(all_players),
|
||||||
|
'skipped_aws': 0,
|
||||||
|
'uploaded': 0,
|
||||||
|
'errors': 0,
|
||||||
|
'url_updates': 0
|
||||||
|
}
|
||||||
|
|
||||||
|
for idx, player in enumerate(all_players):
|
||||||
|
player_id = player['player_id']
|
||||||
|
player_name = player['p_name']
|
||||||
|
|
||||||
|
if idx % 50 == 0 and idx > 0:
|
||||||
|
print(f' Progress: {idx}/{len(all_players)} players processed...')
|
||||||
|
|
||||||
|
# Process primary image
|
||||||
|
if player['image']:
|
||||||
|
try:
|
||||||
|
if SKIP_AWS_URLS and is_aws_url(player['image']):
|
||||||
|
logger.debug(f'Skipping player {player_id} image - already on AWS')
|
||||||
|
cardset_stats['skipped_aws'] += 1
|
||||||
|
else:
|
||||||
|
card_type = 'pitching' if 'pitching' in player['image'] else 'batting'
|
||||||
|
|
||||||
|
if not DRY_RUN:
|
||||||
|
# Fetch and upload
|
||||||
|
image_bytes = await fetch_card_image(session, player['image'], timeout=6)
|
||||||
|
s3_url = upload_card_to_s3(image_bytes, player_id, card_type, release_date, cardset_id)
|
||||||
|
|
||||||
|
# Update player record
|
||||||
|
await db_patch('players', object_id=player_id, params=[('image', s3_url)])
|
||||||
|
cardset_stats['url_updates'] += 1
|
||||||
|
else:
|
||||||
|
# Dry run - just log what would happen
|
||||||
|
s3_url = upload_card_to_s3(None, player_id, card_type, release_date, cardset_id)
|
||||||
|
logger.info(f'[DRY RUN] Would update player {player_id} image to: {s3_url}')
|
||||||
|
|
||||||
|
cardset_stats['uploaded'] += 1
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f'Error processing player {player_id} ({player_name}) image: {e}')
|
||||||
|
cardset_stats['errors'] += 1
|
||||||
|
|
||||||
|
# Process secondary image (dual-position players)
|
||||||
|
if player['image2']:
|
||||||
|
try:
|
||||||
|
if SKIP_AWS_URLS and is_aws_url(player['image2']):
|
||||||
|
logger.debug(f'Skipping player {player_id} image2 - already on AWS')
|
||||||
|
cardset_stats['skipped_aws'] += 1
|
||||||
|
else:
|
||||||
|
card_type = 'pitching' if 'pitching' in player['image2'] else 'batting'
|
||||||
|
|
||||||
|
if not DRY_RUN:
|
||||||
|
# Fetch and upload
|
||||||
|
image_bytes = await fetch_card_image(session, player['image2'], timeout=6)
|
||||||
|
s3_url = upload_card_to_s3(image_bytes, player_id, card_type, release_date, cardset_id)
|
||||||
|
|
||||||
|
# Update player record
|
||||||
|
await db_patch('players', object_id=player_id, params=[('image2', s3_url)])
|
||||||
|
cardset_stats['url_updates'] += 1
|
||||||
|
else:
|
||||||
|
# Dry run - just log what would happen
|
||||||
|
s3_url = upload_card_to_s3(None, player_id, card_type, release_date, cardset_id)
|
||||||
|
logger.info(f'[DRY RUN] Would update player {player_id} image2 to: {s3_url}')
|
||||||
|
|
||||||
|
cardset_stats['uploaded'] += 1
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f'Error processing player {player_id} ({player_name}) image2: {e}')
|
||||||
|
cardset_stats['errors'] += 1
|
||||||
|
|
||||||
|
# Print cardset summary
|
||||||
|
print(f'\nCardset {cardset_name} Summary:')
|
||||||
|
print(f' Total players: {cardset_stats["total"]}')
|
||||||
|
print(f' Skipped (already AWS): {cardset_stats["skipped_aws"]}')
|
||||||
|
print(f' Uploaded: {cardset_stats["uploaded"]}')
|
||||||
|
print(f' URL updates: {cardset_stats["url_updates"]}')
|
||||||
|
print(f' Errors: {cardset_stats["errors"]}')
|
||||||
|
|
||||||
|
# Update global stats
|
||||||
|
for key in cardset_stats:
|
||||||
|
stats[key] = stats.get(key, 0) + cardset_stats[key]
|
||||||
|
|
||||||
|
|
||||||
|
async def main(args):
|
||||||
|
import aiohttp
|
||||||
|
|
||||||
|
print(f'\n{"="*60}')
|
||||||
|
print(f'PAPER DYNASTY - BATCH CARD MIGRATION TO AWS S3')
|
||||||
|
print(f'{"="*60}')
|
||||||
|
print(f'Mode: {"DRY RUN (no changes will be made)" if DRY_RUN else "LIVE (will upload and update)"}')
|
||||||
|
print(f'Skip AWS URLs: {SKIP_AWS_URLS}')
|
||||||
|
if START_CARDSET_ID:
|
||||||
|
print(f'Start Cardset ID: {START_CARDSET_ID}')
|
||||||
|
if END_CARDSET_ID:
|
||||||
|
print(f'End Cardset ID: {END_CARDSET_ID}')
|
||||||
|
if EXCLUDE_CARDSET_IDS:
|
||||||
|
print(f'Excluded Cardset IDs: {EXCLUDE_CARDSET_IDS}')
|
||||||
|
print(f'{"="*60}\n')
|
||||||
|
|
||||||
|
# Get all cardsets
|
||||||
|
print('Fetching all cardsets...')
|
||||||
|
c_query = await db_get('cardsets')
|
||||||
|
|
||||||
|
if not c_query or c_query['count'] == 0:
|
||||||
|
print('No cardsets found!')
|
||||||
|
return
|
||||||
|
|
||||||
|
all_cardsets = c_query['cardsets']
|
||||||
|
print(f'Found {len(all_cardsets)} total cardsets')
|
||||||
|
|
||||||
|
# Filter cardsets based on configuration
|
||||||
|
filtered_cardsets = []
|
||||||
|
for cardset in all_cardsets:
|
||||||
|
cardset_id = cardset['id']
|
||||||
|
|
||||||
|
# Apply filters
|
||||||
|
if START_CARDSET_ID and cardset_id < START_CARDSET_ID:
|
||||||
|
continue
|
||||||
|
if END_CARDSET_ID and cardset_id > END_CARDSET_ID:
|
||||||
|
continue
|
||||||
|
if cardset_id in EXCLUDE_CARDSET_IDS:
|
||||||
|
continue
|
||||||
|
|
||||||
|
filtered_cardsets.append(cardset)
|
||||||
|
|
||||||
|
print(f'Processing {len(filtered_cardsets)} cardsets after filters\n')
|
||||||
|
|
||||||
|
# Generate release date for cache busting
|
||||||
|
now = datetime.datetime.now()
|
||||||
|
release_date = f'{now.year}-{now.month}-{now.day}'
|
||||||
|
|
||||||
|
# Global statistics
|
||||||
|
stats = {
|
||||||
|
'cardsets_processed': 0,
|
||||||
|
'total': 0,
|
||||||
|
'skipped_aws': 0,
|
||||||
|
'uploaded': 0,
|
||||||
|
'errors': 0,
|
||||||
|
'url_updates': 0
|
||||||
|
}
|
||||||
|
|
||||||
|
start_time = datetime.datetime.now()
|
||||||
|
|
||||||
|
# Create persistent aiohttp session for all card fetches
|
||||||
|
async with aiohttp.ClientSession() as session:
|
||||||
|
for cardset in filtered_cardsets:
|
||||||
|
try:
|
||||||
|
await process_cardset(cardset, session, release_date, stats)
|
||||||
|
stats['cardsets_processed'] += 1
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f'Failed to process cardset {cardset["name"]}: {e}')
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Print final summary
|
||||||
|
runtime = datetime.datetime.now() - start_time
|
||||||
|
|
||||||
|
print(f'\n{"="*60}')
|
||||||
|
print(f'FINAL SUMMARY')
|
||||||
|
print(f'{"="*60}')
|
||||||
|
print(f'Mode: {"DRY RUN" if DRY_RUN else "LIVE"}')
|
||||||
|
print(f'Cardsets processed: {stats["cardsets_processed"]}')
|
||||||
|
print(f'Total player cards: {stats["total"]}')
|
||||||
|
print(f'Skipped (already AWS): {stats["skipped_aws"]}')
|
||||||
|
print(f'Uploaded to S3: {stats["uploaded"]}')
|
||||||
|
print(f'URL updates: {stats["url_updates"]}')
|
||||||
|
print(f'Errors: {stats["errors"]}')
|
||||||
|
print(f'Runtime: {runtime.total_seconds():.2f} seconds')
|
||||||
|
print(f'{"="*60}')
|
||||||
|
|
||||||
|
if DRY_RUN:
|
||||||
|
print('\n*** THIS WAS A DRY RUN - NO CHANGES WERE MADE ***')
|
||||||
|
print('Set DRY_RUN = False to actually upload and update')
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
asyncio.run(main(sys.argv[1:]))
|
||||||
Loading…
Reference in New Issue
Block a user