paper-dynasty-card-creation/migrate_all_cards_to_s3.py
Cal Corum 0a17745389 Run black and ruff across entire codebase
Standardize formatting with black and apply ruff auto-fixes.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-08 14:24:33 -05:00

377 lines
12 KiB
Python

import asyncio
import datetime
import sys
import boto3
from db_calls import db_get, db_patch
from exceptions import logger
# Configuration
# --- Migration run controls (module-level constants read by every function below) ---
DRY_RUN = False # Set to False to actually upload and update
SKIP_AWS_URLS = True # Skip URLs already pointing to S3
START_CARDSET_ID = 21 # Optional: Start from specific cardset ID
END_CARDSET_ID = 29 # Optional: End at specific cardset ID
EXCLUDE_CARDSET_IDS = [] # List of cardset IDs to skip (e.g., [1, 2, 3])
MAX_PLAYERS_PER_CARDSET = None # Optional: Limit for testing (e.g., 10)
# AWS Configuration
AWS_BUCKET_NAME = "paper-dynasty" # Change to your bucket name
AWS_REGION = "us-east-1" # Change to your region
# Public virtual-hosted-style base URL for objects in the bucket.
S3_BASE_URL = f"https://{AWS_BUCKET_NAME}.s3.{AWS_REGION}.amazonaws.com"
# Initialize S3 client (only if not dry run)
# NOTE(review): credentials/region resolve via the standard boto3 chain — confirm
# the runtime environment provides them before a live run.
s3_client = boto3.client("s3", region_name=AWS_REGION) if not DRY_RUN else None
def is_aws_url(url: str) -> bool:
    """Return True when the given URL already points at AWS S3.

    Args:
        url: URL to check (empty/None values are treated as not-S3).

    Returns:
        True if URL is already on S3, False otherwise.
    """
    if not url:
        return False
    lowered = url.lower()
    # Common S3 URL markers: legacy path-style, regional endpoints,
    # virtual-hosted bucket addressing, and the s3:// scheme.
    markers = (
        "s3.amazonaws.com",
        "s3-",  # Regional S3 URLs like s3-us-east-1
        f"{AWS_BUCKET_NAME}.s3",
        f"s3://{AWS_BUCKET_NAME}",
    )
    for marker in markers:
        if marker in lowered:
            return True
    return False
async def fetch_card_image(session, card_url: str, timeout: int = 6) -> bytes:
    """Download a card image over HTTP and return its raw bytes.

    Args:
        session: aiohttp ClientSession to use for the request.
        card_url: URL to fetch the card from.
        timeout: Total request timeout in seconds.

    Returns:
        Raw PNG image bytes.

    Raises:
        ValueError: when the server responds with a non-200 status.
    """
    import aiohttp

    request_timeout = aiohttp.ClientTimeout(total=timeout)
    async with session.get(card_url, timeout=request_timeout) as resp:
        # Anything other than 200 is treated as a fetch failure.
        if resp.status != 200:
            error_text = await resp.text()
            logger.error(f"Failed to fetch card: {error_text}")
            raise ValueError(f"Card fetch error: {error_text}")
        logger.info(f"Fetched card image from {card_url}")
        return await resp.read()
def upload_card_to_s3(
    image_data: bytes,
    player_id: int,
    card_type: str,
    release_date: str,
    cardset_id: int,
) -> str:
    """
    Upload card image to S3 and return the S3 URL with cache-busting param.

    Args:
        image_data: Raw PNG image bytes (ignored in dry-run mode; may be None)
        player_id: Player ID
        card_type: 'batting' or 'pitching'
        release_date: Date string for cache busting (e.g., '2025-11-8')
        cardset_id: Cardset ID (will be zero-padded to 3 digits)

    Returns:
        Full S3 URL with ?d= parameter

    Raises:
        Exception: re-raises any boto3 upload failure after logging it.
    """
    # Key/URL construction is identical for the dry-run and live paths,
    # so build it once up front (previously duplicated in both branches).
    cardset_str = f"{cardset_id:03d}"  # zero-pad, e.g. 7 -> "007"
    s3_key = f"cards/cardset-{cardset_str}/player-{player_id}/{card_type}card.png"
    s3_url = f"{S3_BASE_URL}/{s3_key}?d={release_date}"

    if DRY_RUN:
        # In dry run, just report what the URL would be -- nothing is uploaded.
        logger.info(
            f"[DRY RUN] Would upload {card_type} card for player {player_id} to: {s3_url}"
        )
        return s3_url

    try:
        s3_client.put_object(
            Bucket=AWS_BUCKET_NAME,
            Key=s3_key,
            Body=image_data,
            ContentType="image/png",
            CacheControl="public, max-age=300",  # 5 minute cache
            Metadata={
                "player-id": str(player_id),
                "card-type": card_type,
                "upload-date": datetime.datetime.now().isoformat(),
            },
        )
        # Return URL with cache-busting parameter
        logger.info(f"Uploaded {card_type} card for player {player_id} to S3: {s3_url}")
        return s3_url
    except Exception as e:
        logger.error(f"Failed to upload {card_type} card for player {player_id}: {e}")
        raise
async def _migrate_player_card(
    player: dict,
    field: str,
    session,
    release_date: str,
    cardset_id: int,
    cardset_stats: dict,
):
    """Migrate one image field ('image' or 'image2') for a single player.

    Fetches the card, uploads it to S3, patches the player record, and
    updates cardset_stats in place. Errors are logged and counted rather
    than raised so one bad card does not abort the whole cardset.

    Args:
        player: Player dict (must contain 'player_id', 'p_name', and *field*)
        field: Name of the image field to migrate ('image' or 'image2')
        session: aiohttp ClientSession
        release_date: Release date for cache busting
        cardset_id: Cardset ID the player belongs to
        cardset_stats: Per-cardset counters, mutated in place
    """
    player_id = player["player_id"]
    player_name = player["p_name"]
    url = player[field]
    try:
        if SKIP_AWS_URLS and is_aws_url(url):
            logger.debug(f"Skipping player {player_id} {field} - already on AWS")
            cardset_stats["skipped_aws"] += 1
            return
        # Card type is inferred from the source URL's filename.
        card_type = "pitching" if "pitching" in url else "batting"
        if not DRY_RUN:
            # Fetch and upload
            image_bytes = await fetch_card_image(session, url, timeout=6)
            s3_url = upload_card_to_s3(
                image_bytes, player_id, card_type, release_date, cardset_id
            )
            # Update player record
            await db_patch("players", object_id=player_id, params=[(field, s3_url)])
            cardset_stats["url_updates"] += 1
        else:
            # Dry run - just log what would happen
            s3_url = upload_card_to_s3(
                None, player_id, card_type, release_date, cardset_id
            )
            logger.info(
                f"[DRY RUN] Would update player {player_id} {field} to: {s3_url}"
            )
        cardset_stats["uploaded"] += 1
    except Exception as e:
        logger.error(
            f"Error processing player {player_id} ({player_name}) {field}: {e}"
        )
        cardset_stats["errors"] += 1


async def process_cardset(cardset: dict, session, release_date: str, stats: dict):
    """
    Process all players in a single cardset.

    Args:
        cardset: Cardset dictionary from API
        session: aiohttp ClientSession
        release_date: Release date for cache busting
        stats: Stats dictionary to update
    """
    cardset_id = cardset["id"]
    cardset_name = cardset["name"]
    print(f'\n{"="*60}')
    print(f"Processing Cardset: {cardset_name} (ID: {cardset_id})")
    print(f'{"="*60}')
    # Get all players for this cardset
    p_query = await db_get(
        "players",
        params=[("inc_dex", False), ("cardset_id", cardset_id), ("short_output", True)],
    )
    if not p_query or p_query["count"] == 0:
        print(f"No players found for cardset {cardset_name}")
        return
    all_players = p_query["players"]
    # Apply max players limit if set
    if MAX_PLAYERS_PER_CARDSET:
        all_players = all_players[:MAX_PLAYERS_PER_CARDSET]
        print(f"Limited to first {MAX_PLAYERS_PER_CARDSET} players for testing")
    print(f"Found {len(all_players)} players")
    cardset_stats = {
        "total": len(all_players),
        "skipped_aws": 0,
        "uploaded": 0,
        "errors": 0,
        "url_updates": 0,
    }
    for idx, player in enumerate(all_players):
        if idx % 50 == 0 and idx > 0:
            print(f" Progress: {idx}/{len(all_players)} players processed...")
        # Primary card, then the secondary card for dual-position players --
        # the migration logic is identical except for the field name.
        for field in ("image", "image2"):
            if player[field]:
                await _migrate_player_card(
                    player, field, session, release_date, cardset_id, cardset_stats
                )
    # Print cardset summary
    print(f"\nCardset {cardset_name} Summary:")
    print(f' Total players: {cardset_stats["total"]}')
    print(f' Skipped (already AWS): {cardset_stats["skipped_aws"]}')
    print(f' Uploaded: {cardset_stats["uploaded"]}')
    print(f' URL updates: {cardset_stats["url_updates"]}')
    print(f' Errors: {cardset_stats["errors"]}')
    # Update global stats
    for key in cardset_stats:
        stats[key] = stats.get(key, 0) + cardset_stats[key]
async def main(args):
    """Drive the full migration: fetch cardsets, filter, process, summarize.

    Args:
        args: CLI arguments after the script name (currently unused).
    """
    import aiohttp

    banner = "=" * 60
    print(f"\n{banner}")
    print("PAPER DYNASTY - BATCH CARD MIGRATION TO AWS S3")
    print(banner)
    mode_desc = (
        "DRY RUN (no changes will be made)" if DRY_RUN else "LIVE (will upload and update)"
    )
    print(f"Mode: {mode_desc}")
    print(f"Skip AWS URLs: {SKIP_AWS_URLS}")
    if START_CARDSET_ID:
        print(f"Start Cardset ID: {START_CARDSET_ID}")
    if END_CARDSET_ID:
        print(f"End Cardset ID: {END_CARDSET_ID}")
    if EXCLUDE_CARDSET_IDS:
        print(f"Excluded Cardset IDs: {EXCLUDE_CARDSET_IDS}")
    print(f"{banner}\n")

    # Get all cardsets
    print("Fetching all cardsets...")
    c_query = await db_get("cardsets")
    if not c_query or c_query["count"] == 0:
        print("No cardsets found!")
        return
    all_cardsets = c_query["cardsets"]
    print(f"Found {len(all_cardsets)} total cardsets")

    # Keep only cardsets inside the configured ID window and not excluded.
    filtered_cardsets = [
        cs
        for cs in all_cardsets
        if not (START_CARDSET_ID and cs["id"] < START_CARDSET_ID)
        and not (END_CARDSET_ID and cs["id"] > END_CARDSET_ID)
        and cs["id"] not in EXCLUDE_CARDSET_IDS
    ]
    print(f"Processing {len(filtered_cardsets)} cardsets after filters\n")

    # Generate release date for cache busting (non-zero-padded, e.g. 2025-11-8)
    now = datetime.datetime.now()
    release_date = f"{now.year}-{now.month}-{now.day}"

    # Global statistics accumulated across all cardsets
    stats = {
        "cardsets_processed": 0,
        "total": 0,
        "skipped_aws": 0,
        "uploaded": 0,
        "errors": 0,
        "url_updates": 0,
    }
    start_time = datetime.datetime.now()

    # Create persistent aiohttp session for all card fetches
    async with aiohttp.ClientSession() as session:
        for cardset in filtered_cardsets:
            try:
                await process_cardset(cardset, session, release_date, stats)
                stats["cardsets_processed"] += 1
            except Exception as e:
                # One failed cardset should not stop the rest of the run.
                logger.error(f'Failed to process cardset {cardset["name"]}: {e}')
                continue

    # Print final summary
    runtime = datetime.datetime.now() - start_time
    print(f"\n{banner}")
    print("FINAL SUMMARY")
    print(banner)
    print(f'Mode: {"DRY RUN" if DRY_RUN else "LIVE"}')
    print(f'Cardsets processed: {stats["cardsets_processed"]}')
    print(f'Total player cards: {stats["total"]}')
    print(f'Skipped (already AWS): {stats["skipped_aws"]}')
    print(f'Uploaded to S3: {stats["uploaded"]}')
    print(f'URL updates: {stats["url_updates"]}')
    print(f'Errors: {stats["errors"]}')
    print(f"Runtime: {runtime.total_seconds():.2f} seconds")
    print(banner)
    if DRY_RUN:
        print("\n*** THIS WAS A DRY RUN - NO CHANGES WERE MADE ***")
        print("Set DRY_RUN = False to actually upload and update")
if __name__ == "__main__":
asyncio.run(main(sys.argv[1:]))