Standardize formatting with black and apply ruff auto-fixes. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
470 lines
15 KiB
Python
470 lines
15 KiB
Python
"""
|
|
Script to analyze rarity distribution and card costs for a specific cardset.
|
|
"""
|
|
|
|
import asyncio
|
|
import logging
|
|
import pandas as pd
|
|
from db_calls import db_get
|
|
from rarity_thresholds import get_pitcher_thresholds, get_batter_thresholds
|
|
|
|
# Set up a console (stream) logger for this module.
# NOTE: the original comment said "rotating logger", but no RotatingFileHandler
# is configured — output goes to stderr via StreamHandler.
logger = logging.getLogger(__name__)  # f-string wrapper around __name__ was redundant
handler = logging.StreamHandler()
handler.setFormatter(
    logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
)
logger.addHandler(handler)
logger.setLevel(logging.INFO)
|
|
|
|
|
|
async def analyze_cardset(cardset_id: int):
    """Analyze rarity distribution and card costs for a specific cardset.

    Fetches the cardset record, its players, and its batting/pitching
    cards, then logs rarity-distribution, cost-anomaly, and OPS-alignment
    reports via the sibling analyze_* functions.

    Args:
        cardset_id: Primary key of the cardset to analyze.

    Returns:
        None. Results are emitted through the module logger. Returns early
        if the cardset or its players cannot be found.
    """
    logger.info(f"Starting analysis for cardset {cardset_id}...\n")

    # Get cardset info to extract the season year
    c_query = await db_get("cardsets", object_id=cardset_id)
    if c_query is None:
        logger.error(f"Cardset {cardset_id} not found")
        return

    cardset_name = c_query.get("name", "")
    logger.info(f"Cardset: {cardset_name}")

    # Extract year from cardset name (e.g., "2005 Live" -> 2005)
    import re

    year_match = re.search(r"(\d{4})", cardset_name)
    if year_match:
        season = int(year_match.group(1))
        logger.info(f"Detected season: {season}")
    else:
        season = 2024  # Default fallback
        logger.warning(
            f"Could not detect season from cardset name, using default: {season}"
        )

    logger.info("")

    # Get all players in this cardset
    p_query = await db_get("players", params=[("cardset_id", cardset_id)])
    if p_query is None or p_query["count"] == 0:
        logger.error("No players found")
        return

    players_df = pd.DataFrame(p_query["players"])
    logger.info(f"Found {len(players_df)} total players")

    # Extract rarity ID if it's a dict (API may embed the rarity object)
    if isinstance(players_df["rarity"].iloc[0], dict):
        players_df["rarity_id"] = players_df["rarity"].apply(
            lambda x: x["id"] if isinstance(x, dict) else x
        )
    else:
        players_df["rarity_id"] = players_df["rarity"]

    # Get batting and pitching cards separately.
    # FIX: guard against db_get returning None before indexing "count",
    # mirroring the None check applied to p_query above.
    bc_query = await db_get("battingcards", params=[("cardset_id", cardset_id)])
    pc_query = await db_get("pitchingcards", params=[("cardset_id", cardset_id)])

    batting_cards_df = (
        pd.DataFrame(bc_query["cards"])
        if bc_query is not None and bc_query["count"] > 0
        else pd.DataFrame()
    )
    pitching_cards_df = (
        pd.DataFrame(pc_query["cards"])
        if pc_query is not None and pc_query["count"] > 0
        else pd.DataFrame()
    )

    logger.info(f"Found {len(batting_cards_df)} batting cards")
    logger.info(f"Found {len(pitching_cards_df)} pitching cards\n")

    # Get thresholds for the season
    pitcher_thresholds = get_pitcher_thresholds(season)
    batter_thresholds = get_batter_thresholds(season)

    # Analyze overall rarity distribution
    analyze_overall_rarity(players_df)

    # Analyze batting cards
    if len(batting_cards_df) > 0:
        analyze_batting_cards(batting_cards_df, players_df, batter_thresholds)

    # Analyze pitching cards
    if len(pitching_cards_df) > 0:
        analyze_pitching_cards(pitching_cards_df, players_df, pitcher_thresholds)
|
|
|
|
|
|
def analyze_overall_rarity(players_df: pd.DataFrame):
    """Log how the cardset's players are distributed across rarity tiers.

    Args:
        players_df: Player records; must contain a "rarity_id" column.
    """
    banner = "=" * 60
    logger.info(banner)
    logger.info("OVERALL RARITY DISTRIBUTION")
    logger.info(banner)

    # Human-readable labels for each rarity tier ID.
    labels = {
        1: "Diamond",
        2: "Gold",
        3: "Silver",
        4: "Bronze",
        5: "Common",
        99: "Hall of Fame",
    }

    total = len(players_df)
    tally = players_df["rarity_id"].value_counts().sort_index()

    # One line per tier: label, ID, count, and percentage of all players.
    for rid, n in tally.items():
        label = labels.get(rid, f"Unknown ({rid})")
        share = (n / total) * 100
        logger.info(f"{label:15} ({rid:2}): {n:5} cards ({share:5.1f}%)")

    logger.info("-" * 60)
    logger.info(f"Total: {total} cards\n")
|
|
|
|
|
|
def analyze_batting_cards(
    batting_cards_df: pd.DataFrame, players_df: pd.DataFrame, thresholds
):
    """Analyze batting card rarities and costs.

    Logs three reports: the rarity distribution of batting cards, player
    cost values that deviate from the expected cost for their rarity, and
    cards whose total_OPS does not map to the assigned rarity under the
    season thresholds.

    Args:
        batting_cards_df: Batting cards for the cardset. NOTE: mutated in
            place — a "player_ref" column is added.
        players_df: Player records with "player_id", "p_name", "rarity_id"
            and "cost" columns.
        thresholds: Season threshold object exposing get_rarity(ops).
    """
    logger.info("=" * 60)
    logger.info("BATTING CARD ANALYSIS")
    logger.info("=" * 60)

    # Extract player ID from player reference (may be URL or dict).
    # The first row is used to sniff the representation for the whole column;
    # assumes the column is homogeneous — TODO confirm against the API.
    sample_player = batting_cards_df["player"].iloc[0]
    if isinstance(sample_player, dict):
        # The dict has 'player_id' not 'id'
        batting_cards_df["player_ref"] = batting_cards_df["player"].apply(
            lambda x: (
                int(x.get("player_id"))
                if isinstance(x, dict) and x.get("player_id")
                else None
            )
        )
    elif isinstance(sample_player, str):
        # Extract ID from URL like "/api/v2/players/123"
        batting_cards_df["player_ref"] = (
            batting_cards_df["player"].str.extract(r"/(\d+)$")[0].astype(int)
        )
    elif isinstance(sample_player, int):
        # Already a bare integer ID; use it directly.
        batting_cards_df["player_ref"] = batting_cards_df["player"]
    else:
        # Unknown representation: log it and fall back to the raw column,
        # which will simply fail to match in the merge below.
        logger.error(f"Unknown player reference type: {type(sample_player)}")
        batting_cards_df["player_ref"] = batting_cards_df["player"]

    # Merge with player data to get rarity (left join keeps cards whose
    # player record is missing; their rarity_id/p_name come back as NaN).
    batting_with_player = batting_cards_df.merge(
        players_df[["player_id", "p_name", "rarity_id"]],
        left_on="player_ref",
        right_on="player_id",
        how="left",
    )

    # Count rarities
    rarity_counts = batting_with_player["rarity_id"].value_counts().sort_index()

    # Human-readable labels for each rarity tier ID.
    rarity_names = {
        1: "Diamond",
        2: "Gold",
        3: "Silver",
        4: "Bronze",
        5: "Common",
        99: "Hall of Fame",
    }

    logger.info("\nBatting Card Rarity Distribution:")
    total = len(batting_with_player)
    for rarity_id, count in rarity_counts.items():
        rarity_name = rarity_names.get(rarity_id, f"Unknown ({rarity_id})")
        pct = (count / total) * 100
        logger.info(f"{rarity_name:15} ({rarity_id:2}): {count:5} cards ({pct:5.1f}%)")

    # Check for cost anomalies
    logger.info("\n" + "-" * 60)
    logger.info("BATTING CARD COST ANALYSIS")
    logger.info("-" * 60)

    # Expected cost doubles at each tier step up from Common.
    expected_costs = {
        5: 20,  # Common
        4: 40,  # Bronze
        3: 80,  # Silver
        2: 160,  # Gold
        1: 320,  # Diamond
        99: 640,  # Hall of Fame
    }

    # Note: cost is stored on the player record, not the card record
    # We'll check player costs instead
    player_costs = players_df[
        players_df["player_id"].isin(batting_with_player["player_ref"])
    ].copy()
    # players_df already carries "rarity_id", so the card-side column from
    # this merge lands as "rarity_id_card" via the suffixes argument.
    player_costs_with_rarity = player_costs.merge(
        batting_with_player[["player_ref", "rarity_id"]].drop_duplicates(),
        left_on="player_id",
        right_on="player_ref",
        how="left",
        suffixes=("", "_card"),
    )

    cost_issues = []
    for _, player in player_costs_with_rarity.iterrows():
        rarity = player.get("rarity_id_card")
        cost = player.get("cost")
        # Unknown rarity -> expected is None, which flags the row below.
        expected = expected_costs.get(rarity)

        if cost != expected:
            cost_issues.append(
                {
                    "player": player.get("p_name"),
                    "player_id": player.get("player_id"),
                    "rarity": rarity,
                    "actual_cost": cost,
                    "expected_cost": expected,
                }
            )

    if cost_issues:
        logger.warning(f"Found {len(cost_issues)} cost anomalies:")
        for issue in cost_issues[:20]:  # Show first 20
            logger.warning(
                f"  {issue['player']} (Player ID: {issue['player_id']}): "
                f"Rarity {issue['rarity']} has cost {issue['actual_cost']}, "
                f"expected {issue['expected_cost']}"
            )
        if len(cost_issues) > 20:
            logger.warning(f"  ... and {len(cost_issues) - 20} more")
    else:
        logger.info("✓ No cost anomalies found")

    # Check for OPS-rarity alignment
    logger.info("\n" + "-" * 60)
    logger.info("BATTING OPS-RARITY ALIGNMENT")
    logger.info("-" * 60)

    ops_mismatches = []
    for _, card in batting_with_player.iterrows():
        ops = card.get("total_OPS")
        rarity = card.get("rarity_id")

        # Skip cards with no OPS value (e.g. missing stats).
        if pd.isna(ops) or ops is None:
            continue

        expected_rarity = thresholds.get_rarity(ops)

        if expected_rarity != rarity:
            ops_mismatches.append(
                {
                    "player": card.get("p_name"),
                    "card_id": card.get("id"),
                    "ops": ops,
                    "actual_rarity": rarity,
                    "expected_rarity": expected_rarity,
                }
            )

    if ops_mismatches:
        logger.warning(f"Found {len(ops_mismatches)} OPS-rarity mismatches:")
        for issue in ops_mismatches[:20]:
            logger.warning(
                f"  {issue['player']} (Card ID: {issue['card_id']}): "
                f"OPS {issue['ops']:.3f} assigned rarity {issue['actual_rarity']}, "
                f"expected {issue['expected_rarity']}"
            )
        if len(ops_mismatches) > 20:
            logger.warning(f"  ... and {len(ops_mismatches) - 20} more")
    else:
        logger.info("✓ All OPS values align with rarity assignments")

    logger.info("")
|
|
|
|
|
|
def analyze_pitching_cards(
    pitching_cards_df: pd.DataFrame, players_df: pd.DataFrame, thresholds
):
    """Analyze pitching card rarities and costs.

    Logs the rarity distribution of pitching cards, the starter/reliever
    split, player cost values that deviate from the expected cost for their
    rarity, and cards whose total_OPS does not map to the assigned rarity
    under the season thresholds (starters and relievers use separate
    threshold tables).

    Args:
        pitching_cards_df: Pitching cards for the cardset. NOTE: mutated in
            place — "player_ref" and "is_starter" columns are added.
        players_df: Player records with "player_id", "p_name", "rarity_id"
            and "cost" columns.
        thresholds: Season threshold object exposing
            get_rarity_for_starter(ops) and get_rarity_for_reliever(ops).
    """
    logger.info("=" * 60)
    logger.info("PITCHING CARD ANALYSIS")
    logger.info("=" * 60)

    # Extract player ID from player reference (may be URL or dict).
    # The first row is used to sniff the representation for the whole column.
    sample_player = pitching_cards_df["player"].iloc[0]
    if isinstance(sample_player, dict):
        # The dict has 'player_id' not 'id'
        pitching_cards_df["player_ref"] = pitching_cards_df["player"].apply(
            lambda x: (
                int(x.get("player_id"))
                if isinstance(x, dict) and x.get("player_id")
                else None
            )
        )
    elif isinstance(sample_player, str):
        # Extract ID from URL like "/api/v2/players/123"
        pitching_cards_df["player_ref"] = (
            pitching_cards_df["player"].str.extract(r"/(\d+)$")[0].astype(int)
        )
    elif isinstance(sample_player, int):
        # Already a bare integer ID; use it directly.
        pitching_cards_df["player_ref"] = pitching_cards_df["player"]
    else:
        # FIX: log unknown reference types, matching analyze_batting_cards
        # (previously this fallback was silent).
        logger.error(f"Unknown player reference type: {type(sample_player)}")
        pitching_cards_df["player_ref"] = pitching_cards_df["player"]

    # Merge with player data to get rarity (left join keeps cards whose
    # player record is missing; their rarity_id/p_name come back as NaN).
    pitching_with_player = pitching_cards_df.merge(
        players_df[["player_id", "p_name", "rarity_id"]],
        left_on="player_ref",
        right_on="player_id",
        how="left",
    )

    # Count rarities
    rarity_counts = pitching_with_player["rarity_id"].value_counts().sort_index()

    # Human-readable labels for each rarity tier ID.
    rarity_names = {
        1: "Diamond",
        2: "Gold",
        3: "Silver",
        4: "Bronze",
        5: "Common",
        99: "Hall of Fame",
    }

    logger.info("\nPitching Card Rarity Distribution:")
    total = len(pitching_with_player)
    for rarity_id, count in rarity_counts.items():
        rarity_name = rarity_names.get(rarity_id, f"Unknown ({rarity_id})")
        pct = (count / total) * 100
        logger.info(f"{rarity_name:15} ({rarity_id:2}): {count:5} cards ({pct:5.1f}%)")

    # Separate starters and relievers: a starter_rating of 4+ marks a starter.
    pitching_with_player["is_starter"] = (
        pitching_with_player["starter_rating"].fillna(0) >= 4
    )
    starters = pitching_with_player[pitching_with_player["is_starter"]]
    relievers = pitching_with_player[~pitching_with_player["is_starter"]]

    logger.info(f"\nStarters: {len(starters)}, Relievers: {len(relievers)}")

    # Check for cost anomalies
    logger.info("\n" + "-" * 60)
    logger.info("PITCHING CARD COST ANALYSIS")
    logger.info("-" * 60)

    # Expected cost doubles at each tier step up from Common.
    expected_costs = {
        5: 20,  # Common
        4: 40,  # Bronze
        3: 80,  # Silver
        2: 160,  # Gold
        1: 320,  # Diamond
        99: 640,  # Hall of Fame
    }

    # Note: cost is stored on the player record, not the card record
    # We'll check player costs instead
    player_costs = players_df[
        players_df["player_id"].isin(pitching_with_player["player_ref"])
    ].copy()
    # players_df already carries "rarity_id", so the card-side column from
    # this merge lands as "rarity_id_card" via the suffixes argument.
    player_costs_with_rarity = player_costs.merge(
        pitching_with_player[["player_ref", "rarity_id"]].drop_duplicates(),
        left_on="player_id",
        right_on="player_ref",
        how="left",
        suffixes=("", "_card"),
    )

    cost_issues = []
    for _, player in player_costs_with_rarity.iterrows():
        rarity = player.get("rarity_id_card")
        cost = player.get("cost")
        # Unknown rarity -> expected is None, which flags the row below.
        expected = expected_costs.get(rarity)

        if cost != expected:
            cost_issues.append(
                {
                    "player": player.get("p_name"),
                    "player_id": player.get("player_id"),
                    "rarity": rarity,
                    "actual_cost": cost,
                    "expected_cost": expected,
                }
            )

    if cost_issues:
        logger.warning(f"Found {len(cost_issues)} cost anomalies:")
        for issue in cost_issues[:20]:
            logger.warning(
                f"  {issue['player']} (Player ID: {issue['player_id']}): "
                f"Rarity {issue['rarity']} has cost {issue['actual_cost']}, "
                f"expected {issue['expected_cost']}"
            )
        if len(cost_issues) > 20:
            logger.warning(f"  ... and {len(cost_issues) - 20} more")
    else:
        logger.info("✓ No cost anomalies found")

    # Check for OPS-rarity alignment
    logger.info("\n" + "-" * 60)
    logger.info("PITCHING OPS-RARITY ALIGNMENT")
    logger.info("-" * 60)

    ops_mismatches = []
    for _, card in pitching_with_player.iterrows():
        ops = card.get("total_OPS")
        rarity = card.get("rarity_id")
        is_starter = card.get("is_starter", False)

        # Skip cards with no OPS value; pd.isna already covers None.
        if pd.isna(ops):
            continue

        # Starters and relievers are graded against separate thresholds.
        if is_starter:
            expected_rarity = thresholds.get_rarity_for_starter(ops)
        else:
            expected_rarity = thresholds.get_rarity_for_reliever(ops)

        if expected_rarity != rarity:
            ops_mismatches.append(
                {
                    "player": card.get("p_name"),
                    "card_id": card.get("id"),
                    "ops": ops,
                    "is_starter": is_starter,
                    "actual_rarity": rarity,
                    "expected_rarity": expected_rarity,
                }
            )

    if ops_mismatches:
        logger.warning(f"Found {len(ops_mismatches)} OPS-rarity mismatches:")
        for issue in ops_mismatches[:20]:
            role = "SP" if issue["is_starter"] else "RP"
            logger.warning(
                f"  {issue['player']} ({role}, Card ID: {issue['card_id']}): "
                f"OPS {issue['ops']:.3f} assigned rarity {issue['actual_rarity']}, "
                f"expected {issue['expected_rarity']}"
            )
        if len(ops_mismatches) > 20:
            logger.warning(f"  ... and {len(ops_mismatches) - 20} more")
    else:
        logger.info("✓ All OPS values align with rarity assignments")

    logger.info("")
|
|
|
|
|
|
async def main():
    """Entry point: resolve the cardset ID from argv and run the analysis."""
    import sys

    # First CLI argument selects the cardset; otherwise fall back to 27.
    if len(sys.argv) < 2:
        CARDSET_ID = 27
        logger.info(f"Using default cardset_id: {CARDSET_ID}")
        logger.info("Usage: python check_cardset_rarity.py <cardset_id>\n")
    else:
        CARDSET_ID = int(sys.argv[1])

    await analyze_cardset(CARDSET_ID)

    banner = "=" * 60
    logger.info(banner)
    logger.info("Analysis complete")
    logger.info(banner)
|
|
|
|
|
|
# Run the async entry point only when executed as a script.
if __name__ == "__main__":
    asyncio.run(main())
|