paper-dynasty-card-creation/check_cardset_rarity.py
Cal Corum 0a17745389 Run black and ruff across entire codebase
Standardize formatting with black and apply ruff auto-fixes.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-08 14:24:33 -05:00

470 lines
15 KiB
Python

"""
Script to analyze rarity distribution and card costs for a specific cardset.
"""
import asyncio
import logging
import pandas as pd
from db_calls import db_get
from rarity_thresholds import get_pitcher_thresholds, get_batter_thresholds
# Set up rotating logger
logger = logging.getLogger(f"{__name__}")
handler = logging.StreamHandler()
handler.setFormatter(
logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
)
logger.addHandler(handler)
logger.setLevel(logging.INFO)
async def analyze_cardset(cardset_id: int):
"""Analyze rarity distribution and card costs for a specific cardset."""
logger.info(f"Starting analysis for cardset {cardset_id}...\n")
# Get cardset info to extract the season year
c_query = await db_get("cardsets", object_id=cardset_id)
if c_query is None:
logger.error(f"Cardset {cardset_id} not found")
return
cardset_name = c_query.get("name", "")
logger.info(f"Cardset: {cardset_name}")
# Extract year from cardset name (e.g., "2005 Live" -> 2005)
import re
year_match = re.search(r"(\d{4})", cardset_name)
if year_match:
season = int(year_match.group(1))
logger.info(f"Detected season: {season}")
else:
season = 2024 # Default fallback
logger.warning(
f"Could not detect season from cardset name, using default: {season}"
)
logger.info("")
# Get all players in this cardset
p_query = await db_get("players", params=[("cardset_id", cardset_id)])
if p_query is None or p_query["count"] == 0:
logger.error("No players found")
return
players_df = pd.DataFrame(p_query["players"])
logger.info(f"Found {len(players_df)} total players")
# Extract rarity ID if it's a dict
if isinstance(players_df["rarity"].iloc[0], dict):
players_df["rarity_id"] = players_df["rarity"].apply(
lambda x: x["id"] if isinstance(x, dict) else x
)
else:
players_df["rarity_id"] = players_df["rarity"]
# Get batting and pitching cards separately
bc_query = await db_get("battingcards", params=[("cardset_id", cardset_id)])
pc_query = await db_get("pitchingcards", params=[("cardset_id", cardset_id)])
batting_cards_df = (
pd.DataFrame(bc_query["cards"]) if bc_query["count"] > 0 else pd.DataFrame()
)
pitching_cards_df = (
pd.DataFrame(pc_query["cards"]) if pc_query["count"] > 0 else pd.DataFrame()
)
logger.info(f"Found {len(batting_cards_df)} batting cards")
logger.info(f"Found {len(pitching_cards_df)} pitching cards\n")
# Get thresholds for the season
pitcher_thresholds = get_pitcher_thresholds(season)
batter_thresholds = get_batter_thresholds(season)
# Analyze overall rarity distribution
analyze_overall_rarity(players_df)
# Analyze batting cards
if len(batting_cards_df) > 0:
analyze_batting_cards(batting_cards_df, players_df, batter_thresholds)
# Analyze pitching cards
if len(pitching_cards_df) > 0:
analyze_pitching_cards(pitching_cards_df, players_df, pitcher_thresholds)
def analyze_overall_rarity(players_df: pd.DataFrame):
"""Analyze overall rarity distribution."""
logger.info("=" * 60)
logger.info("OVERALL RARITY DISTRIBUTION")
logger.info("=" * 60)
rarity_counts = players_df["rarity_id"].value_counts().sort_index()
rarity_names = {
1: "Diamond",
2: "Gold",
3: "Silver",
4: "Bronze",
5: "Common",
99: "Hall of Fame",
}
total = len(players_df)
for rarity_id, count in rarity_counts.items():
rarity_name = rarity_names.get(rarity_id, f"Unknown ({rarity_id})")
pct = (count / total) * 100
logger.info(f"{rarity_name:15} ({rarity_id:2}): {count:5} cards ({pct:5.1f}%)")
logger.info("-" * 60)
logger.info(f"Total: {total} cards\n")
def analyze_batting_cards(
batting_cards_df: pd.DataFrame, players_df: pd.DataFrame, thresholds
):
"""Analyze batting card rarities and costs."""
logger.info("=" * 60)
logger.info("BATTING CARD ANALYSIS")
logger.info("=" * 60)
# Extract player ID from player reference (may be URL or dict)
sample_player = batting_cards_df["player"].iloc[0]
if isinstance(sample_player, dict):
# The dict has 'player_id' not 'id'
batting_cards_df["player_ref"] = batting_cards_df["player"].apply(
lambda x: (
int(x.get("player_id"))
if isinstance(x, dict) and x.get("player_id")
else None
)
)
elif isinstance(sample_player, str):
# Extract ID from URL like "/api/v2/players/123"
batting_cards_df["player_ref"] = (
batting_cards_df["player"].str.extract(r"/(\d+)$")[0].astype(int)
)
elif isinstance(sample_player, int):
batting_cards_df["player_ref"] = batting_cards_df["player"]
else:
logger.error(f"Unknown player reference type: {type(sample_player)}")
batting_cards_df["player_ref"] = batting_cards_df["player"]
# Merge with player data to get rarity
batting_with_player = batting_cards_df.merge(
players_df[["player_id", "p_name", "rarity_id"]],
left_on="player_ref",
right_on="player_id",
how="left",
)
# Count rarities
rarity_counts = batting_with_player["rarity_id"].value_counts().sort_index()
rarity_names = {
1: "Diamond",
2: "Gold",
3: "Silver",
4: "Bronze",
5: "Common",
99: "Hall of Fame",
}
logger.info("\nBatting Card Rarity Distribution:")
total = len(batting_with_player)
for rarity_id, count in rarity_counts.items():
rarity_name = rarity_names.get(rarity_id, f"Unknown ({rarity_id})")
pct = (count / total) * 100
logger.info(f"{rarity_name:15} ({rarity_id:2}): {count:5} cards ({pct:5.1f}%)")
# Check for cost anomalies
logger.info("\n" + "-" * 60)
logger.info("BATTING CARD COST ANALYSIS")
logger.info("-" * 60)
expected_costs = {
5: 20, # Common
4: 40, # Bronze
3: 80, # Silver
2: 160, # Gold
1: 320, # Diamond
99: 640, # Hall of Fame
}
# Note: cost is stored on the player record, not the card record
# We'll check player costs instead
player_costs = players_df[
players_df["player_id"].isin(batting_with_player["player_ref"])
].copy()
player_costs_with_rarity = player_costs.merge(
batting_with_player[["player_ref", "rarity_id"]].drop_duplicates(),
left_on="player_id",
right_on="player_ref",
how="left",
suffixes=("", "_card"),
)
cost_issues = []
for _, player in player_costs_with_rarity.iterrows():
rarity = player.get("rarity_id_card")
cost = player.get("cost")
expected = expected_costs.get(rarity)
if cost != expected:
cost_issues.append(
{
"player": player.get("p_name"),
"player_id": player.get("player_id"),
"rarity": rarity,
"actual_cost": cost,
"expected_cost": expected,
}
)
if cost_issues:
logger.warning(f"Found {len(cost_issues)} cost anomalies:")
for issue in cost_issues[:20]: # Show first 20
logger.warning(
f" {issue['player']} (Player ID: {issue['player_id']}): "
f"Rarity {issue['rarity']} has cost {issue['actual_cost']}, "
f"expected {issue['expected_cost']}"
)
if len(cost_issues) > 20:
logger.warning(f" ... and {len(cost_issues) - 20} more")
else:
logger.info("✓ No cost anomalies found")
# Check for OPS-rarity alignment
logger.info("\n" + "-" * 60)
logger.info("BATTING OPS-RARITY ALIGNMENT")
logger.info("-" * 60)
ops_mismatches = []
for _, card in batting_with_player.iterrows():
ops = card.get("total_OPS")
rarity = card.get("rarity_id")
if pd.isna(ops) or ops is None:
continue
expected_rarity = thresholds.get_rarity(ops)
if expected_rarity != rarity:
ops_mismatches.append(
{
"player": card.get("p_name"),
"card_id": card.get("id"),
"ops": ops,
"actual_rarity": rarity,
"expected_rarity": expected_rarity,
}
)
if ops_mismatches:
logger.warning(f"Found {len(ops_mismatches)} OPS-rarity mismatches:")
for issue in ops_mismatches[:20]:
logger.warning(
f" {issue['player']} (Card ID: {issue['card_id']}): "
f"OPS {issue['ops']:.3f} assigned rarity {issue['actual_rarity']}, "
f"expected {issue['expected_rarity']}"
)
if len(ops_mismatches) > 20:
logger.warning(f" ... and {len(ops_mismatches) - 20} more")
else:
logger.info("✓ All OPS values align with rarity assignments")
logger.info("")
def analyze_pitching_cards(
pitching_cards_df: pd.DataFrame, players_df: pd.DataFrame, thresholds
):
"""Analyze pitching card rarities and costs."""
logger.info("=" * 60)
logger.info("PITCHING CARD ANALYSIS")
logger.info("=" * 60)
# Extract player ID from player reference (may be URL or dict)
sample_player = pitching_cards_df["player"].iloc[0]
if isinstance(sample_player, dict):
# The dict has 'player_id' not 'id'
pitching_cards_df["player_ref"] = pitching_cards_df["player"].apply(
lambda x: (
int(x.get("player_id"))
if isinstance(x, dict) and x.get("player_id")
else None
)
)
elif isinstance(sample_player, str):
# Extract ID from URL like "/api/v2/players/123"
pitching_cards_df["player_ref"] = (
pitching_cards_df["player"].str.extract(r"/(\d+)$")[0].astype(int)
)
else:
pitching_cards_df["player_ref"] = pitching_cards_df["player"]
# Merge with player data to get rarity
pitching_with_player = pitching_cards_df.merge(
players_df[["player_id", "p_name", "rarity_id"]],
left_on="player_ref",
right_on="player_id",
how="left",
)
# Count rarities
rarity_counts = pitching_with_player["rarity_id"].value_counts().sort_index()
rarity_names = {
1: "Diamond",
2: "Gold",
3: "Silver",
4: "Bronze",
5: "Common",
99: "Hall of Fame",
}
logger.info("\nPitching Card Rarity Distribution:")
total = len(pitching_with_player)
for rarity_id, count in rarity_counts.items():
rarity_name = rarity_names.get(rarity_id, f"Unknown ({rarity_id})")
pct = (count / total) * 100
logger.info(f"{rarity_name:15} ({rarity_id:2}): {count:5} cards ({pct:5.1f}%)")
# Separate starters and relievers
pitching_with_player["is_starter"] = (
pitching_with_player["starter_rating"].fillna(0) >= 4
)
starters = pitching_with_player[pitching_with_player["is_starter"]]
relievers = pitching_with_player[~pitching_with_player["is_starter"]]
logger.info(f"\nStarters: {len(starters)}, Relievers: {len(relievers)}")
# Check for cost anomalies
logger.info("\n" + "-" * 60)
logger.info("PITCHING CARD COST ANALYSIS")
logger.info("-" * 60)
expected_costs = {
5: 20, # Common
4: 40, # Bronze
3: 80, # Silver
2: 160, # Gold
1: 320, # Diamond
99: 640, # Hall of Fame
}
# Note: cost is stored on the player record, not the card record
# We'll check player costs instead
player_costs = players_df[
players_df["player_id"].isin(pitching_with_player["player_ref"])
].copy()
player_costs_with_rarity = player_costs.merge(
pitching_with_player[["player_ref", "rarity_id"]].drop_duplicates(),
left_on="player_id",
right_on="player_ref",
how="left",
suffixes=("", "_card"),
)
cost_issues = []
for _, player in player_costs_with_rarity.iterrows():
rarity = player.get("rarity_id_card")
cost = player.get("cost")
expected = expected_costs.get(rarity)
if cost != expected:
cost_issues.append(
{
"player": player.get("p_name"),
"player_id": player.get("player_id"),
"rarity": rarity,
"actual_cost": cost,
"expected_cost": expected,
}
)
if cost_issues:
logger.warning(f"Found {len(cost_issues)} cost anomalies:")
for issue in cost_issues[:20]:
logger.warning(
f" {issue['player']} (Player ID: {issue['player_id']}): "
f"Rarity {issue['rarity']} has cost {issue['actual_cost']}, "
f"expected {issue['expected_cost']}"
)
if len(cost_issues) > 20:
logger.warning(f" ... and {len(cost_issues) - 20} more")
else:
logger.info("✓ No cost anomalies found")
# Check for OPS-rarity alignment
logger.info("\n" + "-" * 60)
logger.info("PITCHING OPS-RARITY ALIGNMENT")
logger.info("-" * 60)
ops_mismatches = []
for _, card in pitching_with_player.iterrows():
ops = card.get("total_OPS")
rarity = card.get("rarity_id")
is_starter = card.get("is_starter", False)
if pd.isna(ops) or ops is None:
continue
if is_starter:
expected_rarity = thresholds.get_rarity_for_starter(ops)
else:
expected_rarity = thresholds.get_rarity_for_reliever(ops)
if expected_rarity != rarity:
ops_mismatches.append(
{
"player": card.get("p_name"),
"card_id": card.get("id"),
"ops": ops,
"is_starter": is_starter,
"actual_rarity": rarity,
"expected_rarity": expected_rarity,
}
)
if ops_mismatches:
logger.warning(f"Found {len(ops_mismatches)} OPS-rarity mismatches:")
for issue in ops_mismatches[:20]:
role = "SP" if issue["is_starter"] else "RP"
logger.warning(
f" {issue['player']} ({role}, Card ID: {issue['card_id']}): "
f"OPS {issue['ops']:.3f} assigned rarity {issue['actual_rarity']}, "
f"expected {issue['expected_rarity']}"
)
if len(ops_mismatches) > 20:
logger.warning(f" ... and {len(ops_mismatches) - 20} more")
else:
logger.info("✓ All OPS values align with rarity assignments")
logger.info("")
async def main():
"""Main execution function."""
import sys
# Parse command-line arguments
if len(sys.argv) >= 2:
CARDSET_ID = int(sys.argv[1])
else:
# Default value if no arguments provided
CARDSET_ID = 27
logger.info(f"Using default cardset_id: {CARDSET_ID}")
logger.info("Usage: python check_cardset_rarity.py <cardset_id>\n")
await analyze_cardset(CARDSET_ID)
logger.info("=" * 60)
logger.info("Analysis complete")
logger.info("=" * 60)
if __name__ == "__main__":
asyncio.run(main())