paper-dynasty-card-creation/check_cardset_rarity.py
2025-11-08 16:57:35 -06:00

422 lines
15 KiB
Python

"""
Script to analyze rarity distribution and card costs for a specific cardset.
"""
import asyncio
import logging
from collections import Counter
import pandas as pd
from db_calls import db_get
from rarity_thresholds import get_pitcher_thresholds, get_batter_thresholds
# Module-level logger. Records are written through a plain StreamHandler
# (which defaults to stderr) with a timestamped format; despite what an
# earlier comment said, no rotating/file handler is involved.
logger = logging.getLogger(__name__)
handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
# getLogger() returns the same object for the same name, so only attach the
# handler when none is present; otherwise reloading this module would emit
# every record once per reload.
if not logger.handlers:
    logger.addHandler(handler)
logger.setLevel(logging.INFO)
async def analyze_cardset(cardset_id: int):
    """Analyze rarity distribution and card costs for a specific cardset.

    Fetches the cardset record, its players and its batting/pitching cards
    through ``db_get``, derives the season from the first four-digit number
    in the cardset name (falling back to 2024 when none is found), and then
    logs the overall, batting and pitching reports.

    Args:
        cardset_id: Identifier of the cardset to analyze.

    Returns:
        None. All results are reported through the module logger. The
        function returns early, after logging an error, when the cardset or
        its players cannot be found.
    """
    import re  # function-scope import, only needed for the season lookup

    logger.info(f'Starting analysis for cardset {cardset_id}...\n')

    # Get cardset info to extract the season year
    c_query = await db_get('cardsets', object_id=cardset_id)
    if c_query is None:
        logger.error(f'Cardset {cardset_id} not found')
        return

    # `or ''` also covers a record whose name key is present but null, which
    # would otherwise make re.search() raise TypeError.
    cardset_name = c_query.get('name') or ''
    logger.info(f'Cardset: {cardset_name}')

    # Extract year from cardset name (e.g., "2005 Live" -> 2005)
    year_match = re.search(r'(\d{4})', cardset_name)
    if year_match:
        season = int(year_match.group(1))
        logger.info(f'Detected season: {season}')
    else:
        season = 2024  # Default fallback
        logger.warning(f'Could not detect season from cardset name, using default: {season}')
    logger.info('')

    # Get all players in this cardset
    p_query = await db_get('players', params=[('cardset_id', cardset_id)])
    if p_query is None or p_query['count'] == 0:
        logger.error('No players found')
        return

    players_df = pd.DataFrame(p_query['players'])
    logger.info(f'Found {len(players_df)} total players')

    # Rarity may arrive as a nested dict or as a bare id. Normalize row by
    # row instead of trusting the first row to represent the whole column,
    # so a mixed column never leaves dicts behind in `rarity_id`.
    players_df['rarity_id'] = players_df['rarity'].apply(
        lambda x: x['id'] if isinstance(x, dict) else x
    )

    # Get batting and pitching cards separately
    bc_query = await db_get('battingcards', params=[('cardset_id', cardset_id)])
    pc_query = await db_get('pitchingcards', params=[('cardset_id', cardset_id)])

    # The None checks above show db_get may return None; treat a missing
    # response like an empty result rather than subscripting None.
    batting_cards_df = pd.DataFrame(bc_query['cards']) if bc_query and bc_query['count'] > 0 else pd.DataFrame()
    pitching_cards_df = pd.DataFrame(pc_query['cards']) if pc_query and pc_query['count'] > 0 else pd.DataFrame()

    logger.info(f'Found {len(batting_cards_df)} batting cards')
    logger.info(f'Found {len(pitching_cards_df)} pitching cards\n')

    # Get thresholds for the season
    pitcher_thresholds = get_pitcher_thresholds(season)
    batter_thresholds = get_batter_thresholds(season)

    # Analyze overall rarity distribution
    analyze_overall_rarity(players_df)

    # Analyze batting cards
    if len(batting_cards_df) > 0:
        analyze_batting_cards(batting_cards_df, players_df, batter_thresholds)

    # Analyze pitching cards
    if len(pitching_cards_df) > 0:
        analyze_pitching_cards(pitching_cards_df, players_df, pitcher_thresholds)
def analyze_overall_rarity(players_df: pd.DataFrame):
    """Log the number and percentage of players in each rarity tier.

    Args:
        players_df: Player rows; must contain a ``rarity_id`` column.

    Returns:
        None. The report is written to the module logger.
    """
    # Display labels for the known rarity ids; anything else is reported
    # as "Unknown (<id>)".
    rarity_names = {1: 'Diamond', 2: 'Gold', 3: 'Silver',
                    4: 'Bronze', 5: 'Common', 99: 'Hall of Fame'}

    banner = '=' * 60
    for header_line in (banner, 'OVERALL RARITY DISTRIBUTION', banner):
        logger.info(header_line)

    total = len(players_df)
    per_rarity = players_df['rarity_id'].value_counts().sort_index()

    for rarity_id, count in per_rarity.items():
        if rarity_id in rarity_names:
            rarity_name = rarity_names[rarity_id]
        else:
            rarity_name = f'Unknown ({rarity_id})'
        pct = (count / total) * 100
        logger.info(f'{rarity_name:15} ({rarity_id:2}): {count:5} cards ({pct:5.1f}%)')

    logger.info('-' * 60)
    logger.info(f'Total: {total} cards\n')
def analyze_batting_cards(batting_cards_df: pd.DataFrame, players_df: pd.DataFrame, thresholds):
    """Analyze batting card rarities and costs.

    Logs three reports: the rarity distribution of the batting cards, the
    players whose ``cost`` differs from the fixed cost expected for their
    rarity, and the cards whose ``total_OPS`` maps (via
    ``thresholds.get_rarity``) to a rarity other than the one assigned.

    Args:
        batting_cards_df: Batting card rows with a ``player`` column holding
            a dict (with ``player_id``), a URL string ending in the id, or
            an integer id. The frame is not modified.
        players_df: Player rows with ``player_id``, ``p_name`` and
            ``rarity_id`` columns; ``cost`` is read when present.
        thresholds: Object exposing ``get_rarity(ops)``.

    Returns:
        None. All findings are written to the module logger.
    """
    logger.info('=' * 60)
    logger.info('BATTING CARD ANALYSIS')
    logger.info('=' * 60)

    # Sampling the first row below would raise IndexError on an empty frame.
    if batting_cards_df.empty:
        logger.warning('No batting cards to analyze\n')
        return

    # Work on a copy so the caller's frame does not gain a 'player_ref' column.
    batting_cards_df = batting_cards_df.copy()

    # Extract player ID from player reference (may be URL or dict)
    sample_player = batting_cards_df['player'].iloc[0]
    if isinstance(sample_player, dict):
        # The dict has 'player_id' not 'id'. Compare against None so that an
        # id of 0 is not mistaken for a missing value.
        batting_cards_df['player_ref'] = batting_cards_df['player'].apply(
            lambda x: int(x['player_id']) if isinstance(x, dict) and x.get('player_id') is not None else None
        )
    elif isinstance(sample_player, str):
        # Extract ID from URL like "/api/v2/players/123". Rows that do not
        # match become NaN instead of aborting the whole analysis.
        batting_cards_df['player_ref'] = pd.to_numeric(
            batting_cards_df['player'].str.extract(r'/(\d+)$')[0], errors='coerce'
        )
    elif pd.api.types.is_integer(sample_player):
        # pandas hands back numpy integers, which are not instances of the
        # built-in int, hence is_integer() rather than isinstance(..., int).
        batting_cards_df['player_ref'] = batting_cards_df['player']
    else:
        logger.error(f'Unknown player reference type: {type(sample_player)}')
        batting_cards_df['player_ref'] = batting_cards_df['player']

    # Merge with player data to get rarity
    batting_with_player = batting_cards_df.merge(
        players_df[['player_id', 'p_name', 'rarity_id']],
        left_on='player_ref',
        right_on='player_id',
        how='left'
    )

    # Count rarities
    rarity_counts = batting_with_player['rarity_id'].value_counts().sort_index()
    rarity_names = {
        1: 'Diamond',
        2: 'Gold',
        3: 'Silver',
        4: 'Bronze',
        5: 'Common',
        99: 'Hall of Fame'
    }

    logger.info('\nBatting Card Rarity Distribution:')
    total = len(batting_with_player)
    for rarity_id, count in rarity_counts.items():
        rarity_name = rarity_names.get(rarity_id, f'Unknown ({rarity_id})')
        pct = (count / total) * 100
        logger.info(f'{rarity_name:15} ({rarity_id:2}): {count:5} cards ({pct:5.1f}%)')

    # Check for cost anomalies
    logger.info('\n' + '-' * 60)
    logger.info('BATTING CARD COST ANALYSIS')
    logger.info('-' * 60)

    expected_costs = {
        5: 20,    # Common
        4: 40,    # Bronze
        3: 80,    # Silver
        2: 160,   # Gold
        1: 320,   # Diamond
        99: 640   # Hall of Fame
    }

    # Note: cost is stored on the player record, not the card record
    # We'll check player costs instead
    player_costs = players_df[players_df['player_id'].isin(batting_with_player['player_ref'])].copy()
    # Both sides carry 'rarity_id'; the card-side copy is suffixed to
    # 'rarity_id_card' and is the value compared against the cost table.
    player_costs_with_rarity = player_costs.merge(
        batting_with_player[['player_ref', 'rarity_id']].drop_duplicates(),
        left_on='player_id',
        right_on='player_ref',
        how='left',
        suffixes=('', '_card')
    )

    cost_issues = []
    for _, player in player_costs_with_rarity.iterrows():
        rarity = player.get('rarity_id_card')
        cost = player.get('cost')
        # A rarity outside the table yields None, which is reported as an
        # anomaly with an expected cost of None.
        expected = expected_costs.get(rarity)
        if cost != expected:
            cost_issues.append({
                'player': player.get('p_name'),
                'player_id': player.get('player_id'),
                'rarity': rarity,
                'actual_cost': cost,
                'expected_cost': expected
            })

    if cost_issues:
        logger.warning(f'Found {len(cost_issues)} cost anomalies:')
        for issue in cost_issues[:20]:  # Show first 20
            logger.warning(f"  {issue['player']} (Player ID: {issue['player_id']}): "
                           f"Rarity {issue['rarity']} has cost {issue['actual_cost']}, "
                           f"expected {issue['expected_cost']}")
        if len(cost_issues) > 20:
            logger.warning(f'  ... and {len(cost_issues) - 20} more')
    else:
        logger.info('✓ No cost anomalies found')

    # Check for OPS-rarity alignment
    logger.info('\n' + '-' * 60)
    logger.info('BATTING OPS-RARITY ALIGNMENT')
    logger.info('-' * 60)

    ops_mismatches = []
    for _, card in batting_with_player.iterrows():
        ops = card.get('total_OPS')
        rarity = card.get('rarity_id')

        # pd.isna() is True for None as well as NaN, so one test suffices.
        if pd.isna(ops):
            continue

        expected_rarity = thresholds.get_rarity(ops)
        if expected_rarity != rarity:
            ops_mismatches.append({
                'player': card.get('p_name'),
                'card_id': card.get('id'),
                'ops': ops,
                'actual_rarity': rarity,
                'expected_rarity': expected_rarity
            })

    if ops_mismatches:
        logger.warning(f'Found {len(ops_mismatches)} OPS-rarity mismatches:')
        for issue in ops_mismatches[:20]:
            logger.warning(f"  {issue['player']} (Card ID: {issue['card_id']}): "
                           f"OPS {issue['ops']:.3f} assigned rarity {issue['actual_rarity']}, "
                           f"expected {issue['expected_rarity']}")
        if len(ops_mismatches) > 20:
            logger.warning(f'  ... and {len(ops_mismatches) - 20} more')
    else:
        logger.info('✓ All OPS values align with rarity assignments')
    logger.info('')
def analyze_pitching_cards(pitching_cards_df: pd.DataFrame, players_df: pd.DataFrame, thresholds):
    """Analyze pitching card rarities and costs.

    Logs the rarity distribution of the pitching cards, the starter/reliever
    split (a card counts as a starter when ``starter_rating`` is at least 4,
    with missing ratings treated as 0), the players whose ``cost`` differs
    from the fixed cost expected for their rarity, and the cards whose
    ``total_OPS`` maps to a rarity other than the one assigned.

    Args:
        pitching_cards_df: Pitching card rows with ``player`` and
            ``starter_rating`` columns. ``player`` may hold a dict (with
            ``player_id``), a URL string ending in the id, or an integer id.
            The frame is not modified.
        players_df: Player rows with ``player_id``, ``p_name`` and
            ``rarity_id`` columns; ``cost`` is read when present.
        thresholds: Object exposing ``get_rarity_for_starter(ops)`` and
            ``get_rarity_for_reliever(ops)``.

    Returns:
        None. All findings are written to the module logger.
    """
    logger.info('=' * 60)
    logger.info('PITCHING CARD ANALYSIS')
    logger.info('=' * 60)

    # Sampling the first row below would raise IndexError on an empty frame.
    if pitching_cards_df.empty:
        logger.warning('No pitching cards to analyze\n')
        return

    # Work on a copy so the caller's frame does not gain a 'player_ref' column.
    pitching_cards_df = pitching_cards_df.copy()

    # Extract player ID from player reference (may be URL or dict)
    sample_player = pitching_cards_df['player'].iloc[0]
    if isinstance(sample_player, dict):
        # The dict has 'player_id' not 'id'. Compare against None so that an
        # id of 0 is not mistaken for a missing value.
        pitching_cards_df['player_ref'] = pitching_cards_df['player'].apply(
            lambda x: int(x['player_id']) if isinstance(x, dict) and x.get('player_id') is not None else None
        )
    elif isinstance(sample_player, str):
        # Extract ID from URL like "/api/v2/players/123". Rows that do not
        # match become NaN instead of aborting the whole analysis.
        pitching_cards_df['player_ref'] = pd.to_numeric(
            pitching_cards_df['player'].str.extract(r'/(\d+)$')[0], errors='coerce'
        )
    elif pd.api.types.is_integer(sample_player):
        # Covers both built-in ints and the numpy integers pandas returns.
        pitching_cards_df['player_ref'] = pitching_cards_df['player']
    else:
        # Same diagnostics as the batting analysis: report the unexpected
        # type, then fall back to using the raw column.
        logger.error(f'Unknown player reference type: {type(sample_player)}')
        pitching_cards_df['player_ref'] = pitching_cards_df['player']

    # Merge with player data to get rarity
    pitching_with_player = pitching_cards_df.merge(
        players_df[['player_id', 'p_name', 'rarity_id']],
        left_on='player_ref',
        right_on='player_id',
        how='left'
    )

    # Count rarities
    rarity_counts = pitching_with_player['rarity_id'].value_counts().sort_index()
    rarity_names = {
        1: 'Diamond',
        2: 'Gold',
        3: 'Silver',
        4: 'Bronze',
        5: 'Common',
        99: 'Hall of Fame'
    }

    logger.info('\nPitching Card Rarity Distribution:')
    total = len(pitching_with_player)
    for rarity_id, count in rarity_counts.items():
        rarity_name = rarity_names.get(rarity_id, f'Unknown ({rarity_id})')
        pct = (count / total) * 100
        logger.info(f'{rarity_name:15} ({rarity_id:2}): {count:5} cards ({pct:5.1f}%)')

    # Separate starters and relievers
    pitching_with_player['is_starter'] = pitching_with_player['starter_rating'].fillna(0) >= 4
    starters = pitching_with_player[pitching_with_player['is_starter']]
    relievers = pitching_with_player[~pitching_with_player['is_starter']]

    logger.info(f'\nStarters: {len(starters)}, Relievers: {len(relievers)}')

    # Check for cost anomalies
    logger.info('\n' + '-' * 60)
    logger.info('PITCHING CARD COST ANALYSIS')
    logger.info('-' * 60)

    expected_costs = {
        5: 20,    # Common
        4: 40,    # Bronze
        3: 80,    # Silver
        2: 160,   # Gold
        1: 320,   # Diamond
        99: 640   # Hall of Fame
    }

    # Note: cost is stored on the player record, not the card record
    # We'll check player costs instead
    player_costs = players_df[players_df['player_id'].isin(pitching_with_player['player_ref'])].copy()
    # Both sides carry 'rarity_id'; the card-side copy is suffixed to
    # 'rarity_id_card' and is the value compared against the cost table.
    player_costs_with_rarity = player_costs.merge(
        pitching_with_player[['player_ref', 'rarity_id']].drop_duplicates(),
        left_on='player_id',
        right_on='player_ref',
        how='left',
        suffixes=('', '_card')
    )

    cost_issues = []
    for _, player in player_costs_with_rarity.iterrows():
        rarity = player.get('rarity_id_card')
        cost = player.get('cost')
        # A rarity outside the table yields None, which is reported as an
        # anomaly with an expected cost of None.
        expected = expected_costs.get(rarity)
        if cost != expected:
            cost_issues.append({
                'player': player.get('p_name'),
                'player_id': player.get('player_id'),
                'rarity': rarity,
                'actual_cost': cost,
                'expected_cost': expected
            })

    if cost_issues:
        logger.warning(f'Found {len(cost_issues)} cost anomalies:')
        for issue in cost_issues[:20]:
            logger.warning(f"  {issue['player']} (Player ID: {issue['player_id']}): "
                           f"Rarity {issue['rarity']} has cost {issue['actual_cost']}, "
                           f"expected {issue['expected_cost']}")
        if len(cost_issues) > 20:
            logger.warning(f'  ... and {len(cost_issues) - 20} more')
    else:
        logger.info('✓ No cost anomalies found')

    # Check for OPS-rarity alignment
    logger.info('\n' + '-' * 60)
    logger.info('PITCHING OPS-RARITY ALIGNMENT')
    logger.info('-' * 60)

    ops_mismatches = []
    for _, card in pitching_with_player.iterrows():
        ops = card.get('total_OPS')
        rarity = card.get('rarity_id')
        is_starter = card.get('is_starter', False)

        # pd.isna() is True for None as well as NaN, so one test suffices.
        if pd.isna(ops):
            continue

        # Starters and relievers are graded against separate threshold sets.
        if is_starter:
            expected_rarity = thresholds.get_rarity_for_starter(ops)
        else:
            expected_rarity = thresholds.get_rarity_for_reliever(ops)

        if expected_rarity != rarity:
            ops_mismatches.append({
                'player': card.get('p_name'),
                'card_id': card.get('id'),
                'ops': ops,
                'is_starter': is_starter,
                'actual_rarity': rarity,
                'expected_rarity': expected_rarity
            })

    if ops_mismatches:
        logger.warning(f'Found {len(ops_mismatches)} OPS-rarity mismatches:')
        for issue in ops_mismatches[:20]:
            role = 'SP' if issue['is_starter'] else 'RP'
            logger.warning(f"  {issue['player']} ({role}, Card ID: {issue['card_id']}): "
                           f"OPS {issue['ops']:.3f} assigned rarity {issue['actual_rarity']}, "
                           f"expected {issue['expected_rarity']}")
        if len(ops_mismatches) > 20:
            logger.warning(f'  ... and {len(ops_mismatches) - 20} more')
    else:
        logger.info('✓ All OPS values align with rarity assignments')
    logger.info('')
async def main():
    """Main execution function.

    Reads an optional cardset id from the first command-line argument
    (defaulting to 27 when none is given), runs the analysis, and logs a
    completion banner.

    Raises:
        SystemExit: With exit status 2 when the argument is not an integer.
    """
    import sys

    # Parse command-line arguments
    if len(sys.argv) >= 2:
        try:
            cardset_id = int(sys.argv[1])
        except ValueError:
            # Report bad input with a usage hint rather than a raw traceback.
            logger.error(f'Invalid cardset_id {sys.argv[1]!r}: expected an integer')
            logger.info('Usage: python check_cardset_rarity.py <cardset_id>')
            sys.exit(2)
    else:
        # Default value if no arguments provided
        cardset_id = 27
        logger.info(f'Using default cardset_id: {cardset_id}')
        logger.info('Usage: python check_cardset_rarity.py <cardset_id>\n')

    await analyze_cardset(cardset_id)

    logger.info('=' * 60)
    logger.info('Analysis complete')
    logger.info('=' * 60)


# Run the analysis only when executed as a script, not when imported.
if __name__ == '__main__':
    asyncio.run(main())