""" Script to analyze rarity distribution and card costs for a specific cardset. """ import asyncio import logging from collections import Counter import pandas as pd from db_calls import db_get from rarity_thresholds import get_pitcher_thresholds, get_batter_thresholds # Set up rotating logger logger = logging.getLogger(f'{__name__}') handler = logging.StreamHandler() handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')) logger.addHandler(handler) logger.setLevel(logging.INFO) async def analyze_cardset(cardset_id: int): """Analyze rarity distribution and card costs for a specific cardset.""" logger.info(f'Starting analysis for cardset {cardset_id}...\n') # Get cardset info to extract the season year c_query = await db_get('cardsets', object_id=cardset_id) if c_query is None: logger.error(f'Cardset {cardset_id} not found') return cardset_name = c_query.get('name', '') logger.info(f'Cardset: {cardset_name}') # Extract year from cardset name (e.g., "2005 Live" -> 2005) import re year_match = re.search(r'(\d{4})', cardset_name) if year_match: season = int(year_match.group(1)) logger.info(f'Detected season: {season}') else: season = 2024 # Default fallback logger.warning(f'Could not detect season from cardset name, using default: {season}') logger.info('') # Get all players in this cardset p_query = await db_get('players', params=[('cardset_id', cardset_id)]) if p_query is None or p_query['count'] == 0: logger.error('No players found') return players_df = pd.DataFrame(p_query['players']) logger.info(f'Found {len(players_df)} total players') # Extract rarity ID if it's a dict if isinstance(players_df['rarity'].iloc[0], dict): players_df['rarity_id'] = players_df['rarity'].apply(lambda x: x['id'] if isinstance(x, dict) else x) else: players_df['rarity_id'] = players_df['rarity'] # Get batting and pitching cards separately bc_query = await db_get('battingcards', params=[('cardset_id', cardset_id)]) pc_query = await db_get('pitchingcards', params=[('cardset_id', cardset_id)]) batting_cards_df = pd.DataFrame(bc_query['cards']) if bc_query['count'] > 0 else pd.DataFrame() pitching_cards_df = pd.DataFrame(pc_query['cards']) if pc_query['count'] > 0 else pd.DataFrame() logger.info(f'Found {len(batting_cards_df)} batting cards') logger.info(f'Found {len(pitching_cards_df)} pitching cards\n') # Get thresholds for the season pitcher_thresholds = get_pitcher_thresholds(season) batter_thresholds = get_batter_thresholds(season) # Analyze overall rarity distribution analyze_overall_rarity(players_df) # Analyze batting cards if len(batting_cards_df) > 0: analyze_batting_cards(batting_cards_df, players_df, batter_thresholds) # Analyze pitching cards if len(pitching_cards_df) > 0: analyze_pitching_cards(pitching_cards_df, players_df, pitcher_thresholds) def analyze_overall_rarity(players_df: pd.DataFrame): """Analyze overall rarity distribution.""" logger.info('=' * 60) logger.info('OVERALL RARITY DISTRIBUTION') logger.info('=' * 60) rarity_counts = players_df['rarity_id'].value_counts().sort_index() rarity_names = { 1: 'Diamond', 2: 'Gold', 3: 'Silver', 4: 'Bronze', 5: 'Common', 99: 'Hall of Fame' } total = len(players_df) for rarity_id, count in rarity_counts.items(): rarity_name = rarity_names.get(rarity_id, f'Unknown ({rarity_id})') pct = (count / total) * 100 logger.info(f'{rarity_name:15} ({rarity_id:2}): {count:5} cards ({pct:5.1f}%)') logger.info('-' * 60) logger.info(f'Total: {total} cards\n') def analyze_batting_cards(batting_cards_df: pd.DataFrame, players_df: pd.DataFrame, thresholds): """Analyze batting card rarities and costs.""" logger.info('=' * 60) logger.info('BATTING CARD ANALYSIS') logger.info('=' * 60) # Extract player ID from player reference (may be URL or dict) sample_player = batting_cards_df['player'].iloc[0] if isinstance(sample_player, dict): # The dict has 'player_id' not 'id' batting_cards_df['player_ref'] = batting_cards_df['player'].apply( lambda x: int(x.get('player_id')) if isinstance(x, dict) and x.get('player_id') else None ) elif isinstance(sample_player, str): # Extract ID from URL like "/api/v2/players/123" batting_cards_df['player_ref'] = batting_cards_df['player'].str.extract(r'/(\d+)$')[0].astype(int) elif isinstance(sample_player, int): batting_cards_df['player_ref'] = batting_cards_df['player'] else: logger.error(f'Unknown player reference type: {type(sample_player)}') batting_cards_df['player_ref'] = batting_cards_df['player'] # Merge with player data to get rarity batting_with_player = batting_cards_df.merge( players_df[['player_id', 'p_name', 'rarity_id']], left_on='player_ref', right_on='player_id', how='left' ) # Count rarities rarity_counts = batting_with_player['rarity_id'].value_counts().sort_index() rarity_names = { 1: 'Diamond', 2: 'Gold', 3: 'Silver', 4: 'Bronze', 5: 'Common', 99: 'Hall of Fame' } logger.info('\nBatting Card Rarity Distribution:') total = len(batting_with_player) for rarity_id, count in rarity_counts.items(): rarity_name = rarity_names.get(rarity_id, f'Unknown ({rarity_id})') pct = (count / total) * 100 logger.info(f'{rarity_name:15} ({rarity_id:2}): {count:5} cards ({pct:5.1f}%)') # Check for cost anomalies logger.info('\n' + '-' * 60) logger.info('BATTING CARD COST ANALYSIS') logger.info('-' * 60) expected_costs = { 5: 20, # Common 4: 40, # Bronze 3: 80, # Silver 2: 160, # Gold 1: 320, # Diamond 99: 640 # Hall of Fame } # Note: cost is stored on the player record, not the card record # We'll check player costs instead player_costs = players_df[players_df['player_id'].isin(batting_with_player['player_ref'])].copy() player_costs_with_rarity = player_costs.merge( batting_with_player[['player_ref', 'rarity_id']].drop_duplicates(), left_on='player_id', right_on='player_ref', how='left', suffixes=('', '_card') ) cost_issues = [] for _, player in player_costs_with_rarity.iterrows(): rarity = player.get('rarity_id_card') cost = player.get('cost') expected = expected_costs.get(rarity) if cost != expected: cost_issues.append({ 'player': player.get('p_name'), 'player_id': player.get('player_id'), 'rarity': rarity, 'actual_cost': cost, 'expected_cost': expected }) if cost_issues: logger.warning(f'Found {len(cost_issues)} cost anomalies:') for issue in cost_issues[:20]: # Show first 20 logger.warning(f" {issue['player']} (Player ID: {issue['player_id']}): " f"Rarity {issue['rarity']} has cost {issue['actual_cost']}, " f"expected {issue['expected_cost']}") if len(cost_issues) > 20: logger.warning(f' ... and {len(cost_issues) - 20} more') else: logger.info('✓ No cost anomalies found') # Check for OPS-rarity alignment logger.info('\n' + '-' * 60) logger.info('BATTING OPS-RARITY ALIGNMENT') logger.info('-' * 60) ops_mismatches = [] for _, card in batting_with_player.iterrows(): ops = card.get('total_OPS') rarity = card.get('rarity_id') if pd.isna(ops) or ops is None: continue expected_rarity = thresholds.get_rarity(ops) if expected_rarity != rarity: ops_mismatches.append({ 'player': card.get('p_name'), 'card_id': card.get('id'), 'ops': ops, 'actual_rarity': rarity, 'expected_rarity': expected_rarity }) if ops_mismatches: logger.warning(f'Found {len(ops_mismatches)} OPS-rarity mismatches:') for issue in ops_mismatches[:20]: logger.warning(f" {issue['player']} (Card ID: {issue['card_id']}): " f"OPS {issue['ops']:.3f} assigned rarity {issue['actual_rarity']}, " f"expected {issue['expected_rarity']}") if len(ops_mismatches) > 20: logger.warning(f' ... and {len(ops_mismatches) - 20} more') else: logger.info('✓ All OPS values align with rarity assignments') logger.info('') def analyze_pitching_cards(pitching_cards_df: pd.DataFrame, players_df: pd.DataFrame, thresholds): """Analyze pitching card rarities and costs.""" logger.info('=' * 60) logger.info('PITCHING CARD ANALYSIS') logger.info('=' * 60) # Extract player ID from player reference (may be URL or dict) sample_player = pitching_cards_df['player'].iloc[0] if isinstance(sample_player, dict): # The dict has 'player_id' not 'id' pitching_cards_df['player_ref'] = pitching_cards_df['player'].apply( lambda x: int(x.get('player_id')) if isinstance(x, dict) and x.get('player_id') else None ) elif isinstance(sample_player, str): # Extract ID from URL like "/api/v2/players/123" pitching_cards_df['player_ref'] = pitching_cards_df['player'].str.extract(r'/(\d+)$')[0].astype(int) else: pitching_cards_df['player_ref'] = pitching_cards_df['player'] # Merge with player data to get rarity pitching_with_player = pitching_cards_df.merge( players_df[['player_id', 'p_name', 'rarity_id']], left_on='player_ref', right_on='player_id', how='left' ) # Count rarities rarity_counts = pitching_with_player['rarity_id'].value_counts().sort_index() rarity_names = { 1: 'Diamond', 2: 'Gold', 3: 'Silver', 4: 'Bronze', 5: 'Common', 99: 'Hall of Fame' } logger.info('\nPitching Card Rarity Distribution:') total = len(pitching_with_player) for rarity_id, count in rarity_counts.items(): rarity_name = rarity_names.get(rarity_id, f'Unknown ({rarity_id})') pct = (count / total) * 100 logger.info(f'{rarity_name:15} ({rarity_id:2}): {count:5} cards ({pct:5.1f}%)') # Separate starters and relievers pitching_with_player['is_starter'] = pitching_with_player['starter_rating'].fillna(0) >= 4 starters = pitching_with_player[pitching_with_player['is_starter']] relievers = pitching_with_player[~pitching_with_player['is_starter']] logger.info(f'\nStarters: {len(starters)}, Relievers: {len(relievers)}') # Check for cost anomalies logger.info('\n' + '-' * 60) logger.info('PITCHING CARD COST ANALYSIS') logger.info('-' * 60) expected_costs = { 5: 20, # Common 4: 40, # Bronze 3: 80, # Silver 2: 160, # Gold 1: 320, # Diamond 99: 640 # Hall of Fame } # Note: cost is stored on the player record, not the card record # We'll check player costs instead player_costs = players_df[players_df['player_id'].isin(pitching_with_player['player_ref'])].copy() player_costs_with_rarity = player_costs.merge( pitching_with_player[['player_ref', 'rarity_id']].drop_duplicates(), left_on='player_id', right_on='player_ref', how='left', suffixes=('', '_card') ) cost_issues = [] for _, player in player_costs_with_rarity.iterrows(): rarity = player.get('rarity_id_card') cost = player.get('cost') expected = expected_costs.get(rarity) if cost != expected: cost_issues.append({ 'player': player.get('p_name'), 'player_id': player.get('player_id'), 'rarity': rarity, 'actual_cost': cost, 'expected_cost': expected }) if cost_issues: logger.warning(f'Found {len(cost_issues)} cost anomalies:') for issue in cost_issues[:20]: logger.warning(f" {issue['player']} (Player ID: {issue['player_id']}): " f"Rarity {issue['rarity']} has cost {issue['actual_cost']}, " f"expected {issue['expected_cost']}") if len(cost_issues) > 20: logger.warning(f' ... and {len(cost_issues) - 20} more') else: logger.info('✓ No cost anomalies found') # Check for OPS-rarity alignment logger.info('\n' + '-' * 60) logger.info('PITCHING OPS-RARITY ALIGNMENT') logger.info('-' * 60) ops_mismatches = [] for _, card in pitching_with_player.iterrows(): ops = card.get('total_OPS') rarity = card.get('rarity_id') is_starter = card.get('is_starter', False) if pd.isna(ops) or ops is None: continue if is_starter: expected_rarity = thresholds.get_rarity_for_starter(ops) else: expected_rarity = thresholds.get_rarity_for_reliever(ops) if expected_rarity != rarity: ops_mismatches.append({ 'player': card.get('p_name'), 'card_id': card.get('id'), 'ops': ops, 'is_starter': is_starter, 'actual_rarity': rarity, 'expected_rarity': expected_rarity }) if ops_mismatches: logger.warning(f'Found {len(ops_mismatches)} OPS-rarity mismatches:') for issue in ops_mismatches[:20]: role = 'SP' if issue['is_starter'] else 'RP' logger.warning(f" {issue['player']} ({role}, Card ID: {issue['card_id']}): " f"OPS {issue['ops']:.3f} assigned rarity {issue['actual_rarity']}, " f"expected {issue['expected_rarity']}") if len(ops_mismatches) > 20: logger.warning(f' ... and {len(ops_mismatches) - 20} more') else: logger.info('✓ All OPS values align with rarity assignments') logger.info('') async def main(): """Main execution function.""" import sys # Parse command-line arguments if len(sys.argv) >= 2: CARDSET_ID = int(sys.argv[1]) else: # Default value if no arguments provided CARDSET_ID = 27 logger.info(f'Using default cardset_id: {CARDSET_ID}') logger.info('Usage: python check_cardset_rarity.py \n') await analyze_cardset(CARDSET_ID) logger.info('=' * 60) logger.info('Analysis complete') logger.info('=' * 60) if __name__ == '__main__': asyncio.run(main())