#!/usr/bin/env python3 """ Script to analyze player data for sbaplayer matching strategy Uses hierarchical matching: bbref_id (Player) -> key_bbref (SbaPlayer) -> name fallback """ import requests import json import logging import os from dataclasses import dataclass, asdict from typing import Dict, List, Set, Optional from collections import defaultdict # Set up logging logging.basicConfig( level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', handlers=[ logging.FileHandler('/tmp/player_analysis.log'), logging.StreamHandler() ] ) logger = logging.getLogger('PlayerAnalysis') @dataclass class PlayerRecord: id: int name: str season: int bbref_id: Optional[str] = None sbaplayer_id: Optional[int] = None @dataclass class SbaPlayerRecord: id: int first_name: str last_name: str key_bbref: Optional[str] = None key_fangraphs: Optional[int] = None key_mlbam: Optional[int] = None key_retro: Optional[str] = None def fetch_players_for_season(season: int, use_cache: bool = True) -> List[PlayerRecord]: """Fetch all players for a given season, with caching""" cache_file = f"/tmp/players_season_{season}.json" # Try to load from cache first if use_cache and os.path.exists(cache_file): try: with open(cache_file, 'r') as f: cached_data = json.load(f) players = [] for player_data in cached_data: player = PlayerRecord(**player_data) players.append(player) logger.info(f"Season {season}: {len(players)} players loaded from cache") return players except Exception as e: logger.warning(f"Failed to load cache for season {season}: {e}") # Fetch from API try: url = f"https://api.sba.manticorum.com/players?season={season}" response = requests.get(url, timeout=30) response.raise_for_status() data = response.json() players = [] for player_data in data.get('players', []): player = PlayerRecord( id=player_data['id'], name=player_data['name'], season=season, bbref_id=player_data.get('bbref_id') if player_data.get('bbref_id') else None, sbaplayer_id=player_data.get('sbaplayer_id') ) players.append(player) # Save to cache try: with open(cache_file, 'w') as f: json.dump([asdict(p) for p in players], f, indent=2) logger.info(f"Season {season}: {len(players)} players fetched and cached") except Exception as e: logger.warning(f"Failed to cache season {season} data: {e}") return players except Exception as e: logger.error(f"Error fetching season {season}: {e}") return [] def fetch_sbaplayers(use_cache: bool = True) -> List[SbaPlayerRecord]: """Fetch all SBA players, with caching""" cache_file = "/tmp/sbaplayers.json" # Try to load from cache first if use_cache and os.path.exists(cache_file): try: with open(cache_file, 'r') as f: cached_data = json.load(f) sbaplayers = [] for sba_data in cached_data: sbaplayer = SbaPlayerRecord(**sba_data) sbaplayers.append(sbaplayer) logger.info(f"SbaPlayers: {len(sbaplayers)} records loaded from cache") return sbaplayers except Exception as e: logger.warning(f"Failed to load sbaplayers cache: {e}") # Fetch from API try: url = "https://api.sba.manticorum.com/sbaplayers" response = requests.get(url, timeout=30) response.raise_for_status() data = response.json() sbaplayers = [] for sba_data in data.get('players', []): sbaplayer = SbaPlayerRecord( id=sba_data['id'], first_name=sba_data['first_name'], last_name=sba_data['last_name'], key_bbref=sba_data.get('key_bbref') if sba_data.get('key_bbref') else None, key_fangraphs=sba_data.get('key_fangraphs'), key_mlbam=sba_data.get('key_mlbam'), key_retro=sba_data.get('key_retro') if sba_data.get('key_retro') else None ) sbaplayers.append(sbaplayer) # Save to cache try: with open(cache_file, 'w') as f: json.dump([asdict(s) for s in sbaplayers], f, indent=2) logger.info(f"SbaPlayers: {len(sbaplayers)} records fetched and cached") except Exception as e: logger.warning(f"Failed to cache sbaplayers data: {e}") return sbaplayers except Exception as e: logger.error(f"Error fetching sbaplayers: {e}") return [] def analyze_matching_potential(): """Analyze potential matches between players and sbaplayers""" # Fetch all data logger.info("Fetching SbaPlayer records...") sbaplayers = fetch_sbaplayers() logger.info("Fetching Player records from all seasons...") all_players: List[PlayerRecord] = [] for season in range(1, 13): players = fetch_players_for_season(season) all_players.extend(players) logger.info(f"Total player-season records: {len(all_players)}") # Create lookup maps sbaplayer_by_bbref = {} sbaplayer_by_name = {} for sba in sbaplayers: if sba.key_bbref: sbaplayer_by_bbref[sba.key_bbref] = sba full_name = f"{sba.first_name} {sba.last_name}" sbaplayer_by_name[full_name] = sba logger.info(f"SbaPlayers with key_bbref: {len(sbaplayer_by_bbref)}") logger.info(f"Total unique SbaPlayers: {len(sbaplayers)}") # Analysis of Player records players_with_bbref = [p for p in all_players if p.bbref_id] players_without_bbref = [p for p in all_players if not p.bbref_id] logger.info(f"Player records with bbref_id: {len(players_with_bbref)} ({len(players_with_bbref)/len(all_players)*100:.1f}%)") logger.info(f"Player records without bbref_id: {len(players_without_bbref)} ({len(players_without_bbref)/len(all_players)*100:.1f}%)") # Try matching by bbref_id bbref_matches = 0 bbref_no_matches = 0 for player in players_with_bbref: if player.bbref_id in sbaplayer_by_bbref: bbref_matches += 1 else: bbref_no_matches += 1 logger.info(f"Player records that can match by bbref_id: {bbref_matches}") logger.info(f"Player records with bbref_id but no SbaPlayer match: {bbref_no_matches}") # Group remaining players by unique identifiers unique_bbref_players = defaultdict(list) # players grouped by bbref_id unique_name_players = defaultdict(list) # players without bbref_id, grouped by name for player in all_players: if player.bbref_id: unique_bbref_players[player.bbref_id].append(player) else: unique_name_players[player.name].append(player) logger.info(f"Unique players identifiable by bbref_id: {len(unique_bbref_players)}") logger.info(f"Unique players identifiable by name only: {len(unique_name_players)}") # Show some examples logger.info("\nExample bbref_id matches:") count = 0 for bbref_id, players in unique_bbref_players.items(): if bbref_id in sbaplayer_by_bbref and count < 5: seasons = sorted([p.season for p in players]) sba = sbaplayer_by_bbref[bbref_id] logger.info(f" {bbref_id}: {players[0].name} -> {sba.first_name} {sba.last_name} (seasons {seasons})") count += 1 logger.info("\nExample name-only players (need new SbaPlayer records):") count = 0 for name, players in unique_name_players.items(): if count < 5: seasons = sorted([p.season for p in players]) logger.info(f" {name} (seasons {seasons})") count += 1 # Summary statistics total_unique_players = len(unique_bbref_players) + len(unique_name_players) matchable_by_bbref = sum(1 for bbref in unique_bbref_players.keys() if bbref in sbaplayer_by_bbref) need_new_sbaplayers = len(unique_bbref_players) - matchable_by_bbref + len(unique_name_players) logger.info(f"\n=== SUMMARY ===") logger.info(f"Total unique players across all seasons: {total_unique_players}") logger.info(f"Can match to existing SbaPlayers by bbref_id: {matchable_by_bbref}") logger.info(f"Need new SbaPlayer records: {need_new_sbaplayers}") logger.info(f"Total player-season records to update: {len(all_players)}") return { 'total_player_records': len(all_players), 'total_unique_players': total_unique_players, 'matchable_by_bbref': matchable_by_bbref, 'need_new_sbaplayers': need_new_sbaplayers, 'unique_bbref_players': dict(unique_bbref_players), 'unique_name_players': dict(unique_name_players), 'sbaplayer_lookup': sbaplayer_by_bbref } if __name__ == "__main__": logger.info("Starting player matching analysis...") results = analyze_matching_potential() logger.info("Analysis complete!")