major-domo-database/.claude/sqlite-to-postgres/botched-sbaplayer-matching/analyze_player_data.py
Cal Corum 7130a1fd43 Postgres Migration
Migration documentation and scripts
2025-08-25 07:18:31 -05:00

252 lines
9.3 KiB
Python

#!/usr/bin/env python3
"""
Script to analyze player data for sbaplayer matching strategy
Uses hierarchical matching: bbref_id (Player) -> key_bbref (SbaPlayer) -> name fallback
"""
import requests
import json
import logging
import os
from dataclasses import dataclass, asdict
from typing import Dict, List, Set, Optional
from collections import defaultdict
# Configure logging once at import: mirror every record to a file in /tmp
# and to the console.
_log_handlers = [
    logging.FileHandler('/tmp/player_analysis.log'),
    logging.StreamHandler(),
]
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=_log_handlers,
)
logger = logging.getLogger('PlayerAnalysis')
@dataclass
class PlayerRecord:
    """One player-season row as returned by the Players API."""

    id: int  # Player table primary key
    name: str  # display name from the API
    season: int  # season this record belongs to
    bbref_id: Optional[str] = None  # Baseball-Reference key, when known
    sbaplayer_id: Optional[int] = None  # link to SbaPlayer, if already set
@dataclass
class SbaPlayerRecord:
    """One SbaPlayer row with its external-registry keys."""

    id: int  # SbaPlayer table primary key
    first_name: str
    last_name: str
    key_bbref: Optional[str] = None  # Baseball-Reference key
    key_fangraphs: Optional[int] = None  # FanGraphs numeric id
    key_mlbam: Optional[int] = None  # MLB Advanced Media id
    key_retro: Optional[str] = None  # Retrosheet key
def fetch_players_for_season(season: int, use_cache: bool = True) -> List[PlayerRecord]:
    """Fetch all Player records for *season*, preferring a local JSON cache.

    Args:
        season: Season number to query.
        use_cache: When True, read from (and refresh) the /tmp cache file.

    Returns:
        List of PlayerRecord; empty list when the API request fails.
    """
    cache_file = f"/tmp/players_season_{season}.json"

    # Fast path: reuse cached JSON when present and readable.
    if use_cache and os.path.exists(cache_file):
        try:
            with open(cache_file, 'r') as f:
                cached_data = json.load(f)
            players = [PlayerRecord(**player_data) for player_data in cached_data]
            logger.info("Season %s: %s players loaded from cache", season, len(players))
            return players
        except Exception as e:
            # A corrupt/stale cache is non-fatal; fall through to the API.
            logger.warning("Failed to load cache for season %s: %s", season, e)

    # Slow path: fetch from the API.
    try:
        url = f"https://api.sba.manticorum.com/players?season={season}"
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        data = response.json()
        players = [
            PlayerRecord(
                id=player_data['id'],
                name=player_data['name'],
                season=season,
                # Normalize falsy values (missing key or "") to None.
                bbref_id=player_data.get('bbref_id') or None,
                sbaplayer_id=player_data.get('sbaplayer_id'),
            )
            for player_data in data.get('players', [])
        ]

        # Best-effort cache write; a failure only costs a re-fetch next run.
        try:
            with open(cache_file, 'w') as f:
                json.dump([asdict(p) for p in players], f, indent=2)
            logger.info("Season %s: %s players fetched and cached", season, len(players))
        except Exception as e:
            logger.warning("Failed to cache season %s data: %s", season, e)
        return players
    except Exception as e:
        logger.error("Error fetching season %s: %s", season, e)
        return []
def fetch_sbaplayers(use_cache: bool = True) -> List[SbaPlayerRecord]:
    """Fetch all SbaPlayer records, preferring a local JSON cache.

    Args:
        use_cache: When True, read from (and refresh) the /tmp cache file.

    Returns:
        List of SbaPlayerRecord; empty list when the API request fails.
    """
    cache_file = "/tmp/sbaplayers.json"

    # Fast path: reuse cached JSON when present and readable.
    if use_cache and os.path.exists(cache_file):
        try:
            with open(cache_file, 'r') as f:
                cached_data = json.load(f)
            sbaplayers = [SbaPlayerRecord(**sba_data) for sba_data in cached_data]
            logger.info("SbaPlayers: %s records loaded from cache", len(sbaplayers))
            return sbaplayers
        except Exception as e:
            # A corrupt/stale cache is non-fatal; fall through to the API.
            logger.warning("Failed to load sbaplayers cache: %s", e)

    # Slow path: fetch from the API.
    try:
        url = "https://api.sba.manticorum.com/sbaplayers"
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        data = response.json()
        sbaplayers = [
            SbaPlayerRecord(
                id=sba_data['id'],
                first_name=sba_data['first_name'],
                last_name=sba_data['last_name'],
                # Normalize falsy string keys (missing or "") to None.
                key_bbref=sba_data.get('key_bbref') or None,
                key_fangraphs=sba_data.get('key_fangraphs'),
                key_mlbam=sba_data.get('key_mlbam'),
                key_retro=sba_data.get('key_retro') or None,
            )
            for sba_data in data.get('players', [])
        ]

        # Best-effort cache write; a failure only costs a re-fetch next run.
        try:
            with open(cache_file, 'w') as f:
                json.dump([asdict(s) for s in sbaplayers], f, indent=2)
            logger.info("SbaPlayers: %s records fetched and cached", len(sbaplayers))
        except Exception as e:
            logger.warning("Failed to cache sbaplayers data: %s", e)
        return sbaplayers
    except Exception as e:
        logger.error("Error fetching sbaplayers: %s", e)
        return []
def analyze_matching_potential():
    """Analyze how Player rows can be linked to existing SbaPlayer rows.

    Matching is hierarchical: Player.bbref_id -> SbaPlayer.key_bbref first;
    players without a bbref_id are grouped by name and will need new
    SbaPlayer records.

    Returns:
        Summary dict: counts plus the grouping/lookup maps used.
    """
    logger.info("Fetching SbaPlayer records...")
    sbaplayers = fetch_sbaplayers()

    logger.info("Fetching Player records from all seasons...")
    all_players: List[PlayerRecord] = []
    for season in range(1, 13):  # seasons 1 through 12
        all_players.extend(fetch_players_for_season(season))
    logger.info("Total player-season records: %s", len(all_players))

    # Lookup maps: by Baseball-Reference key and by "First Last" name.
    # NOTE(review): sbaplayer_by_name is built for the documented name
    # fallback but is not consulted below yet.
    sbaplayer_by_bbref = {}
    sbaplayer_by_name = {}
    for sba in sbaplayers:
        if sba.key_bbref:
            sbaplayer_by_bbref[sba.key_bbref] = sba
        sbaplayer_by_name[f"{sba.first_name} {sba.last_name}"] = sba
    logger.info("SbaPlayers with key_bbref: %s", len(sbaplayer_by_bbref))
    logger.info("Total unique SbaPlayers: %s", len(sbaplayers))

    # Guard: with zero player records the percentage math below would
    # raise ZeroDivisionError. Return an empty summary instead.
    if not all_players:
        logger.error("No player records fetched from any season; nothing to analyze")
        return {
            'total_player_records': 0,
            'total_unique_players': 0,
            'matchable_by_bbref': 0,
            'need_new_sbaplayers': 0,
            'unique_bbref_players': {},
            'unique_name_players': {},
            'sbaplayer_lookup': sbaplayer_by_bbref,
        }

    total = len(all_players)
    players_with_bbref = [p for p in all_players if p.bbref_id]
    players_without_bbref = [p for p in all_players if not p.bbref_id]
    logger.info("Player records with bbref_id: %s (%.1f%%)",
                len(players_with_bbref), len(players_with_bbref) / total * 100)
    logger.info("Player records without bbref_id: %s (%.1f%%)",
                len(players_without_bbref), len(players_without_bbref) / total * 100)

    # How many player-season rows already resolve via bbref_id.
    bbref_matches = sum(1 for p in players_with_bbref if p.bbref_id in sbaplayer_by_bbref)
    bbref_no_matches = len(players_with_bbref) - bbref_matches
    logger.info("Player records that can match by bbref_id: %s", bbref_matches)
    logger.info("Player records with bbref_id but no SbaPlayer match: %s", bbref_no_matches)

    # Collapse player-season rows to unique players: by bbref_id when
    # available, otherwise by name.
    unique_bbref_players = defaultdict(list)
    unique_name_players = defaultdict(list)
    for player in all_players:
        if player.bbref_id:
            unique_bbref_players[player.bbref_id].append(player)
        else:
            unique_name_players[player.name].append(player)
    logger.info("Unique players identifiable by bbref_id: %s", len(unique_bbref_players))
    logger.info("Unique players identifiable by name only: %s", len(unique_name_players))

    # Show a handful of examples of each category.
    logger.info("\nExample bbref_id matches:")
    shown = 0
    for bbref_id, players in unique_bbref_players.items():
        if shown >= 5:
            break
        if bbref_id not in sbaplayer_by_bbref:
            continue
        seasons = sorted(p.season for p in players)
        sba = sbaplayer_by_bbref[bbref_id]
        logger.info("  %s: %s -> %s %s (seasons %s)",
                    bbref_id, players[0].name, sba.first_name, sba.last_name, seasons)
        shown += 1

    logger.info("\nExample name-only players (need new SbaPlayer records):")
    shown = 0
    for name, players in unique_name_players.items():
        if shown >= 5:
            seasons_done = True
            break
        seasons = sorted(p.season for p in players)
        logger.info("  %s (seasons %s)", name, seasons)
        shown += 1

    # Summary statistics.
    total_unique_players = len(unique_bbref_players) + len(unique_name_players)
    matchable_by_bbref = sum(1 for bbref in unique_bbref_players if bbref in sbaplayer_by_bbref)
    need_new_sbaplayers = len(unique_bbref_players) - matchable_by_bbref + len(unique_name_players)
    logger.info("\n=== SUMMARY ===")
    logger.info("Total unique players across all seasons: %s", total_unique_players)
    logger.info("Can match to existing SbaPlayers by bbref_id: %s", matchable_by_bbref)
    logger.info("Need new SbaPlayer records: %s", need_new_sbaplayers)
    logger.info("Total player-season records to update: %s", total)

    return {
        'total_player_records': total,
        'total_unique_players': total_unique_players,
        'matchable_by_bbref': matchable_by_bbref,
        'need_new_sbaplayers': need_new_sbaplayers,
        'unique_bbref_players': dict(unique_bbref_players),
        'unique_name_players': dict(unique_name_players),
        'sbaplayer_lookup': sbaplayer_by_bbref,
    }
if __name__ == "__main__":
    # Script entry point: run the analysis end-to-end. The summary dict is
    # already logged inside analyze_matching_potential, so the return value
    # is not kept (the previous unused `results` local is dropped).
    logger.info("Starting player matching analysis...")
    analyze_matching_potential()
    logger.info("Analysis complete!")