major-domo-database/.claude/sqlite-to-postgres/botched-sbaplayer-matching/generate_unmatched_players.py
Cal Corum 7130a1fd43 Postgres Migration
Migration documentation and scripts
2025-08-25 07:18:31 -05:00

267 lines
10 KiB
Python

#!/usr/bin/env python3
"""
Generate CSV file of players who don't have SbaPlayer matches
For manual review and matching
"""
import json
import csv
import logging
from dataclasses import dataclass
from typing import Dict, List, Set, Optional
from collections import defaultdict
# Set up logging
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
handlers=[
logging.FileHandler('/tmp/unmatched_players.log'),
logging.StreamHandler()
]
)
logger = logging.getLogger('UnmatchedPlayers')
@dataclass
class PlayerRecord:
id: int
name: str
season: int
bbref_id: Optional[str] = None
sbaplayer_id: Optional[int] = None
@dataclass
class SbaPlayerRecord:
id: int
first_name: str
last_name: str
key_bbref: Optional[str] = None
key_fangraphs: Optional[int] = None
key_mlbam: Optional[int] = None
key_retro: Optional[str] = None
def load_cached_data():
"""Load all cached data from previous analysis"""
# Load SbaPlayers
logger.info("Loading cached SbaPlayer data...")
with open('/tmp/sbaplayers.json', 'r') as f:
sbaplayer_data = json.load(f)
sbaplayers = []
for data in sbaplayer_data:
sbaplayers.append(SbaPlayerRecord(**data))
logger.info(f"Loaded {len(sbaplayers)} SbaPlayers")
# Load all player seasons
logger.info("Loading cached player data...")
all_players = []
for season in range(1, 13):
cache_file = f"/tmp/players_season_{season}.json"
try:
with open(cache_file, 'r') as f:
season_data = json.load(f)
for data in season_data:
all_players.append(PlayerRecord(**data))
except FileNotFoundError:
logger.error(f"Cache file for season {season} not found. Run analyze_player_data.py first.")
return None, None
logger.info(f"Loaded {len(all_players)} player-season records")
return all_players, sbaplayers
def find_unmatched_players(all_players: List[PlayerRecord], sbaplayers: List[SbaPlayerRecord]):
"""Find all players that don't have matches"""
# Create lookup for SbaPlayers by bbref_id
sbaplayer_by_bbref = {}
for sba in sbaplayers:
if sba.key_bbref:
sbaplayer_by_bbref[sba.key_bbref] = sba
# Group players by unique identifier
unique_players = {} # Will store one representative PlayerRecord for each unique player
# First pass: group by bbref_id where available
players_by_bbref = defaultdict(list)
players_without_bbref = []
for player in all_players:
if player.bbref_id:
players_by_bbref[player.bbref_id].append(player)
else:
players_without_bbref.append(player)
# Find unmatched players with bbref_id
unmatched_with_bbref = []
for bbref_id, players in players_by_bbref.items():
if bbref_id not in sbaplayer_by_bbref:
# This bbref_id has no match in SbaPlayers
# Use the most recent season's player record as representative
representative = max(players, key=lambda p: p.season)
unmatched_with_bbref.append(representative)
logger.info(f"Found {len(unmatched_with_bbref)} unique players with bbref_id but no SbaPlayer match")
# Group players without bbref_id by name
players_by_name = defaultdict(list)
for player in players_without_bbref:
players_by_name[player.name].append(player)
unmatched_without_bbref = []
for name, players in players_by_name.items():
# Use the most recent season's player record as representative
representative = max(players, key=lambda p: p.season)
unmatched_without_bbref.append(representative)
logger.info(f"Found {len(unmatched_without_bbref)} unique players without bbref_id")
return unmatched_with_bbref, unmatched_without_bbref
def generate_csv_reports(unmatched_with_bbref: List[PlayerRecord], unmatched_without_bbref: List[PlayerRecord], sbaplayers: List[SbaPlayerRecord]):
"""Generate CSV files for manual review"""
# Generate main unmatched players file
output_file = '/tmp/unmatched_players_for_review.csv'
with open(output_file, 'w', newline='') as csvfile:
writer = csv.writer(csvfile)
# Header
writer.writerow([
'player_id', 'name', 'season', 'bbref_id', 'match_type', 'seasons_appeared',
'suggested_sbaplayer_id', 'suggested_sbaplayer_name', 'suggested_match_reason'
])
# Players with bbref_id but no match
for player in sorted(unmatched_with_bbref, key=lambda p: p.name):
# Get all seasons this player appeared in
all_seasons = []
cache_file = f"/tmp/players_season_{player.season}.json"
# Find all seasons for this bbref_id across all cached data
seasons_found = []
for season in range(1, 13):
try:
with open(f"/tmp/players_season_{season}.json", 'r') as f:
season_data = json.load(f)
for p_data in season_data:
if p_data.get('bbref_id') == player.bbref_id:
seasons_found.append(season)
break
except:
continue
seasons_str = ','.join(map(str, sorted(seasons_found)))
writer.writerow([
player.id,
player.name,
player.season,
player.bbref_id,
'has_bbref_no_match',
seasons_str,
'', # suggested_sbaplayer_id (empty for manual fill)
'', # suggested_sbaplayer_name (empty for manual fill)
'No existing SbaPlayer with this bbref_id'
])
# Players without bbref_id
for player in sorted(unmatched_without_bbref, key=lambda p: p.name):
# Get all seasons this player appeared in by name
seasons_found = []
for season in range(1, 13):
try:
with open(f"/tmp/players_season_{season}.json", 'r') as f:
season_data = json.load(f)
for p_data in season_data:
if p_data.get('name') == player.name and not p_data.get('bbref_id'):
seasons_found.append(season)
break
except:
continue
seasons_str = ','.join(map(str, sorted(seasons_found)))
# Try to suggest a match from existing SbaPlayers by name
suggested_id = ''
suggested_name = ''
suggested_reason = 'No bbref_id available'
for sba in sbaplayers:
sba_full_name = f"{sba.first_name} {sba.last_name}"
if sba_full_name.lower() == player.name.lower():
suggested_id = str(sba.id)
suggested_name = sba_full_name
suggested_reason = 'Exact name match found in SbaPlayers'
break
elif (sba.first_name.lower() in player.name.lower() and
sba.last_name.lower() in player.name.lower()):
# Partial match - suggest but flag for review
if not suggested_id: # Only suggest first partial match
suggested_id = f"PARTIAL:{sba.id}"
suggested_name = sba_full_name
suggested_reason = 'Partial name match - REVIEW NEEDED'
writer.writerow([
player.id,
player.name,
player.season,
player.bbref_id or '',
'no_bbref',
seasons_str,
suggested_id,
suggested_name,
suggested_reason
])
logger.info(f"Generated CSV report: {output_file}")
# Generate summary statistics
summary_file = '/tmp/unmatched_players_summary.txt'
with open(summary_file, 'w') as f:
f.write("UNMATCHED PLAYERS SUMMARY\n")
f.write("=" * 50 + "\n\n")
f.write(f"Players with bbref_id but no SbaPlayer match: {len(unmatched_with_bbref)}\n")
f.write(f"Players without bbref_id: {len(unmatched_without_bbref)}\n")
f.write(f"Total unique unmatched players: {len(unmatched_with_bbref) + len(unmatched_without_bbref)}\n\n")
f.write("NEXT STEPS:\n")
f.write("1. Review the CSV file: /tmp/unmatched_players_for_review.csv\n")
f.write("2. For players with suggested matches, verify they are correct\n")
f.write("3. For players marked 'PARTIAL:', carefully review the suggestion\n")
f.write("4. Fill in the suggested_sbaplayer_id column for matches you want to use\n")
f.write("5. Leave suggested_sbaplayer_id empty for players needing new SbaPlayer records\n")
logger.info(f"Generated summary: {summary_file}")
return output_file, summary_file
def main():
"""Main execution"""
logger.info("Starting unmatched players report generation...")
# Load cached data
all_players, sbaplayers = load_cached_data()
if not all_players or not sbaplayers:
logger.error("Failed to load cached data. Run analyze_player_data.py first.")
return
# Find unmatched players
unmatched_with_bbref, unmatched_without_bbref = find_unmatched_players(all_players, sbaplayers)
# Generate CSV reports
csv_file, summary_file = generate_csv_reports(unmatched_with_bbref, unmatched_without_bbref, sbaplayers)
logger.info(f"\n=== REPORT COMPLETE ===")
logger.info(f"CSV file for review: {csv_file}")
logger.info(f"Summary: {summary_file}")
logger.info(f"Total unmatched unique players: {len(unmatched_with_bbref) + len(unmatched_without_bbref)}")
if __name__ == "__main__":
main()