#!/usr/bin/env python3
"""
Generate CSV file of players who don't have SbaPlayer matches
For manual review and matching
"""
import json
import csv
import logging
from dataclasses import dataclass
from typing import Dict, List, Set, Optional, Tuple
from collections import defaultdict

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('/tmp/unmatched_players.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger('UnmatchedPlayers')

# Seasons whose cache files are read, and where those files live.
SEASONS = range(1, 13)
SEASON_CACHE_TEMPLATE = "/tmp/players_season_{season}.json"


@dataclass
class PlayerRecord:
    # One player-season row as stored in the per-season cache files.
    id: int
    name: str
    season: int
    bbref_id: Optional[str] = None
    sbaplayer_id: Optional[int] = None


@dataclass
class SbaPlayerRecord:
    # One SbaPlayer row as stored in the SbaPlayer cache file.
    id: int
    first_name: str
    last_name: str
    key_bbref: Optional[str] = None
    key_fangraphs: Optional[int] = None
    key_mlbam: Optional[int] = None
    key_retro: Optional[str] = None


def load_cached_data():
    """Load all cached data from previous analysis.

    Returns:
        A ``(all_players, sbaplayers)`` tuple of record lists, or
        ``(None, None)`` when any required cache file is missing.
    """
    # Load SbaPlayers
    logger.info("Loading cached SbaPlayer data...")
    try:
        with open('/tmp/sbaplayers.json', 'r', encoding='utf-8') as f:
            sbaplayer_data = json.load(f)
    except FileNotFoundError:
        # Handled the same way as a missing season file below, so the
        # caller gets one consistent failure signal instead of a traceback.
        logger.error("SbaPlayer cache file not found. Run analyze_player_data.py first.")
        return None, None

    sbaplayers = [SbaPlayerRecord(**data) for data in sbaplayer_data]
    logger.info(f"Loaded {len(sbaplayers)} SbaPlayers")

    # Load all player seasons
    logger.info("Loading cached player data...")
    all_players = []
    for season in SEASONS:
        cache_file = SEASON_CACHE_TEMPLATE.format(season=season)
        try:
            with open(cache_file, 'r', encoding='utf-8') as f:
                season_data = json.load(f)
        except FileNotFoundError:
            logger.error(f"Cache file for season {season} not found. Run analyze_player_data.py first.")
            return None, None
        all_players.extend(PlayerRecord(**data) for data in season_data)

    logger.info(f"Loaded {len(all_players)} player-season records")
    return all_players, sbaplayers


def find_unmatched_players(all_players: List[PlayerRecord], sbaplayers: List[SbaPlayerRecord]):
    """Find all players that don't have matches.

    Players carrying a bbref_id are grouped by that id and reported when no
    SbaPlayer has the same ``key_bbref``. Players without a bbref_id are
    grouped by exact name and are always reported. Each group is represented
    by its record from the most recent season.

    Returns:
        ``(unmatched_with_bbref, unmatched_without_bbref)`` lists of
        representative PlayerRecords.
    """
    # bbref ids that already belong to some SbaPlayer
    known_bbref_ids = {sba.key_bbref for sba in sbaplayers if sba.key_bbref}

    # First pass: group by bbref_id where available
    players_by_bbref = defaultdict(list)
    players_without_bbref = []
    for player in all_players:
        if player.bbref_id:
            players_by_bbref[player.bbref_id].append(player)
        else:
            players_without_bbref.append(player)

    # Unmatched players with bbref_id: most recent season is representative
    unmatched_with_bbref = [
        max(players, key=lambda p: p.season)
        for bbref_id, players in players_by_bbref.items()
        if bbref_id not in known_bbref_ids
    ]
    logger.info(f"Found {len(unmatched_with_bbref)} unique players with bbref_id but no SbaPlayer match")

    # Group players without bbref_id by name
    players_by_name = defaultdict(list)
    for player in players_without_bbref:
        players_by_name[player.name].append(player)

    unmatched_without_bbref = [
        max(players, key=lambda p: p.season)
        for players in players_by_name.values()
    ]
    logger.info(f"Found {len(unmatched_without_bbref)} unique players without bbref_id")

    return unmatched_with_bbref, unmatched_without_bbref


def _load_season_index() -> Tuple[Dict[str, Set[int]], Dict[Optional[str], Set[int]]]:
    """Read every season cache once and index the seasons each player appears in.

    Returns two mappings: bbref_id -> seasons, and (for rows lacking a
    bbref_id) name -> seasons. Unreadable or malformed cache files are
    skipped with a warning so the report stays best-effort.
    """
    seasons_by_bbref: Dict[str, Set[int]] = defaultdict(set)
    seasons_by_name: Dict[Optional[str], Set[int]] = defaultdict(set)

    for season in SEASONS:
        cache_file = SEASON_CACHE_TEMPLATE.format(season=season)
        try:
            with open(cache_file, 'r', encoding='utf-8') as f:
                season_data = json.load(f)
        except (OSError, ValueError) as exc:
            # ValueError covers json.JSONDecodeError and decoding errors.
            logger.warning("Skipping season %s cache %s: %s", season, cache_file, exc)
            continue

        if not isinstance(season_data, list):
            logger.warning("Skipping season %s cache %s: expected a list of records", season, cache_file)
            continue

        for p_data in season_data:
            if not isinstance(p_data, dict):
                continue
            bbref_id = p_data.get('bbref_id')
            if bbref_id:
                seasons_by_bbref[bbref_id].add(season)
            else:
                seasons_by_name[p_data.get('name')].add(season)

    return seasons_by_bbref, seasons_by_name


def _suggest_sbaplayer(player_name: str, sbaplayers: List[SbaPlayerRecord]) -> Tuple[str, str, str]:
    """Suggest an SbaPlayer for *player_name* as ``(id, name, reason)``.

    An exact case-insensitive full-name match wins outright. Otherwise the
    first SbaPlayer whose first and last names both occur inside the player
    name is suggested with a ``PARTIAL:`` id prefix.
    """
    target = player_name.lower()
    suggested_id = ''
    suggested_name = ''
    suggested_reason = 'No bbref_id available'

    for sba in sbaplayers:
        sba_full_name = f"{sba.first_name} {sba.last_name}"
        if sba_full_name.lower() == target:
            return str(sba.id), sba_full_name, 'Exact name match found in SbaPlayers'

        if suggested_id:
            # Only the first partial match is kept; keep scanning for an exact one.
            continue

        first = (sba.first_name or '').lower()
        last = (sba.last_name or '').lower()
        # Require non-empty parts: '' is a substring of every string and
        # would otherwise "match" every player.
        if first and last and first in target and last in target:
            suggested_id = f"PARTIAL:{sba.id}"
            suggested_name = sba_full_name
            suggested_reason = 'Partial name match - REVIEW NEEDED'

    return suggested_id, suggested_name, suggested_reason


def _format_seasons(seasons) -> str:
    """Render a collection of season numbers as a sorted comma-separated string."""
    return ','.join(map(str, sorted(seasons)))


def generate_csv_reports(unmatched_with_bbref: List[PlayerRecord],
                         unmatched_without_bbref: List[PlayerRecord],
                         sbaplayers: List[SbaPlayerRecord]):
    """Generate CSV files for manual review.

    Writes the review CSV and a plain-text summary under /tmp. The
    ``seasons_appeared`` column is derived from the per-season cache files,
    which are read once up front.

    Returns:
        ``(output_file, summary_file)`` paths of the files written.
    """
    seasons_by_bbref, seasons_by_name = _load_season_index()

    # Generate main unmatched players file
    output_file = '/tmp/unmatched_players_for_review.csv'
    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)

        # Header
        writer.writerow([
            'player_id', 'name', 'season', 'bbref_id', 'match_type',
            'seasons_appeared', 'suggested_sbaplayer_id',
            'suggested_sbaplayer_name', 'suggested_match_reason'
        ])

        # Players with bbref_id but no match
        for player in sorted(unmatched_with_bbref, key=lambda p: p.name):
            seasons_str = _format_seasons(seasons_by_bbref.get(player.bbref_id, ()))
            writer.writerow([
                player.id,
                player.name,
                player.season,
                player.bbref_id,
                'has_bbref_no_match',
                seasons_str,
                '',  # suggested_sbaplayer_id (empty for manual fill)
                '',  # suggested_sbaplayer_name (empty for manual fill)
                'No existing SbaPlayer with this bbref_id'
            ])

        # Players without bbref_id
        for player in sorted(unmatched_without_bbref, key=lambda p: p.name):
            seasons_str = _format_seasons(seasons_by_name.get(player.name, ()))

            # Try to suggest a match from existing SbaPlayers by name
            suggested_id, suggested_name, suggested_reason = _suggest_sbaplayer(
                player.name, sbaplayers
            )

            writer.writerow([
                player.id,
                player.name,
                player.season,
                player.bbref_id or '',
                'no_bbref',
                seasons_str,
                suggested_id,
                suggested_name,
                suggested_reason
            ])

    logger.info(f"Generated CSV report: {output_file}")

    # Generate summary statistics
    summary_file = '/tmp/unmatched_players_summary.txt'
    total_unmatched = len(unmatched_with_bbref) + len(unmatched_without_bbref)
    with open(summary_file, 'w', encoding='utf-8') as f:
        f.write("UNMATCHED PLAYERS SUMMARY\n")
        f.write("=" * 50 + "\n\n")
        f.write(f"Players with bbref_id but no SbaPlayer match: {len(unmatched_with_bbref)}\n")
        f.write(f"Players without bbref_id: {len(unmatched_without_bbref)}\n")
        f.write(f"Total unique unmatched players: {total_unmatched}\n\n")

        f.write("NEXT STEPS:\n")
        f.write("1. Review the CSV file: /tmp/unmatched_players_for_review.csv\n")
        f.write("2. For players with suggested matches, verify they are correct\n")
        f.write("3. For players marked 'PARTIAL:', carefully review the suggestion\n")
        f.write("4. Fill in the suggested_sbaplayer_id column for matches you want to use\n")
        f.write("5. Leave suggested_sbaplayer_id empty for players needing new SbaPlayer records\n")

    logger.info(f"Generated summary: {summary_file}")
    return output_file, summary_file


def main():
    """Main execution: load caches, find unmatched players, write reports."""
    logger.info("Starting unmatched players report generation...")

    # Load cached data
    all_players, sbaplayers = load_cached_data()
    if not all_players or not sbaplayers:
        logger.error("Failed to load cached data. Run analyze_player_data.py first.")
        return

    # Find unmatched players
    unmatched_with_bbref, unmatched_without_bbref = find_unmatched_players(all_players, sbaplayers)

    # Generate CSV reports
    csv_file, summary_file = generate_csv_reports(unmatched_with_bbref, unmatched_without_bbref, sbaplayers)

    logger.info("\n=== REPORT COMPLETE ===")
    logger.info(f"CSV file for review: {csv_file}")
    logger.info(f"Summary: {summary_file}")
    logger.info(f"Total unmatched unique players: {len(unmatched_with_bbref) + len(unmatched_without_bbref)}")


if __name__ == "__main__":
    main()