#!/usr/bin/env python3 """ Process manual matching decisions from reviewed CSV files Generate two output CSV files: 1. New SbaPlayers to insert 2. All player_id -> sbaplayer_id assignments """ import csv import json import logging from dataclasses import dataclass from typing import Dict, List, Set, Optional from collections import defaultdict logging.basicConfig(level=logging.INFO) logger = logging.getLogger('ProcessDecisions') @dataclass class PlayerRecord: id: int name: str season: int bbref_id: Optional[str] = None sbaplayer_id: Optional[int] = None @dataclass class SbaPlayerRecord: id: int first_name: str last_name: str key_bbref: Optional[str] = None key_fangraphs: Optional[int] = None key_mlbam: Optional[int] = None key_retro: Optional[str] = None @dataclass class NewSbaPlayer: first_name: str last_name: str key_bbref: Optional[str] = None temp_id: int = 0 # Temporary ID for referencing before insertion def load_cached_data(): """Load cached player and sbaplayer data""" logger.info("Loading cached data...") # Load SbaPlayers with open('/tmp/sbaplayers.json', 'r') as f: sbaplayer_data = json.load(f) sbaplayers = [] for data in sbaplayer_data: sbaplayers.append(SbaPlayerRecord(**data)) # Load all players all_players = [] for season in range(1, 13): with open(f"/tmp/players_season_{season}.json", 'r') as f: season_data = json.load(f) for data in season_data: all_players.append(PlayerRecord(**data)) logger.info(f"Loaded {len(sbaplayers)} SbaPlayers and {len(all_players)} player records") return all_players, sbaplayers def parse_unmatched_decisions(): """Parse decisions from the unmatched players CSV""" logger.info("Parsing unmatched player decisions...") decisions = [] csv_file = '/mnt/NV2/Development/major-domo/database/unmatched_players_for_review_updated.csv' with open(csv_file, 'r', encoding='utf-8') as f: reader = csv.DictReader(f) # Debug: print column names logger.info(f"CSV columns: {reader.fieldnames}") for row in reader: resolution = row.get('resolution', '').strip() if not resolution or resolution == 'SKIP': continue # Handle encoded column names player_id_key = next((k for k in row.keys() if 'player' in k.lower() and 'id' in k.lower()), 'player_id') bbref_id_key = next((k for k in row.keys() if 'bbref' in k.lower() and 'id' in k.lower()), 'bbref_id') seasons_key = next((k for k in row.keys() if 'seasons' in k.lower() and 'appeared' in k.lower()), 'seasons_appeared') suggested_id_key = next((k for k in row.keys() if 'suggested' in k.lower() and 'sbaplayer' in k.lower() and 'id' in k.lower()), 'suggested_sbaplayer_id') match_type_key = next((k for k in row.keys() if 'match' in k.lower() and 'type' in k.lower()), 'match_type') decision = { 'player_id': int(row[player_id_key]), 'player_name': row['name'], 'bbref_id': row[bbref_id_key] if row[bbref_id_key] else None, 'seasons_appeared': row[seasons_key], 'resolution': resolution, 'suggested_sbaplayer_id': row[suggested_id_key], 'match_type': row[match_type_key] } decisions.append(decision) logger.info(f"Found {len(decisions)} unmatched player decisions") return decisions def parse_high_risk_decisions(): """Parse decisions from the high risk matches CSV""" logger.info("Parsing high-risk match decisions...") decisions = [] csv_file = '/mnt/NV2/Development/major-domo/database/high_risk_player_matches_updated.csv' with open(csv_file, 'r', encoding='utf-8') as f: reader = csv.DictReader(f) # Debug: print column names logger.info(f"High-risk CSV columns: {reader.fieldnames}") for row in reader: resolution = row.get('resolution', '').strip() if not resolution or resolution == 'SKIP': continue # Handle encoded column names risk_type_key = next((k for k in row.keys() if 'risk' in k.lower() and 'type' in k.lower()), 'risk_type') player_id_key = next((k for k in row.keys() if 'player' in k.lower() and 'id' in k.lower()), 'player_id') player_name_key = next((k for k in row.keys() if 'player' in k.lower() and 'name' in k.lower()), 'player_name') sba1_id_key = next((k for k in row.keys() if 'sba1' in k.lower() and 'id' in k.lower()), 'sba1_id') sba1_name_key = next((k for k in row.keys() if 'sba1' in k.lower() and 'name' in k.lower()), 'sba1_name') sba2_id_key = next((k for k in row.keys() if 'sba2' in k.lower() and 'id' in k.lower()), 'sba2_id') sba2_name_key = next((k for k in row.keys() if 'sba2' in k.lower() and 'name' in k.lower()), 'sba2_name') decision = { 'risk_type': row[risk_type_key], 'resolution': resolution, 'player_id': int(row[player_id_key]) if row[player_id_key] else None, 'player_name': row[player_name_key], 'sba1_id': int(row[sba1_id_key]) if row[sba1_id_key] else None, 'sba1_name': row[sba1_name_key], 'sba2_id': int(row[sba2_id_key]) if row[sba2_id_key] else None, 'sba2_name': row[sba2_name_key] } decisions.append(decision) logger.info(f"Found {len(decisions)} high-risk match decisions") return decisions def process_decisions(all_players, sbaplayers, unmatched_decisions, high_risk_decisions): """Process all decisions and generate new SbaPlayers and assignments""" # Create lookup maps sbaplayers_by_id = {sba.id: sba for sba in sbaplayers} players_by_id = {p.id: p for p in all_players} # Track what we're creating and assigning new_sbaplayers = [] player_assignments = [] # (player_id, sbaplayer_id) pairs sbaplayer_merges = [] # (from_id, to_id) pairs temp_id_counter = 90000 # Start temp IDs at 90000 to avoid conflicts logger.info("Processing unmatched player decisions...") # Process unmatched player decisions for decision in unmatched_decisions: resolution = decision['resolution'] player_id = decision['player_id'] player_name = decision['player_name'] bbref_id = decision['bbref_id'] if resolution == 'ACCEPT': # Use suggested SbaPlayer ID suggested_id = decision['suggested_sbaplayer_id'] if suggested_id and not suggested_id.startswith('PARTIAL:'): sbaplayer_id = int(suggested_id) # Find all players with same identity and assign them if bbref_id: # Group by bbref_id matching_players = [p for p in all_players if p.bbref_id == bbref_id] else: # Group by name (for players without bbref_id) matching_players = [p for p in all_players if p.name == player_name and not p.bbref_id] for player in matching_players: player_assignments.append((player.id, sbaplayer_id)) elif resolution.startswith('USE_SBA_'): # Use specific SbaPlayer ID sbaplayer_id = int(resolution.replace('USE_SBA_', '')) # Find all players with same identity if bbref_id: matching_players = [p for p in all_players if p.bbref_id == bbref_id] else: matching_players = [p for p in all_players if p.name == player_name and not p.bbref_id] for player in matching_players: player_assignments.append((player.id, sbaplayer_id)) elif resolution == 'CREATE_NEW': # Create new SbaPlayer name_parts = player_name.strip().split() if len(name_parts) >= 2: first_name = name_parts[0] last_name = ' '.join(name_parts[1:]) # Handle multiple part last names else: first_name = player_name last_name = "" new_sba = NewSbaPlayer( first_name=first_name, last_name=last_name, key_bbref=bbref_id, temp_id=temp_id_counter ) new_sbaplayers.append(new_sba) # Find all players with same identity and assign to new SbaPlayer if bbref_id: matching_players = [p for p in all_players if p.bbref_id == bbref_id] else: matching_players = [p for p in all_players if p.name == player_name and not p.bbref_id] for player in matching_players: player_assignments.append((player.id, temp_id_counter)) temp_id_counter += 1 logger.info("Processing high-risk match decisions...") # Process high-risk decisions for decision in high_risk_decisions: resolution = decision['resolution'] risk_type = decision['risk_type'] if resolution.startswith('MERGE_') and '_INTO_' in resolution: # Parse merge instruction: MERGE_123_INTO_456 parts = resolution.replace('MERGE_', '').split('_INTO_') from_id = int(parts[0]) to_id = int(parts[1]) sbaplayer_merges.append((from_id, to_id)) elif resolution.startswith('USE_SBA_') and decision['player_id']: # Player ambiguous match resolved sbaplayer_id = int(resolution.replace('USE_SBA_', '')) player_id = decision['player_id'] player_name = decision['player_name'] # Find all players with same name (no bbref_id since these are ambiguous matches) matching_players = [p for p in all_players if p.name == player_name and not p.bbref_id] for player in matching_players: player_assignments.append((player.id, sbaplayer_id)) logger.info(f"Generated {len(new_sbaplayers)} new SbaPlayers") logger.info(f"Generated {len(player_assignments)} player assignments") logger.info(f"Generated {len(sbaplayer_merges)} SbaPlayer merges") return new_sbaplayers, player_assignments, sbaplayer_merges def generate_new_sbaplayers_csv(new_sbaplayers): """Generate CSV file with new SbaPlayers to insert""" output_file = '/mnt/NV2/Development/major-domo/database/new_sbaplayers_to_insert.csv' with open(output_file, 'w', newline='') as csvfile: writer = csv.writer(csvfile) writer.writerow(['temp_id', 'first_name', 'last_name', 'key_bbref', 'key_fangraphs', 'key_mlbam', 'key_retro']) for sba in new_sbaplayers: writer.writerow([ sba.temp_id, sba.first_name, sba.last_name, sba.key_bbref or '', '', # key_fangraphs (empty) '', # key_mlbam (empty) '' # key_retro (empty) ]) logger.info(f"Generated new SbaPlayers CSV: {output_file}") return output_file def generate_player_assignments_csv(player_assignments, sbaplayer_merges, all_players, sbaplayers_by_id): """Generate CSV file with all player assignments""" output_file = '/mnt/NV2/Development/major-domo/database/player_sbaplayer_assignments.csv' # Create merge mapping for resolving final IDs merge_mapping = {} for from_id, to_id in sbaplayer_merges: merge_mapping[from_id] = to_id players_by_id = {p.id: p for p in all_players} with open(output_file, 'w', newline='') as csvfile: writer = csv.writer(csvfile) writer.writerow([ 'player_id', 'player_name', 'player_season', 'player_bbref_id', 'sbaplayer_id', 'sbaplayer_name', 'sbaplayer_bbref', 'assignment_type' ]) for player_id, sbaplayer_id in player_assignments: player = players_by_id[player_id] # Resolve final sbaplayer_id (in case of merges) final_sbaplayer_id = merge_mapping.get(sbaplayer_id, sbaplayer_id) # Get SbaPlayer info if final_sbaplayer_id >= 90000: # New SbaPlayer (temp ID) sbaplayer_name = f"NEW_TEMP_{final_sbaplayer_id}" sbaplayer_bbref = player.bbref_id or '' assignment_type = 'new_sbaplayer' else: # Existing SbaPlayer if final_sbaplayer_id in sbaplayers_by_id: sba = sbaplayers_by_id[final_sbaplayer_id] sbaplayer_name = f"{sba.first_name} {sba.last_name}" sbaplayer_bbref = sba.key_bbref or '' assignment_type = 'existing_sbaplayer' else: sbaplayer_name = f"UNKNOWN_SBA_{final_sbaplayer_id}" sbaplayer_bbref = '' assignment_type = 'error' writer.writerow([ player.id, player.name, player.season, player.bbref_id or '', final_sbaplayer_id, sbaplayer_name, sbaplayer_bbref, assignment_type ]) logger.info(f"Generated player assignments CSV: {output_file}") return output_file def generate_summary_report(new_sbaplayers, player_assignments, sbaplayer_merges): """Generate summary report""" summary_file = '/mnt/NV2/Development/major-domo/database/processing_summary.txt' with open(summary_file, 'w') as f: f.write("MATCHING DECISIONS PROCESSING SUMMARY\n") f.write("=" * 50 + "\n\n") f.write(f"New SbaPlayers to create: {len(new_sbaplayers)}\n") f.write(f"Player assignments to make: {len(player_assignments)}\n") f.write(f"SbaPlayer merges to perform: {len(sbaplayer_merges)}\n\n") f.write("FILES GENERATED:\n") f.write("1. new_sbaplayers_to_insert.csv - New SbaPlayer records to create\n") f.write("2. player_sbaplayer_assignments.csv - All player -> sbaplayer assignments\n\n") f.write("SbaPlayer Merges:\n") for from_id, to_id in sbaplayer_merges: f.write(f" - Merge SbaPlayer {from_id} into {to_id}\n") f.write(f"\nNEXT STEPS:\n") f.write("1. Review the generated CSV files\n") f.write("2. Execute SbaPlayer merges first (if any)\n") f.write("3. Insert new SbaPlayers and get their real IDs\n") f.write("4. Update all player records with sbaplayer_id assignments\n") logger.info(f"Generated summary report: {summary_file}") def main(): """Main processing function""" logger.info("Starting processing of matching decisions...") # Load data all_players, sbaplayers = load_cached_data() sbaplayers_by_id = {sba.id: sba for sba in sbaplayers} # Parse decisions unmatched_decisions = parse_unmatched_decisions() high_risk_decisions = parse_high_risk_decisions() # Process decisions new_sbaplayers, player_assignments, sbaplayer_merges = process_decisions( all_players, sbaplayers, unmatched_decisions, high_risk_decisions ) # Generate output files new_sba_file = generate_new_sbaplayers_csv(new_sbaplayers) assignments_file = generate_player_assignments_csv( player_assignments, sbaplayer_merges, all_players, sbaplayers_by_id ) generate_summary_report(new_sbaplayers, player_assignments, sbaplayer_merges) logger.info("\n=== PROCESSING COMPLETE ===") logger.info(f"New SbaPlayers CSV: {new_sba_file}") logger.info(f"Player assignments CSV: {assignments_file}") logger.info(f"Summary: processing_summary.txt") if __name__ == "__main__": main()