#!/usr/bin/env python3
"""
Process manual review decisions and generate final assignment files.

Reads cached player/SbaPlayer JSON from /tmp/, parses the reviewer's
decisions from ``new_sbaplayers_for_review.csv``, and emits two CSVs:
``player_sbaplayer_assignments.csv`` and ``new_sbaplayers_to_insert.csv``.
"""
import json
import csv
import logging
from pathlib import Path
from typing import Dict, List, Set
from dataclasses import dataclass

# Set up logging (console + matching.log file)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('matching.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(f'{__name__}.process_manual_decisions')


@dataclass
class ManualDecision:
    """One row of the manual review CSV (all fields kept as raw strings)."""
    group_key: str
    player_name: str
    bbref_id: str
    seasons_appeared: str
    sample_player_ids: str
    potential_existing_sbaplayer_id: str
    potential_existing_sbaplayer_name: str
    potential_match_reason: str
    your_decision_sbaplayer_id: str
    your_decision_notes: str


def load_cached_data():
    """Load cached player and sbaplayer data.

    Returns:
        tuple: (all_players, sbaplayers) — list of per-season player dicts
        (seasons 1-12; missing season files are skipped with a warning) and
        the list of SbaPlayer dicts.
    """
    logger.info("Loading cached data from /tmp/...")

    # Load SbaPlayers
    with open('/tmp/sbaplayers.json', 'r') as f:
        sbaplayers = json.load(f)
    logger.info(f"Loaded {len(sbaplayers)} SbaPlayer records")

    # Load all players, season by season; a missing season is non-fatal.
    all_players = []
    for season in range(1, 13):
        try:
            with open(f'/tmp/players_season_{season}.json', 'r') as f:
                season_players = json.load(f)
            all_players.extend(season_players)
        except FileNotFoundError:
            logger.warning(f"Season {season} data not found")

    logger.info(f"Loaded {len(all_players)} total player records")
    return all_players, sbaplayers


def load_previous_matches():
    """Load the automatically matched players from comprehensive analysis.

    Returns:
        set: Previously matched player IDs. NOTE: currently always empty —
        the report is opened but not parsed yet (placeholder; the matched
        IDs should be persisted separately or the matching re-run).
    """
    matched_players = set()
    try:
        with open('matching_report.txt', 'r') as f:
            content = f.read()
        # Extract matched player IDs from the report
        # This is a simplified approach - in practice, we'd want to re-run
        # the matching logic or save the matched IDs separately
        logger.info("Loading previously matched players from existing analysis...")
    except FileNotFoundError:
        logger.warning("matching_report.txt not found - will need to re-run comprehensive matching")
    return matched_players


def parse_manual_decisions():
    """Parse manual decisions from the review CSV.

    The file contains UTF-7-style escape artifacts (``+AF8-`` etc.) which
    are decoded back to ``_``, ``"`` and ``-`` before CSV parsing.

    Returns:
        list[ManualDecision]: One entry per non-empty CSV row.
    """
    decisions = []
    with open('new_sbaplayers_for_review.csv', 'r', encoding='utf-8') as f:
        # Handle the encoded format
        content = f.read()

    # Decode common UTF-7-style entities left over from export
    content = content.replace('+AF8-', '_')
    content = content.replace('+ACI-', '"')
    content = content.replace('+AC0-', '-')

    # Parse as CSV
    lines = content.strip().split('\n')
    reader = csv.DictReader(lines)
    for row in reader:
        if not row.get('group_key'):  # Skip empty rows
            continue
        decisions.append(ManualDecision(
            group_key=row['group_key'],
            player_name=row['player_name'],
            bbref_id=row['bbref_id'],
            seasons_appeared=row['seasons_appeared'],
            sample_player_ids=row['sample_player_ids'],
            potential_existing_sbaplayer_id=row['potential_existing_sbaplayer_id'],
            potential_existing_sbaplayer_name=row['potential_existing_sbaplayer_name'],
            potential_match_reason=row['potential_match_reason'],
            your_decision_sbaplayer_id=row['your_decision_sbaplayer_id'],
            your_decision_notes=row['your_decision_notes'],
        ))

    logger.info(f"Parsed {len(decisions)} manual decisions")
    return decisions


def determine_final_assignments(all_players, sbaplayers, decisions):
    """Determine final sbaplayer_id assignments for all players.

    Args:
        all_players: All cached player records (unused here; kept for
            signature compatibility with callers).
        sbaplayers: Existing SbaPlayer records (used only for the lookup map).
        decisions: Parsed ManualDecision rows.

    Returns:
        tuple: (existing_matches: name -> sbaplayer_id,
                new_players_needed: set of names needing new records,
                three_way_matches: original name -> canonical name)
    """
    # Create maps for quick lookup
    sbaplayer_map = {sp['id']: sp for sp in sbaplayers}

    existing_matches = {}       # player_name -> sbaplayer_id
    new_players_needed = set()  # player names that need new SbaPlayer records
    three_way_matches = {}      # original name -> canonical (consolidated) name

    for decision in decisions:
        if decision.your_decision_sbaplayer_id:
            # User chose an existing SbaPlayer.
            # NOTE: a non-numeric ID is logged and the row is DROPPED
            # (it ends up in neither bucket) — matches original behavior.
            try:
                sbaplayer_id = int(decision.your_decision_sbaplayer_id)
                existing_matches[decision.player_name] = sbaplayer_id
                logger.info(f"Manual match: {decision.player_name} -> SbaPlayer ID {sbaplayer_id}")
            except ValueError:
                logger.warning(f"Invalid SbaPlayer ID: {decision.your_decision_sbaplayer_id}")
        else:
            # User decided this needs a new SbaPlayer record
            new_players_needed.add(decision.player_name)

            # Check for three-way matches in notes
            if "Three-way match" in decision.your_decision_notes:
                # Extract canonical name from notes
                if "use name Tom Eshelman" in decision.your_decision_notes:
                    three_way_matches[decision.player_name] = "Tom Eshelman"

            # Check for two-way matches (corrupted data)
            if "Two-way match" in decision.your_decision_notes:
                if "join with bbref_id mejiafr01" in decision.your_decision_notes:
                    # This Francisco Mejia with "HALP" should join with the
                    # legitimate one — find the mejiafr01 record in decisions.
                    for other_decision in decisions:
                        if other_decision.bbref_id == "mejiafr01":
                            three_way_matches[decision.player_name] = other_decision.player_name
                            break

    logger.info(f"Found {len(existing_matches)} existing matches")
    logger.info(f"Found {len(new_players_needed)} players needing new records")
    logger.info(f"Found {len(three_way_matches)} three-way matches")
    return existing_matches, new_players_needed, three_way_matches


def generate_assignment_files(all_players, existing_matches, new_players_needed,
                              three_way_matches, sbaplayers=None):
    """Generate the final assignment files.

    BUGFIX: the original referenced ``sbaplayers`` without it being a
    parameter or module global, which raised NameError at runtime. It is
    now an (optional, backward-compatible) parameter used to compute the
    next free SbaPlayer ID.

    Args:
        all_players: All cached player records.
        existing_matches: name -> existing sbaplayer_id (manual matches).
        new_players_needed: names that need brand-new SbaPlayer records.
        three_way_matches: original name -> canonical name consolidations.
        sbaplayers: Existing SbaPlayer records; new IDs start after max(id).

    Returns:
        tuple: (assignments, new_sbaplayers) — row dicts for the two CSVs.
    """
    # This is a placeholder - we need to integrate with the existing
    # comprehensive_player_matching.py logic to get the full 12,232
    # player assignments.
    logger.info("Generating assignment files...")

    sbaplayers = sbaplayers or []
    assignments = []
    new_sbaplayers = []

    # 1) Manual matches to existing SbaPlayers: assign every player record
    #    sharing the matched name.
    for player_name, sbaplayer_id in existing_matches.items():
        for player in all_players:
            if player['name'] == player_name:
                assignments.append({
                    'player_id': player['id'],
                    'player_name': player['name'],
                    'season': player['season'],
                    'bbref_id': player.get('bbref_id', ''),
                    'assigned_sbaplayer_id': sbaplayer_id,
                    'match_type': 'manual_existing',
                })

    # 2) Brand-new SbaPlayers (not part of a consolidation).
    #    default=0 avoids a crash on an empty SbaPlayer list.
    next_sbaplayer_id = max((sp['id'] for sp in sbaplayers), default=0) + 1

    for player_name in new_players_needed:
        if player_name in three_way_matches:
            continue  # handled below as a consolidation
        # Find a representative player record to get bbref_id and other data
        sample_players = [p for p in all_players if p['name'] == player_name]
        if not sample_players:
            continue
        sample = sample_players[0]
        new_sbaplayers.append({
            'new_id': next_sbaplayer_id,
            'name': player_name,
            'key_bbref': sample.get('bbref_id', ''),
            'key_fangraphs': '',
            'key_mlb': '',
            'notes': f'Created from manual review for {len(sample_players)} player records',
        })
        # Assign all players with this name to the new SbaPlayer
        for player in sample_players:
            assignments.append({
                'player_id': player['id'],
                'player_name': player['name'],
                'season': player['season'],
                'bbref_id': player.get('bbref_id', ''),
                'assigned_sbaplayer_id': next_sbaplayer_id,
                'match_type': 'manual_new',
            })
        next_sbaplayer_id += 1

    # 3) Consolidations: all name variations map to ONE new SbaPlayer.
    #    BUGFIX: dedupe by canonical name — the original created a duplicate
    #    record for every variation pointing at the same canonical name.
    consolidated = set()
    for canonical_name in three_way_matches.values():
        if canonical_name in consolidated:
            continue
        consolidated.add(canonical_name)

        all_variations = [name for name in new_players_needed
                          if three_way_matches.get(name) == canonical_name]
        all_variations.append(canonical_name)
        all_variations = list(set(all_variations))  # Remove duplicates

        sample_players = []
        for variation in all_variations:
            sample_players.extend(p for p in all_players if p['name'] == variation)
        if not sample_players:
            continue

        sample = sample_players[0]
        new_sbaplayers.append({
            'new_id': next_sbaplayer_id,
            'name': canonical_name,
            'key_bbref': sample.get('bbref_id', ''),
            'key_fangraphs': '',
            'key_mlb': '',
            'notes': f'Consolidated from {len(all_variations)} name variations: {", ".join(all_variations)}',
        })
        # Assign all players with these name variations
        for player in sample_players:
            assignments.append({
                'player_id': player['id'],
                'player_name': player['name'],
                'season': player['season'],
                'bbref_id': player.get('bbref_id', ''),
                'assigned_sbaplayer_id': next_sbaplayer_id,
                'match_type': 'manual_consolidated',
            })
        next_sbaplayer_id += 1

    return assignments, new_sbaplayers


def save_assignment_files(assignments, new_sbaplayers):
    """Save the final assignment files (two CSVs in the working directory)."""
    # Save player assignments
    with open('player_sbaplayer_assignments.csv', 'w', newline='') as f:
        if assignments:
            writer = csv.DictWriter(f, fieldnames=assignments[0].keys())
            writer.writeheader()
            writer.writerows(assignments)
    logger.info(f"Saved {len(assignments)} player assignments to player_sbaplayer_assignments.csv")

    # Save new SbaPlayers to insert
    with open('new_sbaplayers_to_insert.csv', 'w', newline='') as f:
        if new_sbaplayers:
            writer = csv.DictWriter(f, fieldnames=new_sbaplayers[0].keys())
            writer.writeheader()
            writer.writerows(new_sbaplayers)
    logger.info(f"Saved {len(new_sbaplayers)} new SbaPlayer records to new_sbaplayers_to_insert.csv")


def main():
    """Main processing function: load, decide, generate, save."""
    logger.info("Starting manual decision processing...")
    try:
        # Load cached data
        all_players, sbaplayers = load_cached_data()

        # Parse manual decisions
        decisions = parse_manual_decisions()

        # Determine final assignments
        existing_matches, new_players_needed, three_way_matches = determine_final_assignments(
            all_players, sbaplayers, decisions
        )

        # Generate assignment files (sbaplayers now passed explicitly — see bugfix)
        assignments, new_sbaplayers = generate_assignment_files(
            all_players, existing_matches, new_players_needed, three_way_matches,
            sbaplayers,
        )

        # Save files
        save_assignment_files(assignments, new_sbaplayers)

        logger.info("Manual decision processing completed successfully!")
        print(f"\nSUMMARY:")
        print(f"- {len(assignments)} player assignments generated")
        print(f"- {len(new_sbaplayers)} new SbaPlayer records to create")
        print(f"- Files saved: player_sbaplayer_assignments.csv, new_sbaplayers_to_insert.csv")
    except Exception as e:
        logger.error(f"Error processing manual decisions: {e}")
        raise


if __name__ == "__main__":
    main()