#!/usr/bin/env python3
"""
Process manual review decisions and generate final assignment files.
"""
import json
import csv
import logging
from pathlib import Path
from typing import Dict, List, Set
from dataclasses import dataclass
# Set up logging
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
handlers=[
logging.FileHandler('matching.log'),
logging.StreamHandler()
]
)
logger = logging.getLogger(f'{__name__}.process_manual_decisions')
@dataclass
class ManualDecision:
"""Manual decision from review CSV"""
group_key: str
player_name: str
bbref_id: str
seasons_appeared: str
sample_player_ids: str
potential_existing_sbaplayer_id: str
potential_existing_sbaplayer_name: str
potential_match_reason: str
your_decision_sbaplayer_id: str
your_decision_notes: str
def load_cached_data():
"""Load cached player and sbaplayer data"""
logger.info("Loading cached data from /tmp/...")
# Load SbaPlayers
with open('/tmp/sbaplayers.json', 'r') as f:
sbaplayers = json.load(f)
logger.info(f"Loaded {len(sbaplayers)} SbaPlayer records")
# Load all players
all_players = []
for season in range(1, 13):
try:
with open(f'/tmp/players_season_{season}.json', 'r') as f:
season_players = json.load(f)
all_players.extend(season_players)
except FileNotFoundError:
logger.warning(f"Season {season} data not found")
logger.info(f"Loaded {len(all_players)} total player records")
return all_players, sbaplayers
def load_previous_matches():
"""Load the automatically matched players from comprehensive analysis"""
matched_players = set()
# Read the matching report to get previously matched player IDs
try:
with open('matching_report.txt', 'r') as f:
content = f.read()
# Extract matched player IDs from the report
# This is a simplified approach - in practice, we'd want to re-run
# the matching logic or save the matched IDs separately
logger.info("Loading previously matched players from existing analysis...")
except FileNotFoundError:
logger.warning("matching_report.txt not found - will need to re-run comprehensive matching")
return matched_players
def parse_manual_decisions():
"""Parse manual decisions from the review CSV"""
decisions = []
with open('new_sbaplayers_for_review.csv', 'r', encoding='utf-8') as f:
# Handle the encoded format
content = f.read()
        # Undo UTF-7 escape sequences introduced by the CSV export
content = content.replace('+AF8-', '_')
content = content.replace('+ACI-', '"')
content = content.replace('+AC0-', '-')
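        # A more general fix, if the whole file is UTF-7 encoded (an assumption,
        # not verified here), would be to open it with encoding='utf-7' instead
        # of replacing individual escape sequences.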
# Parse as CSV
lines = content.strip().split('\n')
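        # Note: splitting on newlines assumes no quoted field contains an embedded line break.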
reader = csv.DictReader(lines)
for row in reader:
if not row.get('group_key'): # Skip empty rows
continue
decision = ManualDecision(
group_key=row['group_key'],
player_name=row['player_name'],
bbref_id=row['bbref_id'],
seasons_appeared=row['seasons_appeared'],
sample_player_ids=row['sample_player_ids'],
potential_existing_sbaplayer_id=row['potential_existing_sbaplayer_id'],
potential_existing_sbaplayer_name=row['potential_existing_sbaplayer_name'],
potential_match_reason=row['potential_match_reason'],
your_decision_sbaplayer_id=row['your_decision_sbaplayer_id'],
your_decision_notes=row['your_decision_notes']
)
decisions.append(decision)
logger.info(f"Parsed {len(decisions)} manual decisions")
return decisions
def determine_final_assignments(all_players, sbaplayers, decisions):
"""Determine final sbaplayer_id assignments for all players"""
    # Map SbaPlayer id -> record for quick lookup (currently unused below)
    sbaplayer_map = {sp['id']: sp for sp in sbaplayers}
# Group decisions by type
existing_matches = {} # player_name -> sbaplayer_id
new_players_needed = set() # player names that need new SbaPlayer records
three_way_matches = {} # consolidated names
# Process manual decisions
for decision in decisions:
if decision.your_decision_sbaplayer_id:
# User chose an existing SbaPlayer
try:
sbaplayer_id = int(decision.your_decision_sbaplayer_id)
existing_matches[decision.player_name] = sbaplayer_id
logger.info(f"Manual match: {decision.player_name} -> SbaPlayer ID {sbaplayer_id}")
except ValueError:
logger.warning(f"Invalid SbaPlayer ID: {decision.your_decision_sbaplayer_id}")
else:
# User decided this needs a new SbaPlayer record
new_players_needed.add(decision.player_name)
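        # The note parsing below is keyed to the exact wording used in this
        # particular review file (e.g. "Three-way match", "use name Tom Eshelman").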
# Check for three-way matches in notes
if "Three-way match" in decision.your_decision_notes:
# Extract canonical name from notes
if "use name Tom Eshelman" in decision.your_decision_notes:
three_way_matches[decision.player_name] = "Tom Eshelman"
# Check for two-way matches (corrupted data)
if "Two-way match" in decision.your_decision_notes:
if "join with bbref_id mejiafr01" in decision.your_decision_notes:
# This Francisco Mejia with "HALP" should join with the legitimate one
# Find the mejiafr01 record in decisions
for other_decision in decisions:
if other_decision.bbref_id == "mejiafr01":
three_way_matches[decision.player_name] = other_decision.player_name
break
logger.info(f"Found {len(existing_matches)} existing matches")
logger.info(f"Found {len(new_players_needed)} players needing new records")
logger.info(f"Found {len(three_way_matches)} three-way matches")
return existing_matches, new_players_needed, three_way_matches
def generate_assignment_files(all_players, sbaplayers, existing_matches, new_players_needed, three_way_matches):
"""Generate the final assignment files"""
# We need to re-implement the comprehensive matching logic here
# For now, I'll create a simplified version that processes the manual decisions
logger.info("Generating assignment files...")
# This is a placeholder - we need to integrate with the existing comprehensive_player_matching.py
# logic to get the full 12,232 player assignments
assignments = []
new_sbaplayers = []
# Process manual decisions into assignments
for player_name, sbaplayer_id in existing_matches.items():
# Find all player records with this name and assign them
matching_players = [p for p in all_players if p['name'] == player_name]
for player in matching_players:
assignments.append({
'player_id': player['id'],
'player_name': player['name'],
'season': player['season'],
'bbref_id': player.get('bbref_id', ''),
'assigned_sbaplayer_id': sbaplayer_id,
'match_type': 'manual_existing'
})
# Process new players that need SbaPlayer records
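    # New IDs continue from the current maximum SbaPlayer id; this assumes no
    # other SbaPlayer rows are inserted before these records are loaded.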
next_sbaplayer_id = max([sp['id'] for sp in sbaplayers]) + 1
for player_name in new_players_needed:
        if player_name not in three_way_matches and player_name not in three_way_matches.values():  # Regular new player; consolidated names are handled below
# Find a representative player record to get bbref_id and other data
sample_players = [p for p in all_players if p['name'] == player_name]
if sample_players:
sample = sample_players[0]
new_sbaplayers.append({
'new_id': next_sbaplayer_id,
'name': player_name,
'key_bbref': sample.get('bbref_id', ''),
'key_fangraphs': '',
'key_mlb': '',
'notes': f'Created from manual review for {len(sample_players)} player records'
})
# Assign all players with this name to the new SbaPlayer
for player in sample_players:
assignments.append({
'player_id': player['id'],
'player_name': player['name'],
'season': player['season'],
'bbref_id': player.get('bbref_id', ''),
'assigned_sbaplayer_id': next_sbaplayer_id,
'match_type': 'manual_new'
})
next_sbaplayer_id += 1
    # Handle three-way matches: all name variations consolidate under one canonical SbaPlayer
    processed_canonicals = set()
    for original_name, canonical_name in three_way_matches.items():
        if canonical_name in processed_canonicals:
            continue  # this canonical name has already been consolidated
        processed_canonicals.add(canonical_name)
        # All variations should map to the same new SbaPlayer
        all_variations = [name for name in new_players_needed if three_way_matches.get(name) == canonical_name] + [canonical_name]
        all_variations = list(set(all_variations))  # Remove duplicates
# Create one new SbaPlayer for all variations
sample_players = []
for variation in all_variations:
sample_players.extend([p for p in all_players if p['name'] == variation])
if sample_players:
sample = sample_players[0]
new_sbaplayers.append({
'new_id': next_sbaplayer_id,
'name': canonical_name,
'key_bbref': sample.get('bbref_id', ''),
'key_fangraphs': '',
'key_mlb': '',
'notes': f'Consolidated from {len(all_variations)} name variations: {", ".join(all_variations)}'
})
# Assign all players with these name variations
for player in sample_players:
assignments.append({
'player_id': player['id'],
'player_name': player['name'],
'season': player['season'],
'bbref_id': player.get('bbref_id', ''),
'assigned_sbaplayer_id': next_sbaplayer_id,
'match_type': 'manual_consolidated'
})
next_sbaplayer_id += 1
return assignments, new_sbaplayers
def save_assignment_files(assignments, new_sbaplayers):
"""Save the final assignment files"""
# Save player assignments
with open('player_sbaplayer_assignments.csv', 'w', newline='') as f:
if assignments:
fieldnames = assignments[0].keys()
writer = csv.DictWriter(f, fieldnames=fieldnames)
writer.writeheader()
writer.writerows(assignments)
logger.info(f"Saved {len(assignments)} player assignments to player_sbaplayer_assignments.csv")
# Save new SbaPlayers to insert
with open('new_sbaplayers_to_insert.csv', 'w', newline='') as f:
if new_sbaplayers:
fieldnames = new_sbaplayers[0].keys()
writer = csv.DictWriter(f, fieldnames=fieldnames)
writer.writeheader()
writer.writerows(new_sbaplayers)
logger.info(f"Saved {len(new_sbaplayers)} new SbaPlayer records to new_sbaplayers_to_insert.csv")
def main():
"""Main processing function"""
logger.info("Starting manual decision processing...")
try:
# Load cached data
all_players, sbaplayers = load_cached_data()
# Parse manual decisions
decisions = parse_manual_decisions()
# Determine final assignments
existing_matches, new_players_needed, three_way_matches = determine_final_assignments(
all_players, sbaplayers, decisions
)
# Generate assignment files
assignments, new_sbaplayers = generate_assignment_files(
            all_players, sbaplayers, existing_matches, new_players_needed, three_way_matches
)
# Save files
save_assignment_files(assignments, new_sbaplayers)
logger.info("Manual decision processing completed successfully!")
print(f"\nSUMMARY:")
print(f"- {len(assignments)} player assignments generated")
print(f"- {len(new_sbaplayers)} new SbaPlayer records to create")
print(f"- Files saved: player_sbaplayer_assignments.csv, new_sbaplayers_to_insert.csv")
except Exception as e:
logger.error(f"Error processing manual decisions: {e}")
raise
if __name__ == "__main__":
main()