major-domo-database/.claude/sqlite-to-postgres/botched-sbaplayer-matching/process_matching_decisions.py
Cal Corum 7130a1fd43 Postgres Migration
Migration documentation and scripts
2025-08-25 07:18:31 -05:00

396 lines
16 KiB
Python

#!/usr/bin/env python3
"""
Process manual matching decisions from reviewed CSV files
Generate two output CSV files:
1. New SbaPlayers to insert
2. All player_id -> sbaplayer_id assignments
"""
import csv
import json
import logging
from dataclasses import dataclass
from typing import Dict, List, Set, Optional
from collections import defaultdict
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('ProcessDecisions')
@dataclass
class PlayerRecord:
    """One per-season player row loaded from the cached season JSON dumps."""
    id: int  # legacy players-table primary key
    name: str  # display name as stored in the season dump
    season: int  # season number (load_cached_data reads seasons 1-12)
    bbref_id: Optional[str] = None  # Baseball-Reference key, when known
    sbaplayer_id: Optional[int] = None  # linked SbaPlayer id, if already assigned
@dataclass
class SbaPlayerRecord:
    """An existing SbaPlayer row, with its external identifier keys."""
    id: int  # SbaPlayer primary key
    first_name: str
    last_name: str
    key_bbref: Optional[str] = None  # Baseball-Reference ID
    key_fangraphs: Optional[int] = None  # FanGraphs ID
    key_mlbam: Optional[int] = None  # MLB Advanced Media ID
    key_retro: Optional[str] = None  # Retrosheet ID
@dataclass
class NewSbaPlayer:
    """A SbaPlayer that does not exist yet; referenced by temp_id until inserted."""
    first_name: str
    last_name: str
    key_bbref: Optional[str] = None  # Baseball-Reference ID, when known
    temp_id: int = 0  # Temporary ID for referencing before insertion
def load_cached_data(sbaplayers_path: str = '/tmp/sbaplayers.json',
                     players_path_template: str = '/tmp/players_season_{season}.json',
                     num_seasons: int = 12):
    """Load cached player and sbaplayer data from the JSON dump files.

    Args:
        sbaplayers_path: JSON file containing a list of SbaPlayer dicts.
        players_path_template: Path template with a ``{season}`` placeholder
            for the per-season player dumps.
        num_seasons: How many seasons to load (seasons 1..num_seasons).

    Returns:
        Tuple of (all_players, sbaplayers): lists of PlayerRecord and
        SbaPlayerRecord.

    Raises:
        FileNotFoundError: If any cached JSON dump is missing.
    """
    logger.info("Loading cached data...")

    # Load SbaPlayers
    with open(sbaplayers_path, 'r', encoding='utf-8') as f:
        sbaplayer_data = json.load(f)
    sbaplayers = [SbaPlayerRecord(**data) for data in sbaplayer_data]

    # Load every season's player rows into one flat list
    all_players = []
    for season in range(1, num_seasons + 1):
        with open(players_path_template.format(season=season), 'r', encoding='utf-8') as f:
            season_data = json.load(f)
        all_players.extend(PlayerRecord(**data) for data in season_data)

    logger.info(f"Loaded {len(sbaplayers)} SbaPlayers and {len(all_players)} player records")
    return all_players, sbaplayers
def parse_unmatched_decisions(csv_file: str = '/mnt/NV2/Development/major-domo/database/unmatched_players_for_review_updated.csv'):
    """Parse reviewer decisions from the unmatched players CSV.

    Rows whose ``resolution`` cell is blank or ``SKIP`` are ignored.

    Args:
        csv_file: Path to the reviewed CSV (defaults to the migration copy).

    Returns:
        List of decision dicts with keys: player_id (int), player_name,
        bbref_id (None when empty), seasons_appeared, resolution,
        suggested_sbaplayer_id, match_type.
    """
    logger = logging.getLogger('ProcessDecisions')  # same singleton as the module logger
    logger.info("Parsing unmatched player decisions...")
    decisions = []

    with open(csv_file, 'r', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        # Debug: print column names
        logger.info(f"CSV columns: {reader.fieldnames}")

        # Header names may carry encoding artifacts (e.g. a BOM), so resolve
        # the real column names once up front instead of re-scanning per row.
        fields = reader.fieldnames or []
        player_id_key = next((k for k in fields if 'player' in k.lower() and 'id' in k.lower()), 'player_id')
        bbref_id_key = next((k for k in fields if 'bbref' in k.lower() and 'id' in k.lower()), 'bbref_id')
        seasons_key = next((k for k in fields if 'seasons' in k.lower() and 'appeared' in k.lower()), 'seasons_appeared')
        suggested_id_key = next((k for k in fields if 'suggested' in k.lower() and 'sbaplayer' in k.lower() and 'id' in k.lower()), 'suggested_sbaplayer_id')
        match_type_key = next((k for k in fields if 'match' in k.lower() and 'type' in k.lower()), 'match_type')

        for row in reader:
            resolution = row.get('resolution', '').strip()
            if not resolution or resolution == 'SKIP':
                continue
            decisions.append({
                'player_id': int(row[player_id_key]),
                'player_name': row['name'],
                'bbref_id': row[bbref_id_key] if row[bbref_id_key] else None,
                'seasons_appeared': row[seasons_key],
                'resolution': resolution,
                'suggested_sbaplayer_id': row[suggested_id_key],
                'match_type': row[match_type_key]
            })

    logger.info(f"Found {len(decisions)} unmatched player decisions")
    return decisions
def parse_high_risk_decisions(csv_file: str = '/mnt/NV2/Development/major-domo/database/high_risk_player_matches_updated.csv'):
    """Parse reviewer decisions from the high risk matches CSV.

    Rows whose ``resolution`` cell is blank or ``SKIP`` are ignored.

    Args:
        csv_file: Path to the reviewed CSV (defaults to the migration copy).

    Returns:
        List of decision dicts with keys: risk_type, resolution, player_id,
        player_name, sba1_id, sba1_name, sba2_id, sba2_name. The *_id values
        are ints, or None when the cell is empty.
    """
    logger = logging.getLogger('ProcessDecisions')  # same singleton as the module logger
    logger.info("Parsing high-risk match decisions...")
    decisions = []

    with open(csv_file, 'r', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        # Debug: print column names
        logger.info(f"High-risk CSV columns: {reader.fieldnames}")

        # Header names may carry encoding artifacts (e.g. a BOM), so resolve
        # the real column names once up front instead of re-scanning per row.
        fields = reader.fieldnames or []
        risk_type_key = next((k for k in fields if 'risk' in k.lower() and 'type' in k.lower()), 'risk_type')
        player_id_key = next((k for k in fields if 'player' in k.lower() and 'id' in k.lower()), 'player_id')
        player_name_key = next((k for k in fields if 'player' in k.lower() and 'name' in k.lower()), 'player_name')
        sba1_id_key = next((k for k in fields if 'sba1' in k.lower() and 'id' in k.lower()), 'sba1_id')
        sba1_name_key = next((k for k in fields if 'sba1' in k.lower() and 'name' in k.lower()), 'sba1_name')
        sba2_id_key = next((k for k in fields if 'sba2' in k.lower() and 'id' in k.lower()), 'sba2_id')
        sba2_name_key = next((k for k in fields if 'sba2' in k.lower() and 'name' in k.lower()), 'sba2_name')

        for row in reader:
            resolution = row.get('resolution', '').strip()
            if not resolution or resolution == 'SKIP':
                continue
            decisions.append({
                'risk_type': row[risk_type_key],
                'resolution': resolution,
                'player_id': int(row[player_id_key]) if row[player_id_key] else None,
                'player_name': row[player_name_key],
                'sba1_id': int(row[sba1_id_key]) if row[sba1_id_key] else None,
                'sba1_name': row[sba1_name_key],
                'sba2_id': int(row[sba2_id_key]) if row[sba2_id_key] else None,
                'sba2_name': row[sba2_name_key]
            })

    logger.info(f"Found {len(decisions)} high-risk match decisions")
    return decisions
def _find_identity_group(all_players, player_name, bbref_id):
    """Return every player row that shares one real-world identity.

    When a bbref_id is known, identity is every row carrying that bbref_id;
    otherwise it is every bbref-less row with the same display name.
    """
    if bbref_id:
        return [p for p in all_players if p.bbref_id == bbref_id]
    return [p for p in all_players if p.name == player_name and not p.bbref_id]


def process_decisions(all_players, sbaplayers, unmatched_decisions, high_risk_decisions):
    """Apply reviewer decisions, producing new SbaPlayers, assignments, merges.

    Args:
        all_players: All PlayerRecord rows across every season.
        sbaplayers: Existing SbaPlayerRecord rows (accepted for interface
            compatibility; not consulted by this function).
        unmatched_decisions: Decision dicts from parse_unmatched_decisions().
        high_risk_decisions: Decision dicts from parse_high_risk_decisions().

    Returns:
        Tuple of (new_sbaplayers, player_assignments, sbaplayer_merges) where
        player_assignments is a list of (player_id, sbaplayer_id) pairs and
        sbaplayer_merges is a list of (from_id, to_id) pairs. Newly created
        SbaPlayers are referenced by temporary IDs starting at 90000.
    """
    new_sbaplayers = []
    player_assignments = []  # (player_id, sbaplayer_id) pairs
    sbaplayer_merges = []  # (from_id, to_id) pairs
    temp_id_counter = 90000  # Start temp IDs at 90000 to avoid conflicts

    logger.info("Processing unmatched player decisions...")
    for decision in unmatched_decisions:
        resolution = decision['resolution']
        player_name = decision['player_name']
        bbref_id = decision['bbref_id']

        if resolution == 'ACCEPT':
            # Accept the suggested SbaPlayer — unless the suggestion was only
            # a partial-match placeholder ("PARTIAL:..."), which carries no ID.
            suggested_id = decision['suggested_sbaplayer_id']
            if suggested_id and not suggested_id.startswith('PARTIAL:'):
                sbaplayer_id = int(suggested_id)
                for player in _find_identity_group(all_players, player_name, bbref_id):
                    player_assignments.append((player.id, sbaplayer_id))

        elif resolution.startswith('USE_SBA_'):
            # Reviewer pinned an explicit SbaPlayer, e.g. USE_SBA_123
            sbaplayer_id = int(resolution.replace('USE_SBA_', ''))
            for player in _find_identity_group(all_players, player_name, bbref_id):
                player_assignments.append((player.id, sbaplayer_id))

        elif resolution == 'CREATE_NEW':
            # No existing SbaPlayer fits: create one under a temporary ID
            name_parts = player_name.strip().split()
            if len(name_parts) >= 2:
                first_name = name_parts[0]
                last_name = ' '.join(name_parts[1:])  # Handle multiple part last names
            else:
                first_name = player_name
                last_name = ""
            new_sbaplayers.append(NewSbaPlayer(
                first_name=first_name,
                last_name=last_name,
                key_bbref=bbref_id,
                temp_id=temp_id_counter
            ))
            for player in _find_identity_group(all_players, player_name, bbref_id):
                player_assignments.append((player.id, temp_id_counter))
            temp_id_counter += 1

    logger.info("Processing high-risk match decisions...")
    for decision in high_risk_decisions:
        resolution = decision['resolution']

        if resolution.startswith('MERGE_') and '_INTO_' in resolution:
            # Parse merge instruction: MERGE_123_INTO_456
            from_part, to_part = resolution.replace('MERGE_', '').split('_INTO_')
            sbaplayer_merges.append((int(from_part), int(to_part)))

        elif resolution.startswith('USE_SBA_') and decision['player_id']:
            # Ambiguous name match resolved to a specific SbaPlayer. These
            # rows have no bbref_id, so grouping falls back to the name.
            sbaplayer_id = int(resolution.replace('USE_SBA_', ''))
            for player in _find_identity_group(all_players, decision['player_name'], None):
                player_assignments.append((player.id, sbaplayer_id))

    logger.info(f"Generated {len(new_sbaplayers)} new SbaPlayers")
    logger.info(f"Generated {len(player_assignments)} player assignments")
    logger.info(f"Generated {len(sbaplayer_merges)} SbaPlayer merges")
    return new_sbaplayers, player_assignments, sbaplayer_merges
def generate_new_sbaplayers_csv(new_sbaplayers,
                                output_file: str = '/mnt/NV2/Development/major-domo/database/new_sbaplayers_to_insert.csv'):
    """Write the new SbaPlayer records to a CSV ready for insertion.

    Args:
        new_sbaplayers: NewSbaPlayer records produced by process_decisions().
        output_file: Destination path for the CSV.

    Returns:
        The path of the CSV that was written.
    """
    logger = logging.getLogger('ProcessDecisions')  # same singleton as the module logger
    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        # Only key_bbref is known at this point; the other external key
        # columns are emitted empty for a later enrichment pass.
        writer.writerow(['temp_id', 'first_name', 'last_name', 'key_bbref', 'key_fangraphs', 'key_mlbam', 'key_retro'])
        for sba in new_sbaplayers:
            writer.writerow([
                sba.temp_id,
                sba.first_name,
                sba.last_name,
                sba.key_bbref or '',
                '',  # key_fangraphs (empty)
                '',  # key_mlbam (empty)
                ''   # key_retro (empty)
            ])
    logger.info(f"Generated new SbaPlayers CSV: {output_file}")
    return output_file
def generate_player_assignments_csv(player_assignments, sbaplayer_merges, all_players, sbaplayers_by_id,
                                    output_file: str = '/mnt/NV2/Development/major-domo/database/player_sbaplayer_assignments.csv'):
    """Write every player -> sbaplayer assignment to a review CSV.

    Args:
        player_assignments: (player_id, sbaplayer_id) pairs; IDs >= 90000 are
            temporary IDs for SbaPlayers not yet inserted.
        sbaplayer_merges: (from_id, to_id) merge pairs used to redirect
            assignments that point at a merged-away SbaPlayer.
        all_players: All PlayerRecord rows (must contain every assigned id).
        sbaplayers_by_id: Existing SbaPlayerRecord rows keyed by id.
        output_file: Destination path for the CSV.

    Returns:
        The path of the CSV that was written.

    Raises:
        KeyError: If an assignment references a player id absent from
            all_players.
    """
    logger = logging.getLogger('ProcessDecisions')  # same singleton as the module logger
    TEMP_ID_START = 90000  # must match the temp-ID base used in process_decisions

    # Map merged-away SbaPlayer IDs to their surviving target ID
    merge_mapping = dict(sbaplayer_merges)
    players_by_id = {p.id: p for p in all_players}

    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow([
            'player_id', 'player_name', 'player_season', 'player_bbref_id',
            'sbaplayer_id', 'sbaplayer_name', 'sbaplayer_bbref', 'assignment_type'
        ])
        for player_id, sbaplayer_id in player_assignments:
            player = players_by_id[player_id]
            # Resolve the final target in case this assignment points at a
            # SbaPlayer that is being merged into another one
            final_sbaplayer_id = merge_mapping.get(sbaplayer_id, sbaplayer_id)

            if final_sbaplayer_id >= TEMP_ID_START:
                # Newly created SbaPlayer, known only by its temp ID so far
                sbaplayer_name = f"NEW_TEMP_{final_sbaplayer_id}"
                sbaplayer_bbref = player.bbref_id or ''
                assignment_type = 'new_sbaplayer'
            elif final_sbaplayer_id in sbaplayers_by_id:
                # Existing SbaPlayer
                sba = sbaplayers_by_id[final_sbaplayer_id]
                sbaplayer_name = f"{sba.first_name} {sba.last_name}"
                sbaplayer_bbref = sba.key_bbref or ''
                assignment_type = 'existing_sbaplayer'
            else:
                # Should not happen: assignment references an unknown SbaPlayer
                sbaplayer_name = f"UNKNOWN_SBA_{final_sbaplayer_id}"
                sbaplayer_bbref = ''
                assignment_type = 'error'

            writer.writerow([
                player.id,
                player.name,
                player.season,
                player.bbref_id or '',
                final_sbaplayer_id,
                sbaplayer_name,
                sbaplayer_bbref,
                assignment_type
            ])
    logger.info(f"Generated player assignments CSV: {output_file}")
    return output_file
def generate_summary_report(new_sbaplayers, player_assignments, sbaplayer_merges,
                            summary_file: str = '/mnt/NV2/Development/major-domo/database/processing_summary.txt'):
    """Write a human-readable summary of the processing run.

    Args:
        new_sbaplayers: New SbaPlayer records to be created.
        player_assignments: (player_id, sbaplayer_id) pairs to be applied.
        sbaplayer_merges: (from_id, to_id) SbaPlayer merge pairs.
        summary_file: Destination path for the text report.

    Returns:
        The path of the report that was written.
    """
    logger = logging.getLogger('ProcessDecisions')  # same singleton as the module logger
    with open(summary_file, 'w', encoding='utf-8') as f:
        f.write("MATCHING DECISIONS PROCESSING SUMMARY\n")
        f.write("=" * 50 + "\n\n")
        f.write(f"New SbaPlayers to create: {len(new_sbaplayers)}\n")
        f.write(f"Player assignments to make: {len(player_assignments)}\n")
        f.write(f"SbaPlayer merges to perform: {len(sbaplayer_merges)}\n\n")
        f.write("FILES GENERATED:\n")
        f.write("1. new_sbaplayers_to_insert.csv - New SbaPlayer records to create\n")
        f.write("2. player_sbaplayer_assignments.csv - All player -> sbaplayer assignments\n\n")
        f.write("SbaPlayer Merges:\n")
        for from_id, to_id in sbaplayer_merges:
            f.write(f" - Merge SbaPlayer {from_id} into {to_id}\n")
        f.write("\nNEXT STEPS:\n")  # plain string: the original f-string had no placeholders
        f.write("1. Review the generated CSV files\n")
        f.write("2. Execute SbaPlayer merges first (if any)\n")
        f.write("3. Insert new SbaPlayers and get their real IDs\n")
        f.write("4. Update all player records with sbaplayer_id assignments\n")
    logger.info(f"Generated summary report: {summary_file}")
    return summary_file
def main():
    """Run the full pipeline: load caches, apply decisions, emit output files."""
    logger.info("Starting processing of matching decisions...")

    # Cached inputs from the earlier extraction step
    players, sba_records = load_cached_data()
    sba_by_id = {record.id: record for record in sba_records}

    # Reviewer decisions from the two spreadsheets
    decisions_unmatched = parse_unmatched_decisions()
    decisions_high_risk = parse_high_risk_decisions()

    # Turn decisions into concrete inserts, assignments, and merges
    created, assignments, merges = process_decisions(
        players, sba_records, decisions_unmatched, decisions_high_risk
    )

    # Emit the review artifacts
    created_csv = generate_new_sbaplayers_csv(created)
    assignments_csv = generate_player_assignments_csv(
        assignments, merges, players, sba_by_id
    )
    generate_summary_report(created, assignments, merges)

    logger.info("\n=== PROCESSING COMPLETE ===")
    logger.info(f"New SbaPlayers CSV: {created_csv}")
    logger.info(f"Player assignments CSV: {assignments_csv}")
    logger.info(f"Summary: processing_summary.txt")


if __name__ == "__main__":
    main()