major-domo-database/.claude/sqlite-to-postgres/botched-sbaplayer-matching/generate_complete_assignments.py
Cal Corum 7130a1fd43 Postgres Migration
Migration documentation and scripts
2025-08-25 07:18:31 -05:00

325 lines
12 KiB
Python

#!/usr/bin/env python3
"""
Generate complete player assignments for ALL players:
1. Automatic bbref_id matches
2. Manual review decisions
3. Create new SbaPlayers for remaining unmatched players
"""
import csv
import json
import logging
from dataclasses import dataclass
from typing import Dict, List, Set, Optional
from collections import defaultdict
# Module-level logging: configure the root handler once at import time
# (INFO level, stderr), then grab a named logger for this script.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('CompleteAssignments')
@dataclass
class PlayerRecord:
    """One per-season player row, loaded from /tmp/players_season_N.json."""
    id: int  # primary key of the season-specific player row
    name: str  # display name as stored in the season data
    season: int  # season number (cache files cover seasons 1-12)
    bbref_id: Optional[str] = None  # Baseball-Reference key, when known
    sbaplayer_id: Optional[int] = None  # pre-existing SbaPlayer link, if any
@dataclass
class SbaPlayerRecord:
    """An existing SbaPlayer row, loaded from /tmp/sbaplayers.json."""
    id: int  # real SbaPlayer primary key
    first_name: str
    last_name: str
    key_bbref: Optional[str] = None  # Baseball-Reference key (used for automatic matching)
    key_fangraphs: Optional[int] = None  # external ID — presumably FanGraphs; confirm upstream
    key_mlbam: Optional[int] = None  # external ID — presumably MLBAM; confirm upstream
    key_retro: Optional[str] = None  # external ID — presumably Retrosheet; confirm upstream
@dataclass
class NewSbaPlayer:
    """An SbaPlayer that does not exist yet and must be inserted.

    temp_id is a placeholder key (counter starts at 90000) used in the
    generated CSVs until real database IDs are assigned on insert.
    """
    first_name: str
    last_name: str
    key_bbref: Optional[str] = None  # carried over from the source player, when present
    temp_id: int = 0  # placeholder id; real values are assigned starting at 90000
def load_cached_data(cache_dir='/tmp', seasons=range(1, 13)):
    """Load cached SbaPlayer and per-season player data from JSON files.

    Args:
        cache_dir: Directory holding ``sbaplayers.json`` and
            ``players_season_<N>.json`` (defaults to the original /tmp layout).
        seasons: Iterable of season numbers whose player files to load
            (defaults to seasons 1-12, matching the original behavior).

    Returns:
        Tuple ``(all_players, sbaplayers)`` — a flat list of PlayerRecord
        across all requested seasons, and a list of SbaPlayerRecord.

    Raises:
        FileNotFoundError: If any expected cache file is missing.
    """
    logger.info("Loading cached data...")
    # Existing SbaPlayer rows.
    with open(f"{cache_dir}/sbaplayers.json", 'r', encoding='utf-8') as f:
        sbaplayer_data = json.load(f)
    sbaplayers = [SbaPlayerRecord(**data) for data in sbaplayer_data]
    # Every season's player rows, flattened into a single list.
    all_players = []
    for season in seasons:
        with open(f"{cache_dir}/players_season_{season}.json", 'r', encoding='utf-8') as f:
            season_data = json.load(f)
        all_players.extend(PlayerRecord(**data) for data in season_data)
    logger.info(f"Loaded {len(sbaplayers)} SbaPlayers and {len(all_players)} player records")
    return all_players, sbaplayers
def parse_manual_decisions(
    unmatched_path='/mnt/NV2/Development/major-domo/database/unmatched_players_for_review_updated.csv',
    high_risk_path='/mnt/NV2/Development/major-domo/database/high_risk_player_matches_updated.csv',
):
    """Parse manual match decisions from the two reviewed CSV files.

    Recognized ``resolution`` values per row:
      * '' or 'SKIP'    -- row ignored
      * 'ACCEPT'        -- use the suggested sbaplayer id, unless it is
                           prefixed with 'PARTIAL:' (not a definite match)
      * 'USE_SBA_<id>'  -- force assignment to the given SbaPlayer id
    'CREATE_NEW' rows are deliberately not handled here; new SbaPlayers are
    created later in generate_complete_assignments().

    Args:
        unmatched_path: CSV of reviewed unmatched players (defaults to the
            original hard-coded location).
        high_risk_path: CSV of reviewed high-risk fuzzy matches.

    Returns:
        Dict mapping player_id -> sbaplayer_id for every actionable decision.
        Missing files are logged as warnings, not errors.
    """
    logger.info("Parsing manual decisions...")
    manual_assignments = {}  # player_id -> sbaplayer_id

    def find_column(fieldnames, *parts):
        """First header containing all *parts* (case-insensitive); '_'-joined parts as fallback."""
        return next(
            (col for col in fieldnames if all(p in col.lower() for p in parts)),
            '_'.join(parts),
        )

    # Pass 1: unmatched-player review decisions (ACCEPT and USE_SBA_*).
    try:
        with open(unmatched_path, 'r', encoding='utf-8') as f:
            reader = csv.DictReader(f)
            # Headers are constant for the whole file, so detect the column
            # names once instead of per row as before.
            player_id_key = find_column(reader.fieldnames or [], 'player', 'id')
            suggested_id_key = find_column(reader.fieldnames or [], 'suggested', 'sbaplayer', 'id')
            for row in reader:
                resolution = row.get('resolution', '').strip()
                if not resolution or resolution == 'SKIP':
                    continue
                player_id = int(row[player_id_key])
                if resolution == 'ACCEPT':
                    suggested_id = row[suggested_id_key]
                    if suggested_id and not suggested_id.startswith('PARTIAL:'):
                        manual_assignments[player_id] = int(suggested_id)
                elif resolution.startswith('USE_SBA_'):
                    manual_assignments[player_id] = int(resolution.replace('USE_SBA_', ''))
                # Note: CREATE_NEW cases will be handled by creating new SbaPlayers
    except FileNotFoundError:
        logger.warning("Unmatched decisions file not found")

    # Pass 2: high-risk match review decisions (only USE_SBA_* is actionable).
    try:
        with open(high_risk_path, 'r', encoding='utf-8') as f:
            reader = csv.DictReader(f)
            player_id_key = find_column(reader.fieldnames or [], 'player', 'id')
            for row in reader:
                resolution = row.get('resolution', '').strip()
                if not resolution or resolution == 'SKIP':
                    continue
                if resolution.startswith('USE_SBA_') and row[player_id_key]:
                    player_id = int(row[player_id_key])
                    manual_assignments[player_id] = int(resolution.replace('USE_SBA_', ''))
    except FileNotFoundError:
        logger.warning("High-risk decisions file not found")

    logger.info(f"Found {len(manual_assignments)} manual player assignments")
    return manual_assignments
def generate_complete_assignments(all_players, sbaplayers, manual_assignments):
    """Assign every player record to an SbaPlayer, existing or newly created.

    Resolution order per player:
      1. Manual review decision (``manual_assignments``).
      2. Automatic match on bbref_id against existing SbaPlayers.
      3. A new SbaPlayer record, shared by all player rows with the same
         bbref_id (or, lacking one, the same exact name).

    Args:
        all_players: List of PlayerRecord across all seasons.
        sbaplayers: List of existing SbaPlayerRecord rows.
        manual_assignments: Dict player_id -> sbaplayer_id from manual review.

    Returns:
        Tuple ``(all_assignments, new_sbaplayers)``: a list of
        ``(player_id, sbaplayer_id, assignment_type)`` triples, and a list of
        NewSbaPlayer records with temp_id >= 90000.
    """
    # Index existing SbaPlayers by bbref key for O(1) automatic matching.
    sbaplayers_by_bbref = {sba.key_bbref: sba for sba in sbaplayers if sba.key_bbref}

    all_assignments = []  # (player_id, sbaplayer_id, assignment_type)
    new_sbaplayers = []
    players_needing_new_sba = defaultdict(list)  # group key -> [PlayerRecord]
    # Temp IDs start at 90000 so downstream code can tell them apart from
    # real SbaPlayer ids (generate_csv_files relies on this threshold).
    temp_id_counter = 90000

    logger.info("Processing all player records...")
    for player in all_players:
        # 1. Manual decisions always win.
        if player.id in manual_assignments:
            sbaplayer_id = manual_assignments[player.id]
            assignment_type = "manual_decision"
        # 2. Automatic bbref_id match against an existing SbaPlayer.
        elif player.bbref_id and player.bbref_id in sbaplayers_by_bbref:
            sbaplayer_id = sbaplayers_by_bbref[player.bbref_id].id
            assignment_type = "automatic_bbref_match"
        # 3. Needs a brand-new SbaPlayer: group rows so one new record serves
        #    every season row for the same underlying person.
        else:
            key = f"bbref:{player.bbref_id}" if player.bbref_id else f"name:{player.name}"
            players_needing_new_sba[key].append(player)
            continue  # assigned after the new SbaPlayers are created below

        # `is not None` (not truthiness) so a legitimate id of 0 is kept.
        if sbaplayer_id is not None:
            all_assignments.append((player.id, sbaplayer_id, assignment_type))

    logger.info(f"Direct assignments: {len(all_assignments)}")
    logger.info(f"Player groups needing new SbaPlayers: {len(players_needing_new_sba)}")

    # Create one new SbaPlayer per group and point every grouped player at it.
    for key, players in players_needing_new_sba.items():
        representative = players[0]
        # Naive "first token / rest" split; single-token names get an empty last name.
        name_parts = representative.name.strip().split()
        if len(name_parts) >= 2:
            first_name = name_parts[0]
            last_name = ' '.join(name_parts[1:])
        else:
            first_name = representative.name
            last_name = ""
        new_sbaplayers.append(NewSbaPlayer(
            first_name=first_name,
            last_name=last_name,
            key_bbref=representative.bbref_id,
            temp_id=temp_id_counter,
        ))
        for player in players:
            all_assignments.append((player.id, temp_id_counter, "new_sbaplayer"))
        temp_id_counter += 1

    logger.info(f"Total assignments: {len(all_assignments)}")
    logger.info(f"New SbaPlayers to create: {len(new_sbaplayers)}")
    return all_assignments, new_sbaplayers
def generate_csv_files(all_assignments, new_sbaplayers, all_players, sbaplayers,
                       output_dir='/mnt/NV2/Development/major-domo/database'):
    """Write the final migration artifacts: two CSVs and a text summary.

    Produces, inside ``output_dir`` (default preserves the original
    hard-coded location):
      * new_sbaplayers_to_insert_complete.csv  -- new SbaPlayer rows keyed by temp_id
      * player_sbaplayer_assignments_complete.csv -- every player assignment
      * complete_assignment_summary.txt        -- human-readable counts

    Args:
        all_assignments: List of (player_id, sbaplayer_id, assignment_type).
        new_sbaplayers: List of NewSbaPlayer to insert (temp_id >= 90000).
        all_players: All PlayerRecord rows (for name/season/bbref lookups).
        sbaplayers: Existing SbaPlayerRecord rows (for name/bbref lookups).
        output_dir: Destination directory for the generated files.
    """
    players_by_id = {p.id: p for p in all_players}
    sbaplayers_by_id = {sba.id: sba for sba in sbaplayers}

    # --- New SbaPlayers CSV -------------------------------------------------
    logger.info("Generating new SbaPlayers CSV...")
    with open(f"{output_dir}/new_sbaplayers_to_insert_complete.csv", 'w',
              newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['temp_id', 'first_name', 'last_name', 'key_bbref', 'key_fangraphs', 'key_mlbam', 'key_retro'])
        for sba in new_sbaplayers:
            # Only bbref keys are ever known for new players; the other
            # external-key columns are intentionally left blank.
            writer.writerow([
                sba.temp_id,
                sba.first_name,
                sba.last_name,
                sba.key_bbref or '',
                '',  # key_fangraphs
                '',  # key_mlbam
                ''   # key_retro
            ])

    # --- Complete assignments CSV -------------------------------------------
    logger.info("Generating complete player assignments CSV...")
    with open(f"{output_dir}/player_sbaplayer_assignments_complete.csv", 'w',
              newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow([
            'player_id', 'player_name', 'player_season', 'player_bbref_id',
            'sbaplayer_id', 'sbaplayer_name', 'sbaplayer_bbref', 'assignment_type'
        ])
        for player_id, sbaplayer_id, assignment_type in all_assignments:
            player = players_by_id[player_id]
            # IDs >= 90000 are temp ids for not-yet-inserted SbaPlayers
            # (matches the counter base in generate_complete_assignments).
            if sbaplayer_id >= 90000:
                sbaplayer_name = f"NEW_TEMP_{sbaplayer_id}"
                sbaplayer_bbref = player.bbref_id or ''
            elif sbaplayer_id in sbaplayers_by_id:
                sba = sbaplayers_by_id[sbaplayer_id]
                sbaplayer_name = f"{sba.first_name} {sba.last_name}"
                sbaplayer_bbref = sba.key_bbref or ''
            else:
                # Assignment points at an SbaPlayer id we never loaded —
                # keep the row but flag it for follow-up.
                sbaplayer_name = f"UNKNOWN_SBA_{sbaplayer_id}"
                sbaplayer_bbref = ''
            writer.writerow([
                player.id,
                player.name,
                player.season,
                player.bbref_id or '',
                sbaplayer_id,
                sbaplayer_name,
                sbaplayer_bbref,
                assignment_type
            ])

    # --- Text summary ---------------------------------------------------------
    with open(f"{output_dir}/complete_assignment_summary.txt", 'w', encoding='utf-8') as f:
        f.write("COMPLETE PLAYER ASSIGNMENT SUMMARY\n")
        f.write("=" * 50 + "\n\n")
        type_counts = defaultdict(int)
        for _, _, assignment_type in all_assignments:
            type_counts[assignment_type] += 1
        f.write("ASSIGNMENT BREAKDOWN:\n")
        for assignment_type, count in type_counts.items():
            f.write(f"  {assignment_type}: {count:,} players\n")
        f.write(f"\nTOTAL ASSIGNMENTS: {len(all_assignments):,}\n")
        f.write(f"NEW SBAPLAYERS TO CREATE: {len(new_sbaplayers):,}\n\n")
        f.write("FILES GENERATED:\n")
        f.write("1. new_sbaplayers_to_insert_complete.csv - New SbaPlayer records\n")
        f.write("2. player_sbaplayer_assignments_complete.csv - ALL player assignments\n")
def main():
    """Run the full pipeline: load caches, apply decisions, emit CSVs."""
    logger.info("Generating complete player assignments...")

    # Cached inputs: per-season player rows plus existing SbaPlayers.
    players, sba_records = load_cached_data()

    # Reviewer decisions from the two manually-edited CSVs.
    decisions = parse_manual_decisions()

    # Resolve every player to an existing or brand-new SbaPlayer.
    assignments, created = generate_complete_assignments(players, sba_records, decisions)

    # Emit the migration artifacts (two CSVs and a summary).
    generate_csv_files(assignments, created, players, sba_records)

    logger.info("\n=== COMPLETE ASSIGNMENT GENERATION COMPLETE ===")
    logger.info(f"Total player assignments: {len(assignments):,}")
    logger.info(f"New SbaPlayers needed: {len(created):,}")
# Script entry point: run the pipeline only when executed directly.
if __name__ == "__main__":
    main()