major-domo-database/.claude/sqlite-to-postgres/player-to-sbaplayer-matching/generate_final_assignments_v2.py
Cal Corum 7130a1fd43 Postgres Migration
Migration documentation and scripts
2025-08-25 07:18:31 -05:00

492 lines
21 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Generate final player-to-SbaPlayer assignments by combining:
1. Automatic matches from comprehensive matching (Tier 1 & 2)
2. Manual decisions from review CSV
3. New SbaPlayer records for unmatched players
FIXED VERSION: Properly handles all 12,232 individual player records
"""
import json
import csv
import logging
import re
from dataclasses import dataclass, asdict
from typing import Dict, List, Set, Optional, Tuple
from collections import defaultdict
# Import functions from comprehensive matching
from comprehensive_player_matching import (
PlayerRecord, SbaPlayerRecord, normalize_name,
create_name_variants, load_cached_data, create_matching_maps
)
# Set up logging
# Configure the root logger once at import time: records at INFO and above go
# to the 'matching.log' file and to a default StreamHandler.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    handlers=[logging.FileHandler('matching.log'), logging.StreamHandler()],
)

# Logger used by every function in this script.
logger = logging.getLogger(f'{__name__}.generate_final_assignments_v2')
@dataclass
class ManualDecision:
    """A single reviewed row taken from the manual-review CSV.

    Each field carries the text of the CSV column with the same name; no
    value is converted to another type here.
    """

    # Identification of the player group that was put up for review
    group_key: str
    player_name: str
    bbref_id: str
    seasons_appeared: str
    sample_player_ids: str

    # Existing SbaPlayer that was suggested to the reviewer as a candidate
    potential_existing_sbaplayer_id: str
    potential_existing_sbaplayer_name: str
    potential_match_reason: str

    # Reviewer's answer: an existing SbaPlayer id as text, or empty when a
    # new SbaPlayer record is wanted; the notes may carry consolidation hints
    your_decision_sbaplayer_id: str
    your_decision_notes: str
@dataclass
class FinalAssignment:
    """Outcome of the matching process for one player record."""

    # The player record being assigned
    player_id: int
    player_name: str
    season: int
    bbref_id: Optional[str]

    # SbaPlayer the record is linked to (an existing id or a temporary one)
    assigned_sbaplayer_id: int

    # How the link was decided: 'tier1_bbref', 'tier2_name',
    # 'manual_existing', 'manual_new' or 'emergency_new'
    assignment_source: str

    # Free-text explanation of the assignment
    notes: str = ""
@dataclass
class NewSbaPlayer:
    """Description of an SbaPlayer record that still has to be created."""

    # Placeholder id used to link assignments until the real record exists
    temp_id: int

    # Name the new record will carry, plus its optional bbref key
    canonical_name: str
    key_bbref: Optional[str]
    first_name: str
    last_name: str

    # Player names that were grouped under this record
    name_variations: List[str]

    # Number of player records found for the group
    player_count: int

    # Free-text explanation of where the record came from
    notes: str
def parse_manual_decisions(csv_path='new_sbaplayers_for_review.csv'):
    """Parse the reviewer's decisions from the manual-review CSV.

    Args:
        csv_path: Path of the review CSV. The default is the file name this
            script has always read, so existing callers need no change.

    Returns:
        A list of ManualDecision objects, one per CSV row whose
        ``group_key`` cell is non-empty. Cells missing from a short row are
        returned as empty strings.

    Raises:
        OSError: If the file cannot be opened.
        KeyError: If an expected column is absent from the CSV header.
    """
    columns = (
        'group_key',
        'player_name',
        'bbref_id',
        'seasons_appeared',
        'sample_player_ids',
        'potential_existing_sbaplayer_id',
        'potential_existing_sbaplayer_name',
        'potential_match_reason',
        'your_decision_sbaplayer_id',
        'your_decision_notes',
    )

    logger.info(f"Parsing manual decisions from {csv_path}...")

    # 'utf-8-sig' reads plain UTF-8 unchanged but also drops a leading
    # byte-order mark. With plain 'utf-8' a BOM would become part of the
    # first header name, 'group_key' would never be found, and every row
    # would be skipped silently.
    with open(csv_path, 'r', encoding='utf-8-sig') as f:
        content = f.read()

    # The export contains UTF-7 style escape sequences (not HTML entities)
    # for a few punctuation characters; turn them back into the literals.
    for escaped, literal in (('+AF8-', '_'), ('+ACI-', '"'), ('+AC0-', '-')):
        content = content.replace(escaped, literal)

    reader = csv.DictReader(content.strip().split('\n'))

    decisions = []
    for row in reader:
        if not row.get('group_key'):  # Skip empty rows
            continue
        # DictReader fills cells missing from a short row with None; store
        # '' instead so later substring checks on the notes cannot fail.
        decisions.append(ManualDecision(**{col: row[col] or '' for col in columns}))

    logger.info(f"Parsed {len(decisions)} manual decisions")
    return decisions
def run_comprehensive_matching(all_players, sbaplayers):
    """Compute the automatic player-to-SbaPlayer matches.

    Two passes are made over ``all_players``:

    * Tier 1 links a player whose bbref_id is a key of the bbref lookup map.
    * Tier 2 links a still-unmatched player that has no usable bbref_id and
      whose normalized name maps to exactly one SbaPlayer.

    Returns:
        A dict keyed by player id; each value is the tuple
        ``(sbaplayer_id, source, notes)``.
    """
    logger.info("Running comprehensive matching logic...")

    by_bbref, by_name = create_matching_maps(sbaplayers)

    def usable_bbref(record):
        """Return the record's bbref_id, or None when it is empty or "HALP"."""
        raw = record.bbref_id
        # "HALP" is corrupted data and is treated like a missing bbref_id.
        if raw and raw != "HALP":
            return raw
        return None

    matched = {}  # player_id -> (sbaplayer_id, source, notes)

    # ---- Tier 1: bbref_id lookup ----
    logger.info("Tier 1: Matching by bbref_id...")
    tier1_count = 0
    for record in all_players:
        bbref = usable_bbref(record)
        if not bbref or bbref not in by_bbref:
            continue
        target = by_bbref[bbref]
        matched[record.id] = (
            target.id,
            "tier1_bbref",
            f"Automatic match via bbref_id to {target.full_name}",
        )
        tier1_count += 1
    logger.info(f"Tier 1 matches: {tier1_count} player records")

    # ---- Tier 2: exact (normalized) name lookup ----
    logger.info("Tier 2: Matching by exact name...")
    tier2_count = 0
    for record in all_players:
        # Only players left over from tier 1 that have no usable bbref_id.
        if record.id in matched or usable_bbref(record):
            continue
        key = normalize_name(record.name)
        if key not in by_name:
            continue
        candidates = by_name[key]
        if len(candidates) != 1:
            # Several SbaPlayers share this name, so the match is ambiguous.
            continue
        target = candidates[0]
        matched[record.id] = (
            target.id,
            "tier2_name",
            f"Automatic match via exact name to {target.full_name}",
        )
        tier2_count += 1
    logger.info(f"Tier 2 matches: {tier2_count} player records")

    logger.info(f"Total automatic matches: {tier1_count + tier2_count}")
    return matched
def process_manual_decisions(
    decisions: List[ManualDecision], sbaplayers: List[SbaPlayerRecord]
) -> Tuple[Dict[str, int], Dict[str, List[str]], Dict[str, str]]:
    """Split the manual decisions into existing matches and new-player groups.

    Args:
        decisions: Reviewed rows; each needs ``player_name``,
            ``your_decision_sbaplayer_id`` and ``your_decision_notes``.
        sbaplayers: Existing SbaPlayer records, used to validate chosen ids.

    Returns:
        A 3-tuple ``(existing_matches, new_player_groups, special_cases)``:

        * existing_matches: player name -> id of the chosen existing SbaPlayer
        * new_player_groups: canonical name -> player names grouped under it
        * special_cases: player name -> consolidation marker

        A decision whose id is not an integer, or is not the id of a known
        SbaPlayer, is logged as a warning and appears in none of the three.
    """
    sbaplayer_map = {sp.id: sp for sp in sbaplayers}

    existing_matches: Dict[str, int] = {}
    new_player_groups: Dict[str, List[str]] = {}
    special_cases: Dict[str, str] = {}

    for decision in decisions:
        player_name = decision.player_name
        # A cell holding only whitespace is a blank cell, and a missing cell
        # may arrive as None; both mean "no existing SbaPlayer chosen".
        chosen_id = (decision.your_decision_sbaplayer_id or "").strip()
        notes = decision.your_decision_notes or ""

        if chosen_id:
            # Reviewer chose an existing SbaPlayer
            try:
                sbaplayer_id = int(chosen_id)
            except ValueError:
                logger.warning(f"Invalid SbaPlayer ID format: {decision.your_decision_sbaplayer_id}")
                continue
            if sbaplayer_id in sbaplayer_map:
                existing_matches[player_name] = sbaplayer_id
                logger.info(f"Manual match: '{player_name}' -> SbaPlayer ID {sbaplayer_id} ({sbaplayer_map[sbaplayer_id].full_name})")
            else:
                logger.warning(f"Invalid SbaPlayer ID {sbaplayer_id} for {player_name}")
            continue

        # Reviewer decided a new SbaPlayer record is needed
        canonical_name = player_name  # Default to same name

        # Consolidation hints written by the reviewer in the notes column
        if "Three-way match" in notes:
            if "use name Tom Eshelman" in notes:
                canonical_name = "Tom Eshelman"
        elif "Two-way match" in notes:
            if "join with bbref_id mejiafr01" in notes:
                # This Francisco Mejia (HALP) should consolidate with the legitimate one
                canonical_name = "Francisco Mejia"
                special_cases[player_name] = "consolidate_with_mejiafr01"

        # Group player names by canonical name
        new_player_groups.setdefault(canonical_name, []).append(player_name)

    logger.info("Manual decisions processed:")
    logger.info(f" Existing matches: {len(existing_matches)}")
    logger.info(f" New player groups: {len(new_player_groups)}")
    return existing_matches, new_player_groups, special_cases
def create_new_sbaplayer_records(new_player_groups: Dict[str, List[str]], all_players: List[PlayerRecord],
                                 sbaplayers: List[SbaPlayerRecord]) -> List[NewSbaPlayer]:
    """Create new SbaPlayer records for unmatched player groups.

    Args:
        new_player_groups: canonical name -> player names grouped under it.
        all_players: Every player record; those whose name is in a group
            are counted for that group and searched for a bbref_id.
        sbaplayers: Existing SbaPlayer records; only their ids are used, to
            pick the first temporary id.

    Returns:
        One NewSbaPlayer per group that has at least one player record.
        Groups without any player record are logged and skipped.
    """
    new_sbaplayers = []

    # Start well above the highest existing id to avoid conflicts. The
    # default keeps this working when there are no existing SbaPlayers yet.
    next_id = max((sp.id for sp in sbaplayers), default=0) + 1000

    for canonical_name, player_names in new_player_groups.items():
        # Find all player records for this group
        all_variants = set(player_names)
        group_players = [p for p in all_players if p.name in all_variants]
        if not group_players:
            logger.warning(f"No player records found for group: {canonical_name}")
            continue

        # Take the bbref_id of the first player that has a usable one;
        # "HALP" is corrupted data and is skipped.
        bbref_id = next(
            (p.bbref_id for p in group_players if p.bbref_id and p.bbref_id != "HALP"),
            None,
        )

        # Split the canonical name: first word is the first name, the rest
        # is the last name; a single-word name has an empty last name.
        name_parts = canonical_name.split()
        if len(name_parts) >= 2:
            first_name = name_parts[0]
            last_name = ' '.join(name_parts[1:])
        else:
            first_name = canonical_name
            last_name = ""

        new_sbaplayers.append(NewSbaPlayer(
            temp_id=next_id,
            canonical_name=canonical_name,
            key_bbref=bbref_id,
            first_name=first_name,
            last_name=last_name,
            # Copy so the record does not alias the caller's list
            name_variations=list(player_names),
            player_count=len(group_players),
            notes=f"Created from manual review. Variations: {', '.join(sorted(all_variants))}",
        ))
        logger.info(f"New SbaPlayer: '{canonical_name}' (temp ID {next_id}) for {len(group_players)} player records")
        next_id += 1

    return new_sbaplayers
def generate_all_assignments(all_players: List[PlayerRecord], sbaplayers: List[SbaPlayerRecord],
                             auto_assignments: Dict, existing_matches: Dict[str, int],
                             new_sbaplayers: List[NewSbaPlayer]) -> List[FinalAssignment]:
    """Generate one assignment for every player record.

    For each player the first rule that applies decides the assignment:

    1. an automatic match stored under the player's id in ``auto_assignments``
    2. a manual match to an existing SbaPlayer, looked up by player name
    3. a manual assignment to a new SbaPlayer, looked up by player name
    4. otherwise an "emergency" assignment, which is logged as an error

    Args:
        all_players: Every player record to assign.
        sbaplayers: Existing SbaPlayer records (used for names in notes).
        auto_assignments: player id -> (sbaplayer_id, source, notes).
        existing_matches: player name -> existing SbaPlayer id.
        new_sbaplayers: New SbaPlayer records with their name variations.

    Returns:
        A list with one FinalAssignment per entry of ``all_players``, in the
        same order.
    """
    # Lookup maps, built once so the per-player work is constant time
    sbaplayer_map = {sp.id: sp for sp in sbaplayers}
    new_sbaplayer_map = {}   # name variant -> temp_id (last record wins)
    canonical_by_temp_id = {}  # temp_id -> canonical name (first record wins)
    for new_sba in new_sbaplayers:
        canonical_by_temp_id.setdefault(new_sba.temp_id, new_sba.canonical_name)
        for name_variant in new_sba.name_variations:
            new_sbaplayer_map[name_variant] = new_sba.temp_id

    assignments = []
    for player in all_players:
        if player.id in auto_assignments:
            # 1. Already assigned via automatic matching
            sbaplayer_id, source, notes = auto_assignments[player.id]
        elif player.name in existing_matches:
            # 2. Manually assigned to an existing SbaPlayer
            sbaplayer_id = existing_matches[player.name]
            sba_name = sbaplayer_map[sbaplayer_id].full_name if sbaplayer_id in sbaplayer_map else "Unknown"
            source = "manual_existing"
            notes = f"Manual match to existing SbaPlayer: {sba_name}"
        elif player.name in new_sbaplayer_map:
            # 3. Assigned to a new SbaPlayer
            sbaplayer_id = new_sbaplayer_map[player.name]
            canonical_name = canonical_by_temp_id.get(sbaplayer_id, player.name)
            source = "manual_new"
            notes = f"Manual assignment to new SbaPlayer: {canonical_name}"
        else:
            # 4. This shouldn't happen - every player should be assigned by now
            logger.error(f"UNASSIGNED PLAYER: {player.name} (ID: {player.id}, Season: {player.season})")
            # Use a very high id, derived from the player id, to avoid conflicts
            sbaplayer_id = 9999000 + player.id
            source = "emergency_new"
            notes = "EMERGENCY: Unassigned player, needs new SbaPlayer record"

        assignments.append(FinalAssignment(
            player_id=player.id,
            player_name=player.name,
            season=player.season,
            bbref_id=player.bbref_id,
            assigned_sbaplayer_id=sbaplayer_id,
            assignment_source=source,
            notes=notes,
        ))

    logger.info(f"Generated {len(assignments)} total player assignments")

    # Sanity check that nobody was left out
    if len(assignments) != len(all_players):
        logger.error(f"MISMATCH: Expected {len(all_players)} assignments, got {len(assignments)}")
    else:
        logger.info("✅ All players successfully assigned!")

    return assignments
def save_assignment_files(assignments: List[FinalAssignment], new_sbaplayers: List[NewSbaPlayer]):
    """Write the assignments and the new SbaPlayer records to CSV files.

    Two files are (over)written in the current working directory:

    * ``player_sbaplayer_assignments.csv`` - one row per assignment
    * ``new_sbaplayers_to_insert.csv`` - one row per new SbaPlayer, with the
      name variations joined into a single comma-separated cell

    Both files are written as UTF-8 so that non-ASCII player names are
    stored the same way on every platform (the review CSV is read as UTF-8
    as well).

    Raises:
        OSError: If either file cannot be written.
    """
    # Save player assignments
    logger.info("Saving player_sbaplayer_assignments.csv...")
    with open('player_sbaplayer_assignments.csv', 'w', newline='', encoding='utf-8') as f:
        fieldnames = ['player_id', 'player_name', 'season', 'bbref_id', 'assigned_sbaplayer_id', 'assignment_source', 'notes']
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(asdict(assignment) for assignment in assignments)
    logger.info(f"Saved {len(assignments)} player assignments")

    # Save new SbaPlayers to create
    logger.info("Saving new_sbaplayers_to_insert.csv...")
    with open('new_sbaplayers_to_insert.csv', 'w', newline='', encoding='utf-8') as f:
        fieldnames = ['temp_id', 'canonical_name', 'first_name', 'last_name', 'key_bbref', 'name_variations', 'player_count', 'notes']
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        for new_sba in new_sbaplayers:
            row = asdict(new_sba)
            row['name_variations'] = ', '.join(row['name_variations'])  # Convert list to string
            writer.writerow(row)
    logger.info(f"Saved {len(new_sbaplayers)} new SbaPlayer records")
def generate_summary_report(assignments: List[FinalAssignment], new_sbaplayers: List[NewSbaPlayer]):
    """Write a plain-text summary to ``final_assignment_summary.txt``.

    The report lists the number of assignments per source, the new
    SbaPlayer records to create, and the follow-up steps. The file is
    written as UTF-8 because it contains player names.

    Raises:
        OSError: If the report file cannot be written.
    """
    # Count assignments by source
    source_counts = defaultdict(int)
    for assignment in assignments:
        source_counts[assignment.assignment_source] += 1

    # Players linked to an SbaPlayer that already exists
    existing_sbaplayers = sum(
        source_counts[source] for source in ('tier1_bbref', 'tier2_name', 'manual_existing')
    )

    lines = [
        "FINAL PLAYER-TO-SBAPLAYER ASSIGNMENT SUMMARY\n",
        "=" * 55 + "\n\n",
        "ASSIGNMENT BREAKDOWN:\n",
        f" Tier 1 (bbref_id): {source_counts['tier1_bbref']:,} players\n",
        f" Tier 2 (exact name): {source_counts['tier2_name']:,} players\n",
        f" Manual existing: {source_counts['manual_existing']:,} players\n",
        f" Manual new: {source_counts['manual_new']:,} players\n",
    ]
    # The emergency line only appears when something went wrong
    if source_counts['emergency_new'] > 0:
        lines.append(f" Emergency new: {source_counts['emergency_new']:,} players (ERROR)\n")
    lines.append(f" TOTAL ASSIGNMENTS: {len(assignments):,} players\n\n")

    lines.append("NEW SBAPLAYER RECORDS TO CREATE:\n")
    lines.append(f" Total new records: {len(new_sbaplayers)}\n")
    lines.extend(
        f" {new_sba.canonical_name} (temp ID {new_sba.temp_id}) - {new_sba.player_count} players\n"
        for new_sba in new_sbaplayers
    )
    lines.append("\n")

    lines.extend([
        "CAREER STAT TRACKING ENABLED FOR:\n",
        f" {existing_sbaplayers:,} players linked to existing SbaPlayers\n",
        f" {source_counts['manual_new']:,} players will have career stats after new SbaPlayer creation\n",
        f" TOTAL: {len(assignments):,} players will have career stat tracking\n\n",
        "FILES GENERATED:\n",
        " - player_sbaplayer_assignments.csv (ready for API updates)\n",
        " - new_sbaplayers_to_insert.csv (new SbaPlayer records to create first)\n\n",
        "NEXT STEPS:\n",
        "1. Review the assignment files for any issues\n",
        "2. Create new SbaPlayer records via API (new_sbaplayers_to_insert.csv)\n",
        "3. Update all player.sbaplayer_id fields via API (player_sbaplayer_assignments.csv)\n",
        "4. Verify career stat tracking works correctly\n",
    ])

    logger.info("Generating final summary report...")
    with open('final_assignment_summary.txt', 'w', encoding='utf-8') as f:
        f.writelines(lines)
def main():
    """Run the whole pipeline: load, match, assign, save and summarize.

    Any exception is logged and then re-raised to the caller.
    """
    logger.info("Starting final assignment generation (FIXED VERSION)...")
    try:
        # Cached input data
        all_players, sbaplayers = load_cached_data()
        logger.info(f"Loaded {len(all_players)} players and {len(sbaplayers)} SbaPlayers")

        # Automatic (tier 1 / tier 2) matches
        logger.info("Running comprehensive matching...")
        auto_assignments = run_comprehensive_matching(all_players, sbaplayers)

        # Reviewer decisions; the special cases are not used any further here
        existing_matches, new_player_groups, special_cases = process_manual_decisions(
            parse_manual_decisions(), sbaplayers
        )

        # Records that have to be created for the unmatched groups
        new_sbaplayers = create_new_sbaplayer_records(new_player_groups, all_players, sbaplayers)

        # One assignment per player record
        assignments = generate_all_assignments(
            all_players, sbaplayers, auto_assignments, existing_matches, new_sbaplayers
        )

        # Output files and the text report
        save_assignment_files(assignments, new_sbaplayers)
        generate_summary_report(assignments, new_sbaplayers)
        logger.info("✅ Final assignment generation completed successfully!")

        # Console summary
        tally = defaultdict(int)
        for item in assignments:
            tally[item.assignment_source] += 1

        summary = [
            f"\n🎉 FINAL RESULTS:",
            f" 📊 {len(assignments):,} total player assignments generated",
            f" 🔗 {tally['tier1_bbref']:,} Tier 1 (bbref_id) matches",
            f" 📝 {tally['tier2_name']:,} Tier 2 (exact name) matches",
            f" 👤 {tally['manual_existing']:,} Manual existing matches",
            f" {tally['manual_new']:,} Manual new SbaPlayer assignments",
        ]
        if tally['emergency_new'] > 0:
            summary.append(f" 🚨 {tally['emergency_new']:,} Emergency assignments (ERROR)")
        summary += [
            f" 📋 {len(new_sbaplayers)} new SbaPlayer records to create",
            f"\n📁 Files generated:",
            f" - player_sbaplayer_assignments.csv",
            f" - new_sbaplayers_to_insert.csv",
            f" - final_assignment_summary.txt",
        ]
        for text in summary:
            print(text)

        # Final check: one assignment per loaded player
        expected_total = len(all_players)
        if len(assignments) != expected_total:
            print(f"\n❌ ERROR: Expected {expected_total:,}, got {len(assignments):,} assignments")
        else:
            print(f"\n✅ SUCCESS: All {expected_total:,} players successfully assigned!")
    except Exception as e:
        logger.error(f"Error generating final assignments: {e}")
        raise


if __name__ == "__main__":
    main()