major-domo-database/.claude/sqlite-to-postgres/player-to-sbaplayer-matching/generate_final_assignments.py
Cal Corum 7130a1fd43 Postgres Migration
Migration documentation and scripts
2025-08-25 07:18:31 -05:00

447 lines
19 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Generate final player-to-SbaPlayer assignments by combining:
1. Automatic matches from comprehensive matching (Tier 1 & 2)
2. Manual decisions from review CSV
3. New SbaPlayer records for unmatched players
"""
import json
import csv
import logging
import re
from dataclasses import dataclass, asdict
from typing import Dict, List, Set, Optional, Tuple
from collections import defaultdict
# Import functions from comprehensive matching
from comprehensive_player_matching import (
PlayerRecord, SbaPlayerRecord, MatchResult, normalize_name,
create_name_variants, load_cached_data, create_matching_maps,
match_players_tier1_bbref, match_players_tier2_name, find_unmatched_players
)
# Set up logging
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
handlers=[
logging.FileHandler('matching.log'),
logging.StreamHandler()
]
)
logger = logging.getLogger(f'{__name__}.generate_final_assignments')
@dataclass
class ManualDecision:
"""Manual decision from review CSV"""
group_key: str
player_name: str
bbref_id: str
seasons_appeared: str
sample_player_ids: str
potential_existing_sbaplayer_id: str
potential_existing_sbaplayer_name: str
potential_match_reason: str
your_decision_sbaplayer_id: str
your_decision_notes: str
@dataclass
class FinalAssignment:
"""Final player assignment result"""
player_id: int
player_name: str
season: int
bbref_id: Optional[str]
assigned_sbaplayer_id: int
assignment_source: str # 'tier1_bbref', 'tier2_name', 'manual_existing', 'manual_new'
notes: str = ""
@dataclass
class NewSbaPlayer:
"""New SbaPlayer record to create"""
temp_id: int # Temporary ID for assignments
canonical_name: str
key_bbref: Optional[str]
first_name: str
last_name: str
name_variations: List[str]
player_count: int
notes: str
def parse_manual_decisions():
"""Parse manual decisions from the review CSV"""
decisions = []
logger.info("Parsing manual decisions from new_sbaplayers_for_review.csv...")
with open('new_sbaplayers_for_review.csv', 'r', encoding='utf-8') as f:
# Handle the encoded format
content = f.read()
# Decode common HTML entities
content = content.replace('+AF8-', '_')
content = content.replace('+ACI-', '"')
content = content.replace('+AC0-', '-')
# Parse as CSV
lines = content.strip().split('\n')
reader = csv.DictReader(lines)
for row in reader:
if not row.get('group_key'): # Skip empty rows
continue
decision = ManualDecision(
group_key=row['group_key'],
player_name=row['player_name'],
bbref_id=row['bbref_id'],
seasons_appeared=row['seasons_appeared'],
sample_player_ids=row['sample_player_ids'],
potential_existing_sbaplayer_id=row['potential_existing_sbaplayer_id'],
potential_existing_sbaplayer_name=row['potential_existing_sbaplayer_name'],
potential_match_reason=row['potential_match_reason'],
your_decision_sbaplayer_id=row['your_decision_sbaplayer_id'],
your_decision_notes=row['your_decision_notes']
)
decisions.append(decision)
logger.info(f"Parsed {len(decisions)} manual decisions")
return decisions
def process_manual_decisions(decisions: List[ManualDecision], sbaplayers: List[SbaPlayerRecord]) -> Tuple[Dict, Dict]:
"""Process manual decisions into existing matches and new players needed"""
# Create SbaPlayer lookup
sbaplayer_map = {sp.id: sp for sp in sbaplayers}
# Process decisions
existing_matches = {} # player_name -> sbaplayer_id
new_player_groups = {} # canonical_name -> [player_names]
special_cases = {} # Handle consolidations and corrupted data
for decision in decisions:
player_name = decision.player_name
if decision.your_decision_sbaplayer_id:
# User chose an existing SbaPlayer
try:
sbaplayer_id = int(decision.your_decision_sbaplayer_id)
if sbaplayer_id in sbaplayer_map:
existing_matches[player_name] = sbaplayer_id
logger.info(f"Manual match: '{player_name}' -> SbaPlayer ID {sbaplayer_id} ({sbaplayer_map[sbaplayer_id].full_name})")
else:
logger.warning(f"Invalid SbaPlayer ID {sbaplayer_id} for {player_name}")
except ValueError:
logger.warning(f"Invalid SbaPlayer ID format: {decision.your_decision_sbaplayer_id}")
else:
# User decided this needs a new SbaPlayer record
canonical_name = player_name # Default to same name
# Check for special consolidation cases
if "Three-way match" in decision.your_decision_notes:
if "use name Tom Eshelman" in decision.your_decision_notes:
canonical_name = "Tom Eshelman"
elif "Two-way match" in decision.your_decision_notes:
if "join with bbref_id mejiafr01" in decision.your_decision_notes:
# This Francisco Mejia (HALP) should consolidate with the legitimate one
canonical_name = "Francisco Mejia" # Will be consolidated later
special_cases[player_name] = "consolidate_with_mejiafr01"
# Group players by canonical name
if canonical_name not in new_player_groups:
new_player_groups[canonical_name] = []
new_player_groups[canonical_name].append(player_name)
logger.info(f"Manual decisions processed:")
logger.info(f" Existing matches: {len(existing_matches)}")
logger.info(f" New player groups: {len(new_player_groups)}")
return existing_matches, new_player_groups, special_cases
def create_new_sbaplayer_records(new_player_groups: Dict[str, List[str]], all_players: List[PlayerRecord],
sbaplayers: List[SbaPlayerRecord]) -> List[NewSbaPlayer]:
"""Create new SbaPlayer records for unmatched player groups"""
new_sbaplayers = []
next_id = max([sp.id for sp in sbaplayers]) + 1000 # Start from high ID to avoid conflicts
for canonical_name, player_names in new_player_groups.items():
# Find all player records for this group
all_variants = set(player_names)
group_players = [p for p in all_players if p.name in all_variants]
if not group_players:
logger.warning(f"No player records found for group: {canonical_name}")
continue
# Get bbref_id from any player that has one
bbref_id = None
for player in group_players:
if player.bbref_id and player.bbref_id != "HALP": # Skip corrupted data
bbref_id = player.bbref_id
break
# Parse canonical name for first/last
name_parts = canonical_name.split()
if len(name_parts) >= 2:
first_name = name_parts[0]
last_name = ' '.join(name_parts[1:])
else:
first_name = canonical_name
last_name = ""
# Create new SbaPlayer record
new_sba = NewSbaPlayer(
temp_id=next_id,
canonical_name=canonical_name,
key_bbref=bbref_id,
first_name=first_name,
last_name=last_name,
name_variations=player_names,
player_count=len(group_players),
notes=f"Created from manual review. Variations: {', '.join(sorted(set(player_names)))}"
)
new_sbaplayers.append(new_sba)
logger.info(f"New SbaPlayer: '{canonical_name}' (temp ID {next_id}) for {len(group_players)} player records")
next_id += 1
return new_sbaplayers
def generate_all_assignments(all_players: List[PlayerRecord], sbaplayers: List[SbaPlayerRecord],
tier1_matches: List[MatchResult], tier2_matches: List[MatchResult],
existing_matches: Dict[str, int], new_sbaplayers: List[NewSbaPlayer]) -> List[FinalAssignment]:
"""Generate complete assignments for all 12,232+ player records"""
assignments = []
# Create lookup maps
sbaplayer_map = {sp.id: sp for sp in sbaplayers}
new_sbaplayer_map = {} # name -> temp_id
for new_sba in new_sbaplayers:
for name_variant in new_sba.name_variations:
new_sbaplayer_map[name_variant] = new_sba.temp_id
# Track assigned player IDs
assigned_player_ids = set()
# 1. Process Tier 1 matches (bbref_id)
logger.info("Processing Tier 1 (bbref_id) assignments...")
for match in tier1_matches:
# Find the player record to get season info
player_record = next((p for p in all_players if p.id == match.player_id), None)
season = player_record.season if player_record else 0
assignment = FinalAssignment(
player_id=match.player_id,
player_name=match.player_name,
season=season,
bbref_id=match.player_bbref_id,
assigned_sbaplayer_id=match.sbaplayer_id,
assignment_source="tier1_bbref",
notes=f"Automatic match via bbref_id to {match.sbaplayer_name}"
)
assignments.append(assignment)
assigned_player_ids.add(match.player_id)
# 2. Process Tier 2 matches (exact name)
logger.info("Processing Tier 2 (exact name) assignments...")
for match in tier2_matches:
# Find the player record to get season info
player_record = next((p for p in all_players if p.id == match.player_id), None)
season = player_record.season if player_record else 0
assignment = FinalAssignment(
player_id=match.player_id,
player_name=match.player_name,
season=season,
bbref_id=match.player_bbref_id,
assigned_sbaplayer_id=match.sbaplayer_id,
assignment_source="tier2_name",
notes=f"Automatic match via exact name to {match.sbaplayer_name}"
)
assignments.append(assignment)
assigned_player_ids.add(match.player_id)
# 3. Process manual existing matches
logger.info("Processing manual existing matches...")
for player_name, sbaplayer_id in existing_matches.items():
matching_players = [p for p in all_players if p.name == player_name and p.id not in assigned_player_ids]
sba_name = sbaplayer_map[sbaplayer_id].full_name if sbaplayer_id in sbaplayer_map else "Unknown"
for player in matching_players:
assignment = FinalAssignment(
player_id=player.id,
player_name=player.name,
season=player.season,
bbref_id=player.bbref_id,
assigned_sbaplayer_id=sbaplayer_id,
assignment_source="manual_existing",
notes=f"Manual match to existing SbaPlayer: {sba_name}"
)
assignments.append(assignment)
assigned_player_ids.add(player.id)
# 4. Process new SbaPlayer assignments
logger.info("Processing new SbaPlayer assignments...")
for new_sba in new_sbaplayers:
for name_variant in new_sba.name_variations:
matching_players = [p for p in all_players if p.name == name_variant and p.id not in assigned_player_ids]
for player in matching_players:
assignment = FinalAssignment(
player_id=player.id,
player_name=player.name,
season=player.season,
bbref_id=player.bbref_id,
assigned_sbaplayer_id=new_sba.temp_id,
assignment_source="manual_new",
notes=f"Manual assignment to new SbaPlayer: {new_sba.canonical_name}"
)
assignments.append(assignment)
assigned_player_ids.add(player.id)
# 5. Check for any remaining unassigned players
unassigned_players = [p for p in all_players if p.id not in assigned_player_ids]
if unassigned_players:
logger.warning(f"Found {len(unassigned_players)} unassigned players! This shouldn't happen.")
for player in unassigned_players[:5]: # Show first 5
logger.warning(f" Unassigned: {player.name} (ID: {player.id}, Season: {player.season})")
logger.info(f"Generated {len(assignments)} total player assignments")
return assignments
def save_assignment_files(assignments: List[FinalAssignment], new_sbaplayers: List[NewSbaPlayer]):
"""Save the final assignment files"""
# Save player assignments
logger.info("Saving player_sbaplayer_assignments.csv...")
with open('player_sbaplayer_assignments.csv', 'w', newline='') as f:
fieldnames = ['player_id', 'player_name', 'season', 'bbref_id', 'assigned_sbaplayer_id', 'assignment_source', 'notes']
writer = csv.DictWriter(f, fieldnames=fieldnames)
writer.writeheader()
for assignment in assignments:
writer.writerow(asdict(assignment))
logger.info(f"Saved {len(assignments)} player assignments")
# Save new SbaPlayers to create
logger.info("Saving new_sbaplayers_to_insert.csv...")
with open('new_sbaplayers_to_insert.csv', 'w', newline='') as f:
fieldnames = ['temp_id', 'canonical_name', 'first_name', 'last_name', 'key_bbref', 'name_variations', 'player_count', 'notes']
writer = csv.DictWriter(f, fieldnames=fieldnames)
writer.writeheader()
for new_sba in new_sbaplayers:
row = asdict(new_sba)
row['name_variations'] = ', '.join(row['name_variations']) # Convert list to string
writer.writerow(row)
logger.info(f"Saved {len(new_sbaplayers)} new SbaPlayer records")
def generate_summary_report(assignments: List[FinalAssignment], new_sbaplayers: List[NewSbaPlayer]):
"""Generate final summary report"""
# Count assignments by source
source_counts = defaultdict(int)
for assignment in assignments:
source_counts[assignment.assignment_source] += 1
# Count assignments by SbaPlayer ID
sbaplayer_counts = defaultdict(int)
for assignment in assignments:
sbaplayer_counts[assignment.assigned_sbaplayer_id] += 1
logger.info("Generating final summary report...")
with open('final_assignment_summary.txt', 'w') as f:
f.write("FINAL PLAYER-TO-SBAPLAYER ASSIGNMENT SUMMARY\n")
f.write("=" * 55 + "\n\n")
f.write("ASSIGNMENT BREAKDOWN:\n")
f.write(f" Tier 1 (bbref_id): {source_counts['tier1_bbref']:,} players\n")
f.write(f" Tier 2 (exact name): {source_counts['tier2_name']:,} players\n")
f.write(f" Manual existing: {source_counts['manual_existing']:,} players\n")
f.write(f" Manual new: {source_counts['manual_new']:,} players\n")
f.write(f" TOTAL ASSIGNMENTS: {len(assignments):,} players\n\n")
f.write("NEW SBAPLAYER RECORDS TO CREATE:\n")
f.write(f" Total new records: {len(new_sbaplayers)}\n")
for new_sba in new_sbaplayers:
f.write(f" {new_sba.canonical_name} (temp ID {new_sba.temp_id}) - {new_sba.player_count} players\n")
f.write("\n")
f.write("CAREER STAT TRACKING ENABLED FOR:\n")
existing_sbaplayers = len([a for a in assignments if a.assignment_source in ['tier1_bbref', 'tier2_name', 'manual_existing']])
f.write(f" {existing_sbaplayers:,} players linked to existing SbaPlayers\n")
f.write(f" {source_counts['manual_new']:,} players will have career stats after new SbaPlayer creation\n")
f.write(f" TOTAL: {len(assignments):,} players will have career stat tracking\n\n")
f.write("FILES GENERATED:\n")
f.write(" - player_sbaplayer_assignments.csv (ready for API updates)\n")
f.write(" - new_sbaplayers_to_insert.csv (new SbaPlayer records to create first)\n\n")
f.write("NEXT STEPS:\n")
f.write("1. Review the assignment files for any issues\n")
f.write("2. Create new SbaPlayer records via API (new_sbaplayers_to_insert.csv)\n")
f.write("3. Update all player.sbaplayer_id fields via API (player_sbaplayer_assignments.csv)\n")
f.write("4. Verify career stat tracking works correctly\n")
def main():
"""Main processing function"""
logger.info("Starting final assignment generation...")
try:
# Load cached data
all_players, sbaplayers = load_cached_data()
# Run comprehensive matching for automatic matches
logger.info("Running comprehensive matching for Tier 1 & 2...")
bbref_map, name_map = create_matching_maps(sbaplayers)
tier1_matches = match_players_tier1_bbref(all_players, bbref_map)
tier2_matches = match_players_tier2_name(all_players, name_map, tier1_matches)
# Parse manual decisions
decisions = parse_manual_decisions()
existing_matches, new_player_groups, special_cases = process_manual_decisions(decisions, sbaplayers)
# Create new SbaPlayer records
new_sbaplayers = create_new_sbaplayer_records(new_player_groups, all_players, sbaplayers)
# Generate complete assignments
assignments = generate_all_assignments(
all_players, sbaplayers, tier1_matches, tier2_matches,
existing_matches, new_sbaplayers
)
# Save files
save_assignment_files(assignments, new_sbaplayers)
# Generate summary
generate_summary_report(assignments, new_sbaplayers)
logger.info("✅ Final assignment generation completed successfully!")
# Print summary
source_counts = defaultdict(int)
for assignment in assignments:
source_counts[assignment.assignment_source] += 1
print(f"\n🎉 FINAL RESULTS:")
print(f" 📊 {len(assignments):,} total player assignments generated")
print(f" 🔗 {source_counts['tier1_bbref']:,} Tier 1 (bbref_id) matches")
print(f" 📝 {source_counts['tier2_name']:,} Tier 2 (exact name) matches")
print(f" 👤 {source_counts['manual_existing']:,} Manual existing matches")
print(f" {source_counts['manual_new']:,} Manual new SbaPlayer assignments")
print(f" 📋 {len(new_sbaplayers)} new SbaPlayer records to create")
print(f"\n📁 Files generated:")
print(f" - player_sbaplayer_assignments.csv")
print(f" - new_sbaplayers_to_insert.csv")
print(f" - final_assignment_summary.txt")
except Exception as e:
logger.error(f"Error generating final assignments: {e}")
raise
if __name__ == "__main__":
main()