492 lines
21 KiB
Python
492 lines
21 KiB
Python
#!/usr/bin/env python3
"""
Generate final player-to-SbaPlayer assignments by combining:
1. Automatic matches from comprehensive matching (Tier 1 & 2)
2. Manual decisions from review CSV
3. New SbaPlayer records for unmatched players

FIXED VERSION: Properly handles all 12,232 individual player records
"""
|
||
|
||
import csv
import io
import json
import logging
import re
from collections import defaultdict
from dataclasses import dataclass, asdict
from typing import Dict, List, Set, Optional, Tuple

# Import functions from comprehensive matching
from comprehensive_player_matching import (
    PlayerRecord, SbaPlayerRecord, normalize_name,
    create_name_variants, load_cached_data, create_matching_maps
)
|
||
|
||
# Set up logging.
# Records are written both to matching.log and to the console.  The file
# handler is opened as UTF-8 explicitly because some messages logged by this
# script contain non-ASCII characters (e.g. a check-mark symbol), which a
# platform default encoding such as cp1252 cannot represent.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('matching.log', encoding='utf-8'),
        logging.StreamHandler(),
    ],
)
logger = logging.getLogger(f'{__name__}.generate_final_assignments_v2')
|
||
|
||
@dataclass
class ManualDecision:
    """A single reviewed row from the manual-review CSV.

    Every attribute holds the raw text of the CSV column with the same name;
    no type conversion is done at this level.
    """

    # --- columns describing the player group under review ---
    group_key: str
    player_name: str
    bbref_id: str
    seasons_appeared: str
    sample_player_ids: str

    # --- suggested existing SbaPlayer, as presented to the reviewer ---
    potential_existing_sbaplayer_id: str
    potential_existing_sbaplayer_name: str
    potential_match_reason: str

    # --- reviewer input ---
    # Non-empty: id of the existing SbaPlayer to link to.
    # Empty: a new SbaPlayer record is needed for this player.
    your_decision_sbaplayer_id: str
    # Free text; scanned for consolidation phrases when decisions are processed.
    your_decision_notes: str
|
||
|
||
@dataclass
class FinalAssignment:
    """The resolved SbaPlayer link for one player record.

    One instance becomes one row of player_sbaplayer_assignments.csv.
    """

    # Identity of the player record being assigned.
    player_id: int
    player_name: str
    season: int
    bbref_id: Optional[str]

    # Target SbaPlayer: an existing id, or a temporary id for a record that
    # still has to be created.
    assigned_sbaplayer_id: int

    # How the link was decided: 'tier1_bbref', 'tier2_name',
    # 'manual_existing' or 'manual_new'.  generate_all_assignments also emits
    # 'emergency_new' for records that no other rule covered.
    assignment_source: str

    # Human-readable explanation of the assignment.
    notes: str = ""
|
||
|
||
@dataclass
class NewSbaPlayer:
    """Description of an SbaPlayer record that still has to be created.

    One instance becomes one row of new_sbaplayers_to_insert.csv.
    """

    # Placeholder id used by assignments until the real record exists.
    temp_id: int

    # Name the new record is created under, plus its first/last split.
    canonical_name: str
    # bbref id taken from the grouped player records, or None if none had one.
    key_bbref: Optional[str]
    first_name: str
    last_name: str

    # Player names that are grouped under this record.
    name_variations: List[str]
    # Number of player records found for those names.
    player_count: int

    # Free-text provenance note.
    notes: str
|
||
|
||
def parse_manual_decisions():
    """Parse manual decisions from the review CSV.

    Reads ``new_sbaplayers_for_review.csv`` from the current working
    directory, undoes the escape sequences the export contains, and converts
    every row that has a non-empty ``group_key`` into a ManualDecision.

    Returns:
        List of ManualDecision objects in file order.  Fields that are absent
        from a short row are returned as empty strings, never ``None``.

    Raises:
        OSError: if the CSV file cannot be opened.
        KeyError: if an expected column is missing from the CSV header.
    """
    logger.info("Parsing manual decisions from new_sbaplayers_for_review.csv...")

    # 'utf-8-sig' also reads plain UTF-8, but strips a leading byte-order
    # mark; otherwise the BOM ends up in the first header name and no row
    # would ever have a 'group_key'.
    with open('new_sbaplayers_for_review.csv', 'r', encoding='utf-8-sig') as f:
        content = f.read()

    # The file contains UTF-7 style escape sequences for a few punctuation
    # characters; turn them back into the plain characters.
    for escaped, plain in (('+AF8-', '_'), ('+ACI-', '"'), ('+AC0-', '-')):
        content = content.replace(escaped, plain)

    # Feed the text to the csv module as a stream rather than as pre-split
    # lines, so that line breaks inside quoted fields are preserved.
    reader = csv.DictReader(io.StringIO(content.strip()))

    decisions = []
    for row in reader:
        if not row.get('group_key'):  # Skip empty rows
            continue

        def column(name, _row=row):
            # DictReader fills columns missing from a short row with None;
            # normalise that to '' so later string checks cannot fail.
            return _row[name] or ''

        decisions.append(ManualDecision(
            group_key=column('group_key'),
            player_name=column('player_name'),
            bbref_id=column('bbref_id'),
            seasons_appeared=column('seasons_appeared'),
            sample_player_ids=column('sample_player_ids'),
            potential_existing_sbaplayer_id=column('potential_existing_sbaplayer_id'),
            potential_existing_sbaplayer_name=column('potential_existing_sbaplayer_name'),
            potential_match_reason=column('potential_match_reason'),
            your_decision_sbaplayer_id=column('your_decision_sbaplayer_id'),
            your_decision_notes=column('your_decision_notes'),
        ))

    logger.info(f"Parsed {len(decisions)} manual decisions")
    return decisions
|
||
|
||
def run_comprehensive_matching(all_players, sbaplayers):
    """Compute the automatic (Tier 1 and Tier 2) assignments.

    Tier 1 links a player through its bbref_id.  Tier 2 links a player that
    has no usable bbref_id through its normalized name, but only when exactly
    one SbaPlayer carries that name.

    Returns:
        Dict mapping player id -> (sbaplayer_id, source, notes).
    """
    logger.info("Running comprehensive matching logic...")

    # Lookup tables built from the existing SbaPlayers
    bbref_lookup, name_lookup = create_matching_maps(sbaplayers)

    def usable_bbref(record):
        # "HALP" marks corrupted data and counts as having no bbref_id.
        candidate = record.bbref_id
        if candidate and candidate != "HALP":
            return candidate
        return None

    # player_id -> (sbaplayer_id, source, notes)
    assignments = {}

    # TIER 1: bbref_id matching
    logger.info("Tier 1: Matching by bbref_id...")
    tier1_count = 0
    for record in all_players:
        key = usable_bbref(record)
        if not key or key not in bbref_lookup:
            continue
        matched = bbref_lookup[key]
        assignments[record.id] = (
            matched.id,
            "tier1_bbref",
            f"Automatic match via bbref_id to {matched.full_name}",
        )
        tier1_count += 1

    logger.info(f"Tier 1 matches: {tier1_count} player records")

    # TIER 2: exact name matching, restricted to records that Tier 1 left
    # untouched and that carry no usable bbref_id
    logger.info("Tier 2: Matching by exact name...")
    tier2_count = 0
    for record in all_players:
        if record.id in assignments or usable_bbref(record):
            continue
        lookup_name = normalize_name(record.name)
        if lookup_name not in name_lookup:
            continue
        candidates = name_lookup[lookup_name]
        if len(candidates) != 1:  # Ambiguous name: leave for manual review
            continue
        matched = candidates[0]
        assignments[record.id] = (
            matched.id,
            "tier2_name",
            f"Automatic match via exact name to {matched.full_name}",
        )
        tier2_count += 1

    logger.info(f"Tier 2 matches: {tier2_count} player records")
    logger.info(f"Total automatic matches: {tier1_count + tier2_count}")

    return assignments
|
||
|
||
def process_manual_decisions(decisions: List[ManualDecision], sbaplayers: List[SbaPlayerRecord]) -> Tuple[Dict, Dict, Dict]:
    """Process manual decisions into existing matches and new players needed.

    Args:
        decisions: Parsed rows of the review CSV.
        sbaplayers: Existing SbaPlayer records; used to validate chosen ids.

    Returns:
        A 3-tuple ``(existing_matches, new_player_groups, special_cases)``:
        * existing_matches: player_name -> id of an existing SbaPlayer
        * new_player_groups: canonical_name -> list of player names that
          should share one new SbaPlayer record
        * special_cases: player_name -> marker for consolidations that need
          follow-up handling

    A decision whose id is not an integer, or does not belong to any known
    SbaPlayer, is logged as a warning and ends up in none of the results.
    An id cell that contains only whitespace is treated as blank.
    """
    # Create SbaPlayer lookup
    sbaplayer_map = {sp.id: sp for sp in sbaplayers}

    existing_matches = {}   # player_name -> sbaplayer_id
    new_player_groups = {}  # canonical_name -> [player_names]
    special_cases = {}      # Handle consolidations and corrupted data

    for decision in decisions:
        player_name = decision.player_name
        # Tolerate missing values (None) and stray whitespace from the CSV.
        chosen_id = (decision.your_decision_sbaplayer_id or "").strip()
        notes = decision.your_decision_notes or ""

        if chosen_id:
            # User chose an existing SbaPlayer
            try:
                sbaplayer_id = int(chosen_id)
            except ValueError:
                logger.warning(f"Invalid SbaPlayer ID format: {decision.your_decision_sbaplayer_id}")
                continue

            if sbaplayer_id in sbaplayer_map:
                existing_matches[player_name] = sbaplayer_id
                logger.info(f"Manual match: '{player_name}' -> SbaPlayer ID {sbaplayer_id} ({sbaplayer_map[sbaplayer_id].full_name})")
            else:
                logger.warning(f"Invalid SbaPlayer ID {sbaplayer_id} for {player_name}")
            continue

        # User decided this needs a new SbaPlayer record
        canonical_name = player_name  # Default to same name

        # Check for special consolidation cases
        if "Three-way match" in notes:
            if "use name Tom Eshelman" in notes:
                canonical_name = "Tom Eshelman"
        elif "Two-way match" in notes:
            if "join with bbref_id mejiafr01" in notes:
                # This Francisco Mejia (HALP) should consolidate with the legitimate one
                canonical_name = "Francisco Mejia"  # Will be consolidated later
                special_cases[player_name] = "consolidate_with_mejiafr01"

        # Group players by canonical name
        new_player_groups.setdefault(canonical_name, []).append(player_name)

    logger.info("Manual decisions processed:")
    logger.info(f" Existing matches: {len(existing_matches)}")
    logger.info(f" New player groups: {len(new_player_groups)}")

    return existing_matches, new_player_groups, special_cases
|
||
|
||
def create_new_sbaplayer_records(new_player_groups: Dict[str, List[str]], all_players: List[PlayerRecord],
                                 sbaplayers: List[SbaPlayerRecord]) -> List[NewSbaPlayer]:
    """Create new SbaPlayer records for unmatched player groups.

    Args:
        new_player_groups: canonical_name -> player names to group under it.
        all_players: Every player record; searched by exact name.
        sbaplayers: Existing SbaPlayers; only their ids are used, to pick a
            starting temporary id.

    Returns:
        One NewSbaPlayer per group that has at least one player record, in
        the iteration order of ``new_player_groups``.  Groups without any
        player record are logged as a warning and skipped.
    """
    new_sbaplayers = []

    # Start well above the highest existing id to avoid conflicts.
    # default=0 keeps this working when there are no SbaPlayers yet.
    next_id = max((sp.id for sp in sbaplayers), default=0) + 1000

    for canonical_name, player_names in new_player_groups.items():
        # Find all player records for this group
        all_variants = set(player_names)
        # NOTE(review): this selects by name only, so records that were
        # already matched automatically are counted too -- confirm intended.
        group_players = [p for p in all_players if p.name in all_variants]

        if not group_players:
            logger.warning(f"No player records found for group: {canonical_name}")
            continue

        # Get bbref_id from the first player that has one ("HALP" marks
        # corrupted data and is skipped)
        bbref_id = next(
            (p.bbref_id for p in group_players if p.bbref_id and p.bbref_id != "HALP"),
            None,
        )

        # Parse canonical name for first/last: the first word is the first
        # name, everything after it is the last name
        name_parts = canonical_name.split()
        if len(name_parts) >= 2:
            first_name = name_parts[0]
            last_name = ' '.join(name_parts[1:])
        else:
            first_name = canonical_name
            last_name = ""

        new_sba = NewSbaPlayer(
            temp_id=next_id,
            canonical_name=canonical_name,
            key_bbref=bbref_id,
            first_name=first_name,
            last_name=last_name,
            # Copy so the record does not share a list with the caller's dict
            name_variations=list(player_names),
            player_count=len(group_players),
            notes=f"Created from manual review. Variations: {', '.join(sorted(all_variants))}"
        )

        new_sbaplayers.append(new_sba)
        logger.info(f"New SbaPlayer: '{canonical_name}' (temp ID {next_id}) for {len(group_players)} player records")
        next_id += 1

    return new_sbaplayers
|
||
|
||
def _make_assignment(player, sbaplayer_id, source, notes):
    """Build the FinalAssignment for *player* with the given target and provenance."""
    return FinalAssignment(
        player_id=player.id,
        player_name=player.name,
        season=player.season,
        bbref_id=player.bbref_id,
        assigned_sbaplayer_id=sbaplayer_id,
        assignment_source=source,
        notes=notes,
    )


def generate_all_assignments(all_players: List[PlayerRecord], sbaplayers: List[SbaPlayerRecord],
                             auto_assignments: Dict, existing_matches: Dict[str, int],
                             new_sbaplayers: List[NewSbaPlayer]) -> List[FinalAssignment]:
    """Generate one assignment for every player record.

    Each player is resolved by the first rule that applies:
      1. an automatic match in ``auto_assignments`` (keyed by player id)
      2. a manual match to an existing SbaPlayer (keyed by player name)
      3. a new SbaPlayer that lists the player's name as a variation
      4. otherwise an 'emergency_new' assignment, logged as an error

    Args:
        all_players: Every player record to assign.
        sbaplayers: Existing SbaPlayers, used for names in the notes.
        auto_assignments: player id -> (sbaplayer_id, source, notes).
        existing_matches: player name -> existing SbaPlayer id.
        new_sbaplayers: New SbaPlayer records with their temporary ids.

    Returns:
        List with exactly one FinalAssignment per entry of ``all_players``,
        in the same order.
    """
    assignments = []

    # Create lookup maps
    sbaplayer_map = {sp.id: sp for sp in sbaplayers}
    new_sbaplayer_map = {}      # name -> temp_id
    new_sba_by_temp_id = {}     # temp_id -> first NewSbaPlayer with that id
    for new_sba in new_sbaplayers:
        new_sba_by_temp_id.setdefault(new_sba.temp_id, new_sba)
        for name_variant in new_sba.name_variations:
            new_sbaplayer_map[name_variant] = new_sba.temp_id

    # Process every single player record
    for player in all_players:
        # 1. Check if already assigned via automatic matching
        if player.id in auto_assignments:
            sbaplayer_id, source, notes = auto_assignments[player.id]
            assignment = _make_assignment(player, sbaplayer_id, source, notes)

        # 2. Check if manually assigned to existing SbaPlayer
        elif player.name in existing_matches:
            sbaplayer_id = existing_matches[player.name]
            sba_name = sbaplayer_map[sbaplayer_id].full_name if sbaplayer_id in sbaplayer_map else "Unknown"
            assignment = _make_assignment(
                player, sbaplayer_id, "manual_existing",
                f"Manual match to existing SbaPlayer: {sba_name}",
            )

        # 3. Check if assigned to new SbaPlayer
        elif player.name in new_sbaplayer_map:
            temp_id = new_sbaplayer_map[player.name]
            new_sba = new_sba_by_temp_id.get(temp_id)
            canonical_name = new_sba.canonical_name if new_sba else player.name
            assignment = _make_assignment(
                player, temp_id, "manual_new",
                f"Manual assignment to new SbaPlayer: {canonical_name}",
            )

        # 4. This shouldn't happen - every player should be assigned by now
        else:
            logger.error(f"UNASSIGNED PLAYER: {player.name} (ID: {player.id}, Season: {player.season})")
            # Create emergency new SbaPlayer id for this orphaned player;
            # a very high offset is used to avoid conflicts
            emergency_id = 9999000 + player.id
            assignment = _make_assignment(
                player, emergency_id, "emergency_new",
                "EMERGENCY: Unassigned player, needs new SbaPlayer record",
            )

        assignments.append(assignment)

    logger.info(f"Generated {len(assignments)} total player assignments")

    # Verify we got everyone
    if len(assignments) != len(all_players):
        logger.error(f"MISMATCH: Expected {len(all_players)} assignments, got {len(assignments)}")
    else:
        logger.info("✅ All players successfully assigned!")

    return assignments
|
||
|
||
def save_assignment_files(assignments: List[FinalAssignment], new_sbaplayers: List[NewSbaPlayer]):
    """Save the final assignment files.

    Writes two CSV files into the current working directory, replacing any
    existing files of the same name:
      * player_sbaplayer_assignments.csv - one row per assignment
      * new_sbaplayers_to_insert.csv - one row per new SbaPlayer, with the
        name variations joined into a single ", "-separated string

    Both files are written as UTF-8 so that names with non-ASCII characters
    are stored the same way on every platform.

    Raises:
        OSError: if a file cannot be opened for writing.
    """
    # Save player assignments
    logger.info("Saving player_sbaplayer_assignments.csv...")
    with open('player_sbaplayer_assignments.csv', 'w', newline='', encoding='utf-8') as f:
        fieldnames = ['player_id', 'player_name', 'season', 'bbref_id', 'assigned_sbaplayer_id', 'assignment_source', 'notes']
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(asdict(assignment) for assignment in assignments)

    logger.info(f"Saved {len(assignments)} player assignments")

    # Save new SbaPlayers to create
    logger.info("Saving new_sbaplayers_to_insert.csv...")
    with open('new_sbaplayers_to_insert.csv', 'w', newline='', encoding='utf-8') as f:
        fieldnames = ['temp_id', 'canonical_name', 'first_name', 'last_name', 'key_bbref', 'name_variations', 'player_count', 'notes']
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()

        for new_sba in new_sbaplayers:
            row = asdict(new_sba)
            row['name_variations'] = ', '.join(row['name_variations'])  # Convert list to string
            writer.writerow(row)

    logger.info(f"Saved {len(new_sbaplayers)} new SbaPlayer records")
|
||
|
||
def generate_summary_report(assignments: List[FinalAssignment], new_sbaplayers: List[NewSbaPlayer]):
    """Generate final summary report.

    Writes final_assignment_summary.txt into the current working directory
    with the number of assignments per source, the list of new SbaPlayer
    records, and the follow-up steps.  The file is written as UTF-8 because
    it contains player names, which may include non-ASCII characters.

    Raises:
        OSError: if the report file cannot be opened for writing.
    """
    # Count assignments by source; sources that never occur read as 0
    source_counts = defaultdict(int)
    for assignment in assignments:
        source_counts[assignment.assignment_source] += 1

    # Records that were linked to an SbaPlayer that already exists
    existing_sbaplayers = (
        source_counts['tier1_bbref']
        + source_counts['tier2_name']
        + source_counts['manual_existing']
    )

    logger.info("Generating final summary report...")
    with open('final_assignment_summary.txt', 'w', encoding='utf-8') as f:
        f.write("FINAL PLAYER-TO-SBAPLAYER ASSIGNMENT SUMMARY\n")
        f.write("=" * 55 + "\n\n")

        f.write("ASSIGNMENT BREAKDOWN:\n")
        f.write(f" Tier 1 (bbref_id): {source_counts['tier1_bbref']:,} players\n")
        f.write(f" Tier 2 (exact name): {source_counts['tier2_name']:,} players\n")
        f.write(f" Manual existing: {source_counts['manual_existing']:,} players\n")
        f.write(f" Manual new: {source_counts['manual_new']:,} players\n")
        # Only shown when something went wrong upstream
        if source_counts['emergency_new'] > 0:
            f.write(f" Emergency new: {source_counts['emergency_new']:,} players (ERROR)\n")
        f.write(f" TOTAL ASSIGNMENTS: {len(assignments):,} players\n\n")

        f.write("NEW SBAPLAYER RECORDS TO CREATE:\n")
        f.write(f" Total new records: {len(new_sbaplayers)}\n")
        for new_sba in new_sbaplayers:
            f.write(f" {new_sba.canonical_name} (temp ID {new_sba.temp_id}) - {new_sba.player_count} players\n")
        f.write("\n")

        f.write("CAREER STAT TRACKING ENABLED FOR:\n")
        f.write(f" {existing_sbaplayers:,} players linked to existing SbaPlayers\n")
        f.write(f" {source_counts['manual_new']:,} players will have career stats after new SbaPlayer creation\n")
        f.write(f" TOTAL: {len(assignments):,} players will have career stat tracking\n\n")

        f.write("FILES GENERATED:\n")
        f.write(" - player_sbaplayer_assignments.csv (ready for API updates)\n")
        f.write(" - new_sbaplayers_to_insert.csv (new SbaPlayer records to create first)\n\n")

        f.write("NEXT STEPS:\n")
        f.write("1. Review the assignment files for any issues\n")
        f.write("2. Create new SbaPlayer records via API (new_sbaplayers_to_insert.csv)\n")
        f.write("3. Update all player.sbaplayer_id fields via API (player_sbaplayer_assignments.csv)\n")
        f.write("4. Verify career stat tracking works correctly\n")
|
||
|
||
def main():
    """Main processing function.

    Runs the whole pipeline: load the cached data, compute the automatic
    matches, apply the manual review decisions, create the new SbaPlayer
    records, write the output files and print a summary to stdout.

    Raises:
        Exception: any error from a pipeline step is logged, including its
            traceback, and then re-raised unchanged.
    """
    logger.info("Starting final assignment generation (FIXED VERSION)...")

    try:
        # Load cached data
        all_players, sbaplayers = load_cached_data()
        logger.info(f"Loaded {len(all_players)} players and {len(sbaplayers)} SbaPlayers")

        # Run comprehensive matching for automatic assignments
        logger.info("Running comprehensive matching...")
        auto_assignments = run_comprehensive_matching(all_players, sbaplayers)

        # Parse manual decisions
        decisions = parse_manual_decisions()
        existing_matches, new_player_groups, special_cases = process_manual_decisions(decisions, sbaplayers)

        # Create new SbaPlayer records
        new_sbaplayers = create_new_sbaplayer_records(new_player_groups, all_players, sbaplayers)

        # Generate complete assignments for ALL players
        assignments = generate_all_assignments(
            all_players, sbaplayers, auto_assignments, existing_matches, new_sbaplayers
        )

        # Save files
        save_assignment_files(assignments, new_sbaplayers)

        # Generate summary
        generate_summary_report(assignments, new_sbaplayers)

        logger.info("✅ Final assignment generation completed successfully!")

        # Print summary; sources that never occur read as 0
        source_counts = defaultdict(int)
        for assignment in assignments:
            source_counts[assignment.assignment_source] += 1

        print("\n🎉 FINAL RESULTS:")
        print(f" 📊 {len(assignments):,} total player assignments generated")
        print(f" 🔗 {source_counts['tier1_bbref']:,} Tier 1 (bbref_id) matches")
        print(f" 📝 {source_counts['tier2_name']:,} Tier 2 (exact name) matches")
        print(f" 👤 {source_counts['manual_existing']:,} Manual existing matches")
        print(f" ➕ {source_counts['manual_new']:,} Manual new SbaPlayer assignments")
        if source_counts['emergency_new'] > 0:
            print(f" 🚨 {source_counts['emergency_new']:,} Emergency assignments (ERROR)")
        print(f" 📋 {len(new_sbaplayers)} new SbaPlayer records to create")
        print("\n📁 Files generated:")
        print(" - player_sbaplayer_assignments.csv")
        print(" - new_sbaplayers_to_insert.csv")
        print(" - final_assignment_summary.txt")

        # Verify total matches expected result
        expected_total = len(all_players)
        if len(assignments) == expected_total:
            print(f"\n✅ SUCCESS: All {expected_total:,} players successfully assigned!")
        else:
            print(f"\n❌ ERROR: Expected {expected_total:,}, got {len(assignments):,} assignments")

    except Exception as e:
        # logger.exception logs at ERROR level and appends the traceback, so
        # the failure location is also recorded in the log file.
        logger.exception(f"Error generating final assignments: {e}")
        raise
|
||
|
||
# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()