# major-domo-database/.claude/sqlite-to-postgres/player-to-sbaplayer-matching/comprehensive_player_matching.py
# Cal Corum 7130a1fd43 Postgres Migration
# Migration documentation and scripts
# 2025-08-25 07:18:31 -05:00
# 399 lines / 14 KiB / Python

#!/usr/bin/env python3
"""
Comprehensive Player to SbaPlayer Matching System
Uses multi-tier matching strategy to avoid creating duplicate SbaPlayer records
"""
import json
import csv
import logging
import re
from dataclasses import dataclass
from typing import Dict, List, Set, Optional, Tuple
from collections import defaultdict
import difflib
# Logging setup: INFO and above is mirrored to a persistent log file next to
# the other matching artifacts and to the console.
_file_handler = logging.FileHandler(
    '/mnt/NV2/Development/major-domo/database/player-to-sbaplayer-matching/matching.log'
)
_console_handler = logging.StreamHandler()
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[_file_handler, _console_handler],
)
logger = logging.getLogger('PlayerMatching')
@dataclass
class PlayerRecord:
    """One per-season player row from the legacy SQLite database."""
    id: int  # primary key of the legacy player row
    name: str  # display name exactly as stored in the season data
    season: int  # season number the record belongs to (files cover 1-12)
    bbref_id: Optional[str] = None  # Baseball-Reference key, when the row has one
    sbaplayer_id: Optional[int] = None  # pre-existing SbaPlayer assignment, if any
@dataclass
class SbaPlayerRecord:
    """A canonical SbaPlayer row, the target side of the matching process."""
    id: int  # SbaPlayer primary key
    first_name: str
    last_name: str
    key_bbref: Optional[str] = None  # Baseball-Reference identifier
    key_fangraphs: Optional[int] = None  # FanGraphs identifier
    key_mlbam: Optional[int] = None  # MLB Advanced Media identifier
    key_retro: Optional[str] = None  # Retrosheet identifier

    @property
    def full_name(self) -> str:
        """First and last name joined with a single space."""
        return " ".join((self.first_name, self.last_name))
@dataclass
class MatchResult:
    """Outcome of matching one legacy player record to one SbaPlayer."""
    player_id: int  # legacy player row id
    player_name: str  # legacy player display name
    player_bbref_id: Optional[str]  # bbref key the match was made on (tier 1), else None
    sbaplayer_id: int  # matched SbaPlayer primary key
    sbaplayer_name: str  # matched SbaPlayer full name
    sbaplayer_bbref: Optional[str]  # matched SbaPlayer's bbref key, if any
    match_tier: str  # which strategy produced the match, e.g. "tier1_bbref"
    confidence: float  # 1.0 for bbref matches, 0.95 for exact-name matches
    seasons: List[int]  # all seasons in which this player appears
def normalize_name(name: str) -> str:
    """Return a canonical lowercase form of *name* for matching.

    Lowercases, strips a trailing generational suffix (jr/sr/ii/iii/iv),
    drops apostrophes and periods, turns hyphens into spaces, and collapses
    all runs of whitespace to single spaces.
    """
    if not name:
        return ""
    cleaned = name.strip().lower()
    # Drop a trailing generational suffix, with or without its period.
    cleaned = re.sub(r'\s+(jr|sr|ii|iii|iv)\.?$', '', cleaned)
    # One translation pass: remove apostrophes/periods, hyphen -> space.
    cleaned = cleaned.translate(str.maketrans({"'": "", ".": "", "-": " "}))
    # split()/join collapses interior whitespace and trims the ends.
    return " ".join(cleaned.split())
# First-name <-> nickname equivalences used to widen name matching.
# Defined once at module level so the table is not rebuilt on every call.
_NICKNAME_MAP: Dict[str, List[str]] = {
    'michael': ['mike', 'micky'],
    'mike': ['michael'],
    'william': ['will', 'bill', 'billy'],
    'will': ['william'],
    'bill': ['william'],
    'robert': ['rob', 'bob', 'bobby'],
    'rob': ['robert'],
    'bob': ['robert'],
    'james': ['jim', 'jimmy'],
    'jim': ['james'],
    'thomas': ['tom', 'tommy'],
    'tom': ['thomas'],
    'joseph': ['joe', 'joey'],
    'joe': ['joseph'],
    'christopher': ['chris'],
    'chris': ['christopher'],
    'anthony': ['tony'],
    'tony': ['anthony'],
    'andrew': ['andy', 'drew'],
    'andy': ['andrew'],
    'drew': ['andrew'],
    'jonathan': ['jon'],
    'jon': ['jonathan'],
    'matthew': ['matt'],
    'matt': ['matthew'],
    'nicholas': ['nick'],
    'nick': ['nicholas'],
    'alexander': ['alex'],
    'alex': ['alexander'],
    'benjamin': ['ben'],
    'ben': ['benjamin'],
    'samuel': ['sam'],
    'sam': ['samuel'],
    'daniel': ['dan', 'danny'],
    'dan': ['daniel'],
    'danny': ['daniel'],
    'david': ['dave'],
    'dave': ['david'],
    'edward': ['ed', 'eddie'],
    'ed': ['edward'],
    'eddie': ['edward']
}


def create_name_variants(name: str) -> Set[str]:
    """Return the normalized form of *name* plus first-name nickname variants.

    The normalized name itself is always included.  When the name has at
    least two words and the first word appears in the nickname table, one
    extra variant is added per known nickname (e.g. "michael smith" also
    yields "mike smith" and "micky smith").
    """
    normalized = normalize_name(name)
    variants = {normalized}
    parts = normalized.split()
    if len(parts) >= 2:
        first_name = parts[0]
        rest = ' '.join(parts[1:])
        for nickname in _NICKNAME_MAP.get(first_name, ()):
            variants.add(f"{nickname} {rest}")
    return variants
def load_cached_data(num_seasons: int = 12):
    """Load the cached SbaPlayer and per-season player JSON dumps from /tmp.

    Args:
        num_seasons: how many season files to read; covers
            players_season_1.json through players_season_<num_seasons>.json.
            Defaults to the 12 seasons exported by the migration.

    Returns:
        Tuple of (all_players, sbaplayers) as PlayerRecord and
        SbaPlayerRecord lists.
    """
    logger.info("Loading cached data...")
    # SbaPlayers: one JSON array of keyword-compatible dicts.
    with open('/tmp/sbaplayers.json', 'r') as f:
        sbaplayers = [SbaPlayerRecord(**row) for row in json.load(f)]
    # Players: one JSON file per season, concatenated into a single list.
    all_players: List[PlayerRecord] = []
    for season in range(1, num_seasons + 1):
        with open(f"/tmp/players_season_{season}.json", 'r') as f:
            all_players.extend(PlayerRecord(**row) for row in json.load(f))
    logger.info(f"Loaded {len(sbaplayers)} SbaPlayers and {len(all_players)} player records")
    return all_players, sbaplayers
def analyze_data_coverage(all_players: List[PlayerRecord], sbaplayers: List[SbaPlayerRecord]) -> None:
    """Log identifier-coverage statistics for both datasets.

    Reports how many SbaPlayers carry bbref/fangraphs keys, how many player
    records carry a bbref_id, and how many players are already assigned.
    Logging only; nothing is returned or mutated.
    """
    logger.info("Analyzing data coverage...")
    total_sba = len(sbaplayers)
    total_players = len(all_players)
    if not total_sba or not total_players:
        # Guard: the percentage math below would divide by zero on empty input.
        logger.warning(f"Cannot analyze coverage: {total_sba} SbaPlayers, {total_players} player records")
        return
    # SbaPlayer analysis
    sba_with_bbref = sum(1 for sba in sbaplayers if sba.key_bbref)
    sba_with_fangraphs = sum(1 for sba in sbaplayers if sba.key_fangraphs)
    logger.info(f"SbaPlayer coverage:")
    logger.info(f"  Total SbaPlayers: {total_sba}")
    logger.info(f"  With key_bbref: {sba_with_bbref} ({sba_with_bbref/total_sba*100:.1f}%)")
    logger.info(f"  With key_fangraphs: {sba_with_fangraphs} ({sba_with_fangraphs/total_sba*100:.1f}%)")
    # Player analysis
    players_with_bbref = sum(1 for p in all_players if p.bbref_id)
    logger.info(f"Player coverage:")
    logger.info(f"  Total player records: {total_players}")
    logger.info(f"  With bbref_id: {players_with_bbref} ({players_with_bbref/total_players*100:.1f}%)")
    # Check for existing assignments
    players_with_sbaplayer = sum(1 for p in all_players if p.sbaplayer_id)
    logger.info(f"  Already assigned to SbaPlayer: {players_with_sbaplayer}")
def create_matching_maps(sbaplayers: List[SbaPlayerRecord]) -> Tuple[Dict, Dict]:
    """Build the two lookup structures used by the matching tiers.

    Returns:
        (bbref_map, name_map): bbref_map keys a bbref id to one
        SbaPlayerRecord; name_map keys every normalized name variant to the
        list of SbaPlayerRecords that produce it.
    """
    logger.info("Creating matching maps...")
    # bbref_id -> SbaPlayer (later duplicates overwrite earlier ones).
    bbref_map = {sba.key_bbref: sba for sba in sbaplayers if sba.key_bbref}
    # normalized name variant -> all SbaPlayers sharing that variant.
    name_map = defaultdict(list)
    for sba in sbaplayers:
        for variant in create_name_variants(sba.full_name):
            name_map[variant].append(sba)
    logger.info(f"Created bbref_id map: {len(bbref_map)} entries")
    logger.info(f"Created name map: {len(name_map)} entries")
    return bbref_map, name_map
def match_players_tier1_bbref(all_players: List[PlayerRecord], bbref_map: Dict) -> List[MatchResult]:
    """Tier 1: match player records to SbaPlayers by exact bbref_id.

    Player records are grouped by bbref_id first, so every season-row of the
    same real-world player maps to the same SbaPlayer and shares one seasons
    list.  Confidence is 1.0 for this tier.
    """
    logger.info("Tier 1: Matching by bbref_id...")
    matches: List[MatchResult] = []
    # Group player records by bbref_id (defaultdict avoids the manual
    # "if key not in dict" dance).
    unique_players: Dict[str, List[PlayerRecord]] = defaultdict(list)
    for player in all_players:
        if player.bbref_id:
            unique_players[player.bbref_id].append(player)
    # Match each unique bbref_id against the SbaPlayer lookup.
    for bbref_id, players in unique_players.items():
        sba = bbref_map.get(bbref_id)
        if sba is None:
            continue
        seasons = [p.season for p in players]
        # Create a match result for every player record sharing this bbref_id.
        for player in players:
            matches.append(MatchResult(
                player_id=player.id,
                player_name=player.name,
                player_bbref_id=bbref_id,
                sbaplayer_id=sba.id,
                sbaplayer_name=sba.full_name,
                sbaplayer_bbref=sba.key_bbref,
                match_tier="tier1_bbref",
                confidence=1.0,
                seasons=seasons
            ))
    logger.info(f"Tier 1 matches: {len(matches)} player records")
    return matches
def match_players_tier2_name(all_players: List[PlayerRecord], name_map: Dict,
                             tier1_matches: List[MatchResult]) -> List[MatchResult]:
    """Tier 2: exact normalized-name matching for players without a bbref_id.

    Only player records not already matched in tier 1 and lacking a bbref_id
    are considered.  A match is recorded only when exactly one SbaPlayer
    shares the normalized name (unambiguous); ambiguous names are left for
    tier 3.  Confidence is 0.95 for this tier.
    """
    logger.info("Tier 2: Matching by exact name...")
    # Player IDs already consumed by tier 1.
    matched_player_ids = {match.player_id for match in tier1_matches}
    matches: List[MatchResult] = []
    # Group the remaining bbref-less players by normalized name.
    unique_players: Dict[str, List[PlayerRecord]] = defaultdict(list)
    for player in all_players:
        if player.id not in matched_player_ids and not player.bbref_id:
            unique_players[normalize_name(player.name)].append(player)
    # Match each unique name against the SbaPlayer name map.
    for normalized_name, players in unique_players.items():
        potential_sba_matches = name_map.get(normalized_name)
        if not potential_sba_matches:
            continue
        if len(potential_sba_matches) == 1:
            # Unambiguous match: exactly one SbaPlayer has this name.
            sba = potential_sba_matches[0]
            seasons = [p.season for p in players]
            for player in players:
                matches.append(MatchResult(
                    player_id=player.id,
                    player_name=player.name,
                    player_bbref_id=None,
                    sbaplayer_id=sba.id,
                    sbaplayer_name=sba.full_name,
                    sbaplayer_bbref=sba.key_bbref,
                    match_tier="tier2_exact_name",
                    confidence=0.95,
                    seasons=seasons
                ))
        # Note: Ambiguous matches (multiple SbaPlayers) will be handled in tier 3
    logger.info(f"Tier 2 matches: {len(matches)} player records")
    return matches
def find_unmatched_players(all_players: List[PlayerRecord], tier1_matches: List[MatchResult],
                           tier2_matches: List[MatchResult]) -> Tuple[List[PlayerRecord], Dict]:
    """Find player records not matched by tier 1 or tier 2.

    Returns:
        (unmatched, unique_unmatched): the flat list of unmatched player
        records, and a dict grouping them by a unique key — "bbref:<id>"
        when the record has a bbref_id, otherwise "name:<normalized name>".

    Note: the original annotation claimed ``List[PlayerRecord]`` but the
    function has always returned this 2-tuple; the annotation is now correct.
    """
    matched_player_ids = {match.player_id for match in tier1_matches + tier2_matches}
    unmatched = [p for p in all_players if p.id not in matched_player_ids]
    # Group unmatched players by unique identifier (bbref first, else name).
    unique_unmatched: Dict[str, List[PlayerRecord]] = defaultdict(list)
    for player in unmatched:
        if player.bbref_id:
            key = f"bbref:{player.bbref_id}"
        else:
            key = f"name:{normalize_name(player.name)}"
        unique_unmatched[key].append(player)
    logger.info(f"Unmatched: {len(unmatched)} player records ({len(unique_unmatched)} unique players)")
    return unmatched, unique_unmatched
def generate_matching_report(tier1_matches: List[MatchResult], tier2_matches: List[MatchResult],
                             unmatched: List[PlayerRecord], unique_unmatched: Dict):
    """Write a human-readable summary of the matching run to matching_report.txt.

    Includes per-tier totals, up to ten example matches per tier, and a
    sample of unmatched players that will need new SbaPlayer records.
    """
    logger.info("Generating matching report...")
    # Summary statistics
    total_tier1 = len(tier1_matches)
    total_tier2 = len(tier2_matches)
    total_matched = total_tier1 + total_tier2
    total_unmatched = len(unmatched)
    with open('/mnt/NV2/Development/major-domo/database/player-to-sbaplayer-matching/matching_report.txt', 'w') as f:
        f.write("COMPREHENSIVE PLAYER MATCHING REPORT\n")
        f.write("=" * 50 + "\n\n")
        f.write("MATCHING SUMMARY:\n")
        f.write(f"  Tier 1 (bbref_id): {total_tier1:,} player records\n")
        f.write(f"  Tier 2 (exact name): {total_tier2:,} player records\n")
        f.write(f"  Total matched: {total_matched:,} player records\n")
        f.write(f"  Unmatched: {total_unmatched:,} player records ({len(unique_unmatched)} unique players)\n\n")
        f.write("TIER 1 EXAMPLES (bbref_id matches):\n")
        for match in tier1_matches[:10]:
            f.write(f"  {match.player_name} ({match.player_bbref_id}) → {match.sbaplayer_name} (ID: {match.sbaplayer_id})\n")
        if len(tier1_matches) > 10:
            f.write(f"  ... and {len(tier1_matches) - 10} more\n")
        f.write("\n")
        f.write("TIER 2 EXAMPLES (exact name matches):\n")
        for match in tier2_matches[:10]:
            # Fixed: the arrow separator was missing here, so the two names
            # ran together in the report (compare the tier 1 format above).
            f.write(f"  {match.player_name} → {match.sbaplayer_name} (ID: {match.sbaplayer_id})\n")
        if len(tier2_matches) > 10:
            f.write(f"  ... and {len(tier2_matches) - 10} more\n")
        f.write("\n")
        f.write("SAMPLE UNMATCHED PLAYERS (need new SbaPlayer records):\n")
        sample_unmatched = list(unique_unmatched.items())[:20]
        for key, players in sample_unmatched:
            representative = players[0]  # Show one representative record
            seasons = sorted([p.season for p in players])
            f.write(f"  {representative.name} (bbref: {representative.bbref_id or 'None'}) - seasons {seasons}\n")
        if len(unique_unmatched) > 20:
            f.write(f"  ... and {len(unique_unmatched) - 20} more unique players\n")
    logger.info("Matching report generated: matching_report.txt")
def main():
    """Run the full matching pipeline: load, analyze, match, report."""
    logger.info("Starting comprehensive player matching...")
    # Load cached player/SbaPlayer data from /tmp
    all_players, sbaplayers = load_cached_data()
    # Log identifier coverage before matching
    analyze_data_coverage(all_players, sbaplayers)
    # Build the bbref and name lookup maps
    bbref_map, name_map = create_matching_maps(sbaplayers)
    # Tier 1: bbref_id matching
    tier1_matches = match_players_tier1_bbref(all_players, bbref_map)
    # Tier 2: exact name matching on the remainder
    tier2_matches = match_players_tier2_name(all_players, name_map, tier1_matches)
    # Anything still unmatched will need new SbaPlayer records
    unmatched, unique_unmatched = find_unmatched_players(all_players, tier1_matches, tier2_matches)
    # Write the human-readable report
    generate_matching_report(tier1_matches, tier2_matches, unmatched, unique_unmatched)
    logger.info("Matching analysis complete!")
    # Count directly instead of concatenating the two lists just to len() them.
    logger.info(f"Results: {len(tier1_matches) + len(tier2_matches):,} matched, {len(unmatched):,} unmatched")


if __name__ == "__main__":
    main()