399 lines
14 KiB
Python
399 lines
14 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Comprehensive Player to SbaPlayer Matching System
|
|
Uses multi-tier matching strategy to avoid creating duplicate SbaPlayer records
|
|
"""
|
|
import json
|
|
import csv
|
|
import logging
|
|
import re
|
|
from dataclasses import dataclass
|
|
from typing import Dict, List, Set, Optional, Tuple
|
|
from collections import defaultdict
|
|
import difflib
|
|
|
|
# Configure root logging once at import time: INFO and above is written both
# to the matching log file and to the console stream.
logging.basicConfig(
    handlers=[
        logging.FileHandler('/mnt/NV2/Development/major-domo/database/player-to-sbaplayer-matching/matching.log'),
        logging.StreamHandler(),
    ],
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Named logger shared by every function in this script.
logger = logging.getLogger('PlayerMatching')
|
|
|
|
@dataclass
class PlayerRecord:
    """One player row for a single season, as loaded from the cached JSON."""

    # Identifier of this per-season player record.
    id: int
    # Display name exactly as stored on the record.
    name: str
    # Season number the record belongs to.
    season: int
    # Baseball-Reference key; None when the record has none.
    bbref_id: Optional[str] = None
    # Id of the SbaPlayer this record is already linked to, if any.
    sbaplayer_id: Optional[int] = None
|
|
|
|
@dataclass
class SbaPlayerRecord:
    """A canonical SbaPlayer entry together with its external id keys."""

    # Identifier of the SbaPlayer record.
    id: int
    first_name: str
    last_name: str
    # External keys; each is None when not present on the record.
    key_bbref: Optional[str] = None
    key_fangraphs: Optional[int] = None
    key_mlbam: Optional[int] = None
    key_retro: Optional[str] = None

    @property
    def full_name(self) -> str:
        """Return the first and last name joined by a single space."""
        return "{} {}".format(self.first_name, self.last_name)
|
|
|
|
@dataclass
class MatchResult:
    """Link between one player record and the SbaPlayer it was matched to."""

    # --- player side ---
    player_id: int
    player_name: str
    player_bbref_id: Optional[str]

    # --- SbaPlayer side ---
    sbaplayer_id: int
    sbaplayer_name: str
    sbaplayer_bbref: Optional[str]

    # --- how the match was made ---
    # Label of the tier that produced the match (e.g. "tier1_bbref").
    match_tier: str
    # Confidence score assigned by the matching tier.
    confidence: float
    # Seasons of every player record grouped into the same match.
    seasons: List[int]
|
|
|
|
def normalize_name(name: str) -> str:
    """Return a canonical lowercase form of ``name`` for comparisons.

    A falsy ``name`` (empty string or ``None``) yields ``""``. Otherwise the
    name is lowercased and trimmed, then rewritten by the substitution passes
    listed below, and trimmed once more.
    """
    if not name:
        return ""

    # (pattern, replacement) pairs, applied top to bottom.
    passes = (
        (r'\s+(jr|sr|ii|iii|iv)\.?$', ''),  # trailing generational suffix
        (r"['\.]", ""),                     # periods and apostrophes
        (r'-', ' '),                        # hyphens become spaces
        (r'\s+', ' '),                      # collapse whitespace runs
    )

    cleaned = name.lower().strip()
    for pattern, replacement in passes:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip()
|
|
|
|
def create_name_variants(name: str) -> Set[str]:
    """Return the normalized ``name`` plus its known first-name nickname forms.

    The result always contains ``normalize_name(name)``. When the normalized
    name has at least two words and the first word is a key of the nickname
    table, one extra variant is added per listed alternative, keeping the
    remaining words unchanged.
    """
    # First name -> alternative first names to try.
    nickname_map = {
        'michael': ['mike', 'micky'], 'mike': ['michael'],
        'william': ['will', 'bill', 'billy'], 'will': ['william'], 'bill': ['william'],
        'robert': ['rob', 'bob', 'bobby'], 'rob': ['robert'], 'bob': ['robert'],
        'james': ['jim', 'jimmy'], 'jim': ['james'],
        'thomas': ['tom', 'tommy'], 'tom': ['thomas'],
        'joseph': ['joe', 'joey'], 'joe': ['joseph'],
        'christopher': ['chris'], 'chris': ['christopher'],
        'anthony': ['tony'], 'tony': ['anthony'],
        'andrew': ['andy', 'drew'], 'andy': ['andrew'], 'drew': ['andrew'],
        'jonathan': ['jon'], 'jon': ['jonathan'],
        'matthew': ['matt'], 'matt': ['matthew'],
        'nicholas': ['nick'], 'nick': ['nicholas'],
        'alexander': ['alex'], 'alex': ['alexander'],
        'benjamin': ['ben'], 'ben': ['benjamin'],
        'samuel': ['sam'], 'sam': ['samuel'],
        'daniel': ['dan', 'danny'], 'dan': ['daniel'], 'danny': ['daniel'],
        'david': ['dave'], 'dave': ['david'],
        'edward': ['ed', 'eddie'], 'ed': ['edward'], 'eddie': ['edward'],
    }

    base = normalize_name(name)
    variants = {base}

    tokens = base.split()
    if len(tokens) < 2:
        # A single word (or nothing) has no separate first name to swap.
        return variants

    remainder = ' '.join(tokens[1:])
    variants.update(
        f"{alias} {remainder}" for alias in nickname_map.get(tokens[0], ())
    )
    return variants
|
|
|
|
def load_cached_data(data_dir: str = '/tmp', num_seasons: int = 12):
    """Load all cached player and SbaPlayer data from JSON files.

    Args:
        data_dir: Directory holding ``sbaplayers.json`` and the
            ``players_season_<n>.json`` files. Defaults to ``/tmp``.
        num_seasons: Number of season files to read, numbered from 1 to
            ``num_seasons`` inclusive. Defaults to 12.

    Returns:
        A ``(all_players, sbaplayers)`` tuple: a list of ``PlayerRecord``
        covering every season file, and a list of ``SbaPlayerRecord``.

    Raises:
        OSError: If any of the expected files cannot be opened.
        TypeError: If a JSON object carries keys that are not fields of the
            corresponding dataclass (each object is passed as ``**kwargs``).
    """
    logger.info("Loading cached data...")

    # JSON is read as UTF-8 explicitly so that non-ASCII player names decode
    # the same way regardless of the machine's locale encoding.
    with open(f"{data_dir}/sbaplayers.json", 'r', encoding='utf-8') as f:
        sbaplayers = [SbaPlayerRecord(**data) for data in json.load(f)]

    all_players = []
    for season in range(1, num_seasons + 1):
        with open(f"{data_dir}/players_season_{season}.json", 'r', encoding='utf-8') as f:
            season_data = json.load(f)
        all_players.extend(PlayerRecord(**data) for data in season_data)

    logger.info(f"Loaded {len(sbaplayers)} SbaPlayers and {len(all_players)} player records")
    return all_players, sbaplayers
|
|
|
|
def analyze_data_coverage(all_players: List[PlayerRecord], sbaplayers: List[SbaPlayerRecord]):
    """Log how many records carry external ids or an existing assignment.

    Args:
        all_players: Player records from every season.
        sbaplayers: Known SbaPlayer records.

    Returns:
        None. Results are written to the module logger only.

    Counts rely on truthiness, so an empty string or ``0`` id is treated the
    same as a missing one.
    """
    def _pct(part: int, whole: int) -> float:
        # An empty input list would otherwise raise ZeroDivisionError.
        return part / whole * 100 if whole else 0.0

    logger.info("Analyzing data coverage...")

    # SbaPlayer analysis
    total_sba = len(sbaplayers)
    sba_with_bbref = sum(1 for sba in sbaplayers if sba.key_bbref)
    sba_with_fangraphs = sum(1 for sba in sbaplayers if sba.key_fangraphs)

    logger.info("SbaPlayer coverage:")
    logger.info(f" Total SbaPlayers: {total_sba}")
    logger.info(f" With key_bbref: {sba_with_bbref} ({_pct(sba_with_bbref, total_sba):.1f}%)")
    logger.info(f" With key_fangraphs: {sba_with_fangraphs} ({_pct(sba_with_fangraphs, total_sba):.1f}%)")

    # Player analysis
    total_players = len(all_players)
    players_with_bbref = sum(1 for p in all_players if p.bbref_id)

    logger.info("Player coverage:")
    logger.info(f" Total player records: {total_players}")
    logger.info(f" With bbref_id: {players_with_bbref} ({_pct(players_with_bbref, total_players):.1f}%)")

    # Check for existing assignments
    players_with_sbaplayer = sum(1 for p in all_players if p.sbaplayer_id)
    logger.info(f" Already assigned to SbaPlayer: {players_with_sbaplayer}")
|
|
|
|
def create_matching_maps(sbaplayers: List[SbaPlayerRecord]) -> Tuple[Dict, Dict]:
    """Build the lookup tables used by the matching tiers.

    Returns:
        ``(bbref_map, name_map)`` where ``bbref_map`` maps each truthy
        ``key_bbref`` to its SbaPlayer (a later record with the same key
        replaces an earlier one), and ``name_map`` maps every name variant
        of every SbaPlayer to the list of SbaPlayers producing that variant.
    """
    logger.info("Creating matching maps...")

    # bbref key -> SbaPlayer; records without a key are left out.
    bbref_map = {sba.key_bbref: sba for sba in sbaplayers if sba.key_bbref}

    # name variant -> every SbaPlayer sharing it (may be more than one).
    name_map = defaultdict(list)
    for sba in sbaplayers:
        for variant in create_name_variants(sba.full_name):
            name_map[variant].append(sba)

    logger.info(f"Created bbref_id map: {len(bbref_map)} entries")
    logger.info(f"Created name map: {len(name_map)} entries")

    return bbref_map, name_map
|
|
|
|
def match_players_tier1_bbref(all_players: List[PlayerRecord], bbref_map: Dict) -> List[MatchResult]:
    """Tier 1: link player records to SbaPlayers by identical bbref id.

    Player records are grouped by ``bbref_id``; records without one are
    skipped. For each group whose id is a key of ``bbref_map``, one
    ``MatchResult`` is produced per record, with confidence 1.0 and the
    seasons of the whole group.
    """
    logger.info("Tier 1: Matching by bbref_id...")

    # bbref_id -> all records carrying it (one per season, typically).
    grouped = {}
    for record in all_players:
        if record.bbref_id:
            grouped.setdefault(record.bbref_id, []).append(record)

    matches = []
    for bbref_id, group in grouped.items():
        if bbref_id not in bbref_map:
            continue

        sba = bbref_map[bbref_id]
        # The same list object is attached to every result of the group.
        seasons = [record.season for record in group]

        matches.extend(
            MatchResult(
                player_id=record.id,
                player_name=record.name,
                player_bbref_id=bbref_id,
                sbaplayer_id=sba.id,
                sbaplayer_name=sba.full_name,
                sbaplayer_bbref=sba.key_bbref,
                match_tier="tier1_bbref",
                confidence=1.0,
                seasons=seasons,
            )
            for record in group
        )

    logger.info(f"Tier 1 matches: {len(matches)} player records")
    return matches
|
|
|
|
def match_players_tier2_name(all_players: List[PlayerRecord], name_map: Dict,
                             tier1_matches: List[MatchResult]) -> List[MatchResult]:
    """Tier 2: link records that have no bbref id by exact normalized name.

    Only records that were not matched in tier 1 and carry no ``bbref_id``
    are considered. They are grouped by normalized name; a group is matched
    only when ``name_map`` lists exactly one SbaPlayer for that name. Each
    resulting ``MatchResult`` has confidence 0.95.
    """
    logger.info("Tier 2: Matching by exact name...")

    already_matched = {m.player_id for m in tier1_matches}

    # normalized name -> remaining records with that name.
    by_name = {}
    for record in all_players:
        if record.bbref_id or record.id in already_matched:
            continue
        by_name.setdefault(normalize_name(record.name), []).append(record)

    matches = []
    for normalized_name, group in by_name.items():
        if normalized_name not in name_map:
            continue

        candidates = name_map[normalized_name]
        if len(candidates) != 1:
            # Note: Ambiguous matches (multiple SbaPlayers) will be handled in tier 3
            continue

        sba = candidates[0]
        seasons = [record.season for record in group]

        matches.extend(
            MatchResult(
                player_id=record.id,
                player_name=record.name,
                player_bbref_id=None,
                sbaplayer_id=sba.id,
                sbaplayer_name=sba.full_name,
                sbaplayer_bbref=sba.key_bbref,
                match_tier="tier2_exact_name",
                confidence=0.95,
                seasons=seasons,
            )
            for record in group
        )

    logger.info(f"Tier 2 matches: {len(matches)} player records")
    return matches
|
|
|
|
def find_unmatched_players(all_players: List[PlayerRecord], tier1_matches: List[MatchResult],
                           tier2_matches: List[MatchResult]) -> Tuple[List[PlayerRecord], Dict[str, List[PlayerRecord]]]:
    """Find players that still need matching.

    Args:
        all_players: Every player record.
        tier1_matches: Results of the bbref id tier.
        tier2_matches: Results of the exact-name tier.

    Returns:
        ``(unmatched, unique_unmatched)``: the list of records whose id is in
        neither match list, and those same records grouped under a key of
        ``"bbref:<bbref_id>"`` when the record has a bbref id, otherwise
        ``"name:<normalized name>"``.
    """
    # Build the id set straight from both lists; no temporary merged list.
    matched_player_ids = {match.player_id for match in tier1_matches}
    matched_player_ids.update(match.player_id for match in tier2_matches)

    unmatched = [p for p in all_players if p.id not in matched_player_ids]

    # Group unmatched records so each distinct player appears once.
    unique_unmatched: Dict[str, List[PlayerRecord]] = {}
    for player in unmatched:
        if player.bbref_id:
            key = f"bbref:{player.bbref_id}"
        else:
            key = f"name:{normalize_name(player.name)}"
        unique_unmatched.setdefault(key, []).append(player)

    logger.info(f"Unmatched: {len(unmatched)} player records ({len(unique_unmatched)} unique players)")
    return unmatched, unique_unmatched
|
|
|
|
def generate_matching_report(tier1_matches: List[MatchResult], tier2_matches: List[MatchResult],
                             unmatched: List[PlayerRecord], unique_unmatched: Dict):
    """Write a plain-text summary of the matching run to ``matching_report.txt``.

    Args:
        tier1_matches: Results of the bbref id tier.
        tier2_matches: Results of the exact-name tier.
        unmatched: Player records that no tier matched.
        unique_unmatched: The unmatched records grouped per distinct player.

    Returns:
        None. The report file is overwritten on every call.

    The report lists totals, up to 10 example matches per tier and up to 20
    unmatched players.
    """
    logger.info("Generating matching report...")

    # Summary statistics
    total_tier1 = len(tier1_matches)
    total_tier2 = len(tier2_matches)
    total_matched = total_tier1 + total_tier2
    total_unmatched = len(unmatched)

    # The report contains the non-ASCII arrow "→" and may contain accented
    # names, so the encoding is pinned to UTF-8 instead of the locale default
    # (which can raise UnicodeEncodeError on ASCII or cp1252 locales).
    with open('/mnt/NV2/Development/major-domo/database/player-to-sbaplayer-matching/matching_report.txt', 'w', encoding='utf-8') as f:
        f.write("COMPREHENSIVE PLAYER MATCHING REPORT\n")
        f.write("=" * 50 + "\n\n")

        f.write("MATCHING SUMMARY:\n")
        f.write(f" Tier 1 (bbref_id): {total_tier1:,} player records\n")
        f.write(f" Tier 2 (exact name): {total_tier2:,} player records\n")
        f.write(f" Total matched: {total_matched:,} player records\n")
        f.write(f" Unmatched: {total_unmatched:,} player records ({len(unique_unmatched)} unique players)\n\n")

        f.write("TIER 1 EXAMPLES (bbref_id matches):\n")
        for match in tier1_matches[:10]:
            f.write(f" {match.player_name} ({match.player_bbref_id}) → {match.sbaplayer_name} (ID: {match.sbaplayer_id})\n")
        if len(tier1_matches) > 10:
            f.write(f" ... and {len(tier1_matches) - 10} more\n")
        f.write("\n")

        f.write("TIER 2 EXAMPLES (exact name matches):\n")
        for match in tier2_matches[:10]:
            f.write(f" {match.player_name} → {match.sbaplayer_name} (ID: {match.sbaplayer_id})\n")
        if len(tier2_matches) > 10:
            f.write(f" ... and {len(tier2_matches) - 10} more\n")
        f.write("\n")

        f.write("SAMPLE UNMATCHED PLAYERS (need new SbaPlayer records):\n")
        sample_unmatched = list(unique_unmatched.items())[:20]
        for key, players in sample_unmatched:
            representative = players[0]  # Show one representative
            seasons = sorted(p.season for p in players)
            f.write(f" {representative.name} (bbref: {representative.bbref_id or 'None'}) - seasons {seasons}\n")
        if len(unique_unmatched) > 20:
            f.write(f" ... and {len(unique_unmatched) - 20} more unique players\n")

    logger.info("Matching report generated: matching_report.txt")
|
|
|
|
def main():
    """Run the matching pipeline end to end and log the final counts.

    Steps, in order: load cached data, log coverage statistics, build the
    lookup maps, run tier 1 (bbref id) and tier 2 (exact name) matching,
    collect the unmatched records, and write the text report.
    """
    logger.info("Starting comprehensive player matching...")

    # Inputs and lookup tables
    players, sba_records = load_cached_data()
    analyze_data_coverage(players, sba_records)
    by_bbref, by_name = create_matching_maps(sba_records)

    # Matching tiers; tier 2 needs tier 1's results to skip matched records
    bbref_hits = match_players_tier1_bbref(players, by_bbref)
    name_hits = match_players_tier2_name(players, by_name, bbref_hits)

    # Leftovers and report
    leftovers, leftovers_by_player = find_unmatched_players(players, bbref_hits, name_hits)
    generate_matching_report(bbref_hits, name_hits, leftovers, leftovers_by_player)

    logger.info("Matching analysis complete!")
    matched_total = len(bbref_hits) + len(name_hits)
    logger.info(f"Results: {matched_total:,} matched, {len(leftovers):,} unmatched")
|
|
|
|
# Run the matching pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()