267 lines
10 KiB
Python
267 lines
10 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Generate CSV file of players who don't have SbaPlayer matches
|
|
For manual review and matching
|
|
"""
|
|
import json
|
|
import csv
|
|
import logging
|
|
from dataclasses import dataclass
|
|
from typing import Dict, List, Set, Optional
|
|
from collections import defaultdict
|
|
|
|
# Set up logging
|
|
logging.basicConfig(
|
|
level=logging.INFO,
|
|
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
|
handlers=[
|
|
logging.FileHandler('/tmp/unmatched_players.log'),
|
|
logging.StreamHandler()
|
|
]
|
|
)
|
|
logger = logging.getLogger('UnmatchedPlayers')
|
|
|
|
@dataclass
|
|
class PlayerRecord:
|
|
id: int
|
|
name: str
|
|
season: int
|
|
bbref_id: Optional[str] = None
|
|
sbaplayer_id: Optional[int] = None
|
|
|
|
@dataclass
|
|
class SbaPlayerRecord:
|
|
id: int
|
|
first_name: str
|
|
last_name: str
|
|
key_bbref: Optional[str] = None
|
|
key_fangraphs: Optional[int] = None
|
|
key_mlbam: Optional[int] = None
|
|
key_retro: Optional[str] = None
|
|
|
|
def load_cached_data():
|
|
"""Load all cached data from previous analysis"""
|
|
|
|
# Load SbaPlayers
|
|
logger.info("Loading cached SbaPlayer data...")
|
|
with open('/tmp/sbaplayers.json', 'r') as f:
|
|
sbaplayer_data = json.load(f)
|
|
|
|
sbaplayers = []
|
|
for data in sbaplayer_data:
|
|
sbaplayers.append(SbaPlayerRecord(**data))
|
|
|
|
logger.info(f"Loaded {len(sbaplayers)} SbaPlayers")
|
|
|
|
# Load all player seasons
|
|
logger.info("Loading cached player data...")
|
|
all_players = []
|
|
|
|
for season in range(1, 13):
|
|
cache_file = f"/tmp/players_season_{season}.json"
|
|
try:
|
|
with open(cache_file, 'r') as f:
|
|
season_data = json.load(f)
|
|
|
|
for data in season_data:
|
|
all_players.append(PlayerRecord(**data))
|
|
|
|
except FileNotFoundError:
|
|
logger.error(f"Cache file for season {season} not found. Run analyze_player_data.py first.")
|
|
return None, None
|
|
|
|
logger.info(f"Loaded {len(all_players)} player-season records")
|
|
return all_players, sbaplayers
|
|
|
|
def find_unmatched_players(all_players: List[PlayerRecord], sbaplayers: List[SbaPlayerRecord]):
|
|
"""Find all players that don't have matches"""
|
|
|
|
# Create lookup for SbaPlayers by bbref_id
|
|
sbaplayer_by_bbref = {}
|
|
for sba in sbaplayers:
|
|
if sba.key_bbref:
|
|
sbaplayer_by_bbref[sba.key_bbref] = sba
|
|
|
|
# Group players by unique identifier
|
|
unique_players = {} # Will store one representative PlayerRecord for each unique player
|
|
|
|
# First pass: group by bbref_id where available
|
|
players_by_bbref = defaultdict(list)
|
|
players_without_bbref = []
|
|
|
|
for player in all_players:
|
|
if player.bbref_id:
|
|
players_by_bbref[player.bbref_id].append(player)
|
|
else:
|
|
players_without_bbref.append(player)
|
|
|
|
# Find unmatched players with bbref_id
|
|
unmatched_with_bbref = []
|
|
for bbref_id, players in players_by_bbref.items():
|
|
if bbref_id not in sbaplayer_by_bbref:
|
|
# This bbref_id has no match in SbaPlayers
|
|
# Use the most recent season's player record as representative
|
|
representative = max(players, key=lambda p: p.season)
|
|
unmatched_with_bbref.append(representative)
|
|
|
|
logger.info(f"Found {len(unmatched_with_bbref)} unique players with bbref_id but no SbaPlayer match")
|
|
|
|
# Group players without bbref_id by name
|
|
players_by_name = defaultdict(list)
|
|
for player in players_without_bbref:
|
|
players_by_name[player.name].append(player)
|
|
|
|
unmatched_without_bbref = []
|
|
for name, players in players_by_name.items():
|
|
# Use the most recent season's player record as representative
|
|
representative = max(players, key=lambda p: p.season)
|
|
unmatched_without_bbref.append(representative)
|
|
|
|
logger.info(f"Found {len(unmatched_without_bbref)} unique players without bbref_id")
|
|
|
|
return unmatched_with_bbref, unmatched_without_bbref
|
|
|
|
def generate_csv_reports(unmatched_with_bbref: List[PlayerRecord], unmatched_without_bbref: List[PlayerRecord], sbaplayers: List[SbaPlayerRecord]):
|
|
"""Generate CSV files for manual review"""
|
|
|
|
# Generate main unmatched players file
|
|
output_file = '/tmp/unmatched_players_for_review.csv'
|
|
|
|
with open(output_file, 'w', newline='') as csvfile:
|
|
writer = csv.writer(csvfile)
|
|
|
|
# Header
|
|
writer.writerow([
|
|
'player_id', 'name', 'season', 'bbref_id', 'match_type', 'seasons_appeared',
|
|
'suggested_sbaplayer_id', 'suggested_sbaplayer_name', 'suggested_match_reason'
|
|
])
|
|
|
|
# Players with bbref_id but no match
|
|
for player in sorted(unmatched_with_bbref, key=lambda p: p.name):
|
|
# Get all seasons this player appeared in
|
|
all_seasons = []
|
|
cache_file = f"/tmp/players_season_{player.season}.json"
|
|
|
|
# Find all seasons for this bbref_id across all cached data
|
|
seasons_found = []
|
|
for season in range(1, 13):
|
|
try:
|
|
with open(f"/tmp/players_season_{season}.json", 'r') as f:
|
|
season_data = json.load(f)
|
|
|
|
for p_data in season_data:
|
|
if p_data.get('bbref_id') == player.bbref_id:
|
|
seasons_found.append(season)
|
|
break
|
|
except:
|
|
continue
|
|
|
|
seasons_str = ','.join(map(str, sorted(seasons_found)))
|
|
|
|
writer.writerow([
|
|
player.id,
|
|
player.name,
|
|
player.season,
|
|
player.bbref_id,
|
|
'has_bbref_no_match',
|
|
seasons_str,
|
|
'', # suggested_sbaplayer_id (empty for manual fill)
|
|
'', # suggested_sbaplayer_name (empty for manual fill)
|
|
'No existing SbaPlayer with this bbref_id'
|
|
])
|
|
|
|
# Players without bbref_id
|
|
for player in sorted(unmatched_without_bbref, key=lambda p: p.name):
|
|
# Get all seasons this player appeared in by name
|
|
seasons_found = []
|
|
for season in range(1, 13):
|
|
try:
|
|
with open(f"/tmp/players_season_{season}.json", 'r') as f:
|
|
season_data = json.load(f)
|
|
|
|
for p_data in season_data:
|
|
if p_data.get('name') == player.name and not p_data.get('bbref_id'):
|
|
seasons_found.append(season)
|
|
break
|
|
except:
|
|
continue
|
|
|
|
seasons_str = ','.join(map(str, sorted(seasons_found)))
|
|
|
|
# Try to suggest a match from existing SbaPlayers by name
|
|
suggested_id = ''
|
|
suggested_name = ''
|
|
suggested_reason = 'No bbref_id available'
|
|
|
|
for sba in sbaplayers:
|
|
sba_full_name = f"{sba.first_name} {sba.last_name}"
|
|
if sba_full_name.lower() == player.name.lower():
|
|
suggested_id = str(sba.id)
|
|
suggested_name = sba_full_name
|
|
suggested_reason = 'Exact name match found in SbaPlayers'
|
|
break
|
|
elif (sba.first_name.lower() in player.name.lower() and
|
|
sba.last_name.lower() in player.name.lower()):
|
|
# Partial match - suggest but flag for review
|
|
if not suggested_id: # Only suggest first partial match
|
|
suggested_id = f"PARTIAL:{sba.id}"
|
|
suggested_name = sba_full_name
|
|
suggested_reason = 'Partial name match - REVIEW NEEDED'
|
|
|
|
writer.writerow([
|
|
player.id,
|
|
player.name,
|
|
player.season,
|
|
player.bbref_id or '',
|
|
'no_bbref',
|
|
seasons_str,
|
|
suggested_id,
|
|
suggested_name,
|
|
suggested_reason
|
|
])
|
|
|
|
logger.info(f"Generated CSV report: {output_file}")
|
|
|
|
# Generate summary statistics
|
|
summary_file = '/tmp/unmatched_players_summary.txt'
|
|
with open(summary_file, 'w') as f:
|
|
f.write("UNMATCHED PLAYERS SUMMARY\n")
|
|
f.write("=" * 50 + "\n\n")
|
|
f.write(f"Players with bbref_id but no SbaPlayer match: {len(unmatched_with_bbref)}\n")
|
|
f.write(f"Players without bbref_id: {len(unmatched_without_bbref)}\n")
|
|
f.write(f"Total unique unmatched players: {len(unmatched_with_bbref) + len(unmatched_without_bbref)}\n\n")
|
|
|
|
f.write("NEXT STEPS:\n")
|
|
f.write("1. Review the CSV file: /tmp/unmatched_players_for_review.csv\n")
|
|
f.write("2. For players with suggested matches, verify they are correct\n")
|
|
f.write("3. For players marked 'PARTIAL:', carefully review the suggestion\n")
|
|
f.write("4. Fill in the suggested_sbaplayer_id column for matches you want to use\n")
|
|
f.write("5. Leave suggested_sbaplayer_id empty for players needing new SbaPlayer records\n")
|
|
|
|
logger.info(f"Generated summary: {summary_file}")
|
|
|
|
return output_file, summary_file
|
|
|
|
def main():
|
|
"""Main execution"""
|
|
logger.info("Starting unmatched players report generation...")
|
|
|
|
# Load cached data
|
|
all_players, sbaplayers = load_cached_data()
|
|
if not all_players or not sbaplayers:
|
|
logger.error("Failed to load cached data. Run analyze_player_data.py first.")
|
|
return
|
|
|
|
# Find unmatched players
|
|
unmatched_with_bbref, unmatched_without_bbref = find_unmatched_players(all_players, sbaplayers)
|
|
|
|
# Generate CSV reports
|
|
csv_file, summary_file = generate_csv_reports(unmatched_with_bbref, unmatched_without_bbref, sbaplayers)
|
|
|
|
logger.info(f"\n=== REPORT COMPLETE ===")
|
|
logger.info(f"CSV file for review: {csv_file}")
|
|
logger.info(f"Summary: {summary_file}")
|
|
logger.info(f"Total unmatched unique players: {len(unmatched_with_bbref) + len(unmatched_without_bbref)}")
|
|
|
|
if __name__ == "__main__":
|
|
main() |