396 lines
16 KiB
Python
396 lines
16 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Process manual matching decisions from reviewed CSV files
|
|
Generate two output CSV files:
|
|
1. New SbaPlayers to insert
|
|
2. All player_id -> sbaplayer_id assignments
|
|
"""
|
|
import csv
|
|
import json
|
|
import logging
|
|
from dataclasses import dataclass
|
|
from typing import Dict, List, Set, Optional
|
|
from collections import defaultdict
|
|
|
|
logging.basicConfig(level=logging.INFO)
|
|
logger = logging.getLogger('ProcessDecisions')
|
|
|
|
@dataclass
|
|
class PlayerRecord:
|
|
id: int
|
|
name: str
|
|
season: int
|
|
bbref_id: Optional[str] = None
|
|
sbaplayer_id: Optional[int] = None
|
|
|
|
@dataclass
|
|
class SbaPlayerRecord:
|
|
id: int
|
|
first_name: str
|
|
last_name: str
|
|
key_bbref: Optional[str] = None
|
|
key_fangraphs: Optional[int] = None
|
|
key_mlbam: Optional[int] = None
|
|
key_retro: Optional[str] = None
|
|
|
|
@dataclass
|
|
class NewSbaPlayer:
|
|
first_name: str
|
|
last_name: str
|
|
key_bbref: Optional[str] = None
|
|
temp_id: int = 0 # Temporary ID for referencing before insertion
|
|
|
|
def load_cached_data():
|
|
"""Load cached player and sbaplayer data"""
|
|
logger.info("Loading cached data...")
|
|
|
|
# Load SbaPlayers
|
|
with open('/tmp/sbaplayers.json', 'r') as f:
|
|
sbaplayer_data = json.load(f)
|
|
|
|
sbaplayers = []
|
|
for data in sbaplayer_data:
|
|
sbaplayers.append(SbaPlayerRecord(**data))
|
|
|
|
# Load all players
|
|
all_players = []
|
|
for season in range(1, 13):
|
|
with open(f"/tmp/players_season_{season}.json", 'r') as f:
|
|
season_data = json.load(f)
|
|
|
|
for data in season_data:
|
|
all_players.append(PlayerRecord(**data))
|
|
|
|
logger.info(f"Loaded {len(sbaplayers)} SbaPlayers and {len(all_players)} player records")
|
|
return all_players, sbaplayers
|
|
|
|
def parse_unmatched_decisions():
|
|
"""Parse decisions from the unmatched players CSV"""
|
|
logger.info("Parsing unmatched player decisions...")
|
|
|
|
decisions = []
|
|
csv_file = '/mnt/NV2/Development/major-domo/database/unmatched_players_for_review_updated.csv'
|
|
|
|
with open(csv_file, 'r', encoding='utf-8') as f:
|
|
reader = csv.DictReader(f)
|
|
|
|
# Debug: print column names
|
|
logger.info(f"CSV columns: {reader.fieldnames}")
|
|
|
|
for row in reader:
|
|
resolution = row.get('resolution', '').strip()
|
|
if not resolution or resolution == 'SKIP':
|
|
continue
|
|
|
|
# Handle encoded column names
|
|
player_id_key = next((k for k in row.keys() if 'player' in k.lower() and 'id' in k.lower()), 'player_id')
|
|
bbref_id_key = next((k for k in row.keys() if 'bbref' in k.lower() and 'id' in k.lower()), 'bbref_id')
|
|
seasons_key = next((k for k in row.keys() if 'seasons' in k.lower() and 'appeared' in k.lower()), 'seasons_appeared')
|
|
suggested_id_key = next((k for k in row.keys() if 'suggested' in k.lower() and 'sbaplayer' in k.lower() and 'id' in k.lower()), 'suggested_sbaplayer_id')
|
|
match_type_key = next((k for k in row.keys() if 'match' in k.lower() and 'type' in k.lower()), 'match_type')
|
|
|
|
decision = {
|
|
'player_id': int(row[player_id_key]),
|
|
'player_name': row['name'],
|
|
'bbref_id': row[bbref_id_key] if row[bbref_id_key] else None,
|
|
'seasons_appeared': row[seasons_key],
|
|
'resolution': resolution,
|
|
'suggested_sbaplayer_id': row[suggested_id_key],
|
|
'match_type': row[match_type_key]
|
|
}
|
|
decisions.append(decision)
|
|
|
|
logger.info(f"Found {len(decisions)} unmatched player decisions")
|
|
return decisions
|
|
|
|
def parse_high_risk_decisions():
|
|
"""Parse decisions from the high risk matches CSV"""
|
|
logger.info("Parsing high-risk match decisions...")
|
|
|
|
decisions = []
|
|
csv_file = '/mnt/NV2/Development/major-domo/database/high_risk_player_matches_updated.csv'
|
|
|
|
with open(csv_file, 'r', encoding='utf-8') as f:
|
|
reader = csv.DictReader(f)
|
|
|
|
# Debug: print column names
|
|
logger.info(f"High-risk CSV columns: {reader.fieldnames}")
|
|
|
|
for row in reader:
|
|
resolution = row.get('resolution', '').strip()
|
|
if not resolution or resolution == 'SKIP':
|
|
continue
|
|
|
|
# Handle encoded column names
|
|
risk_type_key = next((k for k in row.keys() if 'risk' in k.lower() and 'type' in k.lower()), 'risk_type')
|
|
player_id_key = next((k for k in row.keys() if 'player' in k.lower() and 'id' in k.lower()), 'player_id')
|
|
player_name_key = next((k for k in row.keys() if 'player' in k.lower() and 'name' in k.lower()), 'player_name')
|
|
sba1_id_key = next((k for k in row.keys() if 'sba1' in k.lower() and 'id' in k.lower()), 'sba1_id')
|
|
sba1_name_key = next((k for k in row.keys() if 'sba1' in k.lower() and 'name' in k.lower()), 'sba1_name')
|
|
sba2_id_key = next((k for k in row.keys() if 'sba2' in k.lower() and 'id' in k.lower()), 'sba2_id')
|
|
sba2_name_key = next((k for k in row.keys() if 'sba2' in k.lower() and 'name' in k.lower()), 'sba2_name')
|
|
|
|
decision = {
|
|
'risk_type': row[risk_type_key],
|
|
'resolution': resolution,
|
|
'player_id': int(row[player_id_key]) if row[player_id_key] else None,
|
|
'player_name': row[player_name_key],
|
|
'sba1_id': int(row[sba1_id_key]) if row[sba1_id_key] else None,
|
|
'sba1_name': row[sba1_name_key],
|
|
'sba2_id': int(row[sba2_id_key]) if row[sba2_id_key] else None,
|
|
'sba2_name': row[sba2_name_key]
|
|
}
|
|
decisions.append(decision)
|
|
|
|
logger.info(f"Found {len(decisions)} high-risk match decisions")
|
|
return decisions
|
|
|
|
def process_decisions(all_players, sbaplayers, unmatched_decisions, high_risk_decisions):
|
|
"""Process all decisions and generate new SbaPlayers and assignments"""
|
|
|
|
# Create lookup maps
|
|
sbaplayers_by_id = {sba.id: sba for sba in sbaplayers}
|
|
players_by_id = {p.id: p for p in all_players}
|
|
|
|
# Track what we're creating and assigning
|
|
new_sbaplayers = []
|
|
player_assignments = [] # (player_id, sbaplayer_id) pairs
|
|
sbaplayer_merges = [] # (from_id, to_id) pairs
|
|
temp_id_counter = 90000 # Start temp IDs at 90000 to avoid conflicts
|
|
|
|
logger.info("Processing unmatched player decisions...")
|
|
|
|
# Process unmatched player decisions
|
|
for decision in unmatched_decisions:
|
|
resolution = decision['resolution']
|
|
player_id = decision['player_id']
|
|
player_name = decision['player_name']
|
|
bbref_id = decision['bbref_id']
|
|
|
|
if resolution == 'ACCEPT':
|
|
# Use suggested SbaPlayer ID
|
|
suggested_id = decision['suggested_sbaplayer_id']
|
|
if suggested_id and not suggested_id.startswith('PARTIAL:'):
|
|
sbaplayer_id = int(suggested_id)
|
|
|
|
# Find all players with same identity and assign them
|
|
if bbref_id:
|
|
# Group by bbref_id
|
|
matching_players = [p for p in all_players if p.bbref_id == bbref_id]
|
|
else:
|
|
# Group by name (for players without bbref_id)
|
|
matching_players = [p for p in all_players if p.name == player_name and not p.bbref_id]
|
|
|
|
for player in matching_players:
|
|
player_assignments.append((player.id, sbaplayer_id))
|
|
|
|
elif resolution.startswith('USE_SBA_'):
|
|
# Use specific SbaPlayer ID
|
|
sbaplayer_id = int(resolution.replace('USE_SBA_', ''))
|
|
|
|
# Find all players with same identity
|
|
if bbref_id:
|
|
matching_players = [p for p in all_players if p.bbref_id == bbref_id]
|
|
else:
|
|
matching_players = [p for p in all_players if p.name == player_name and not p.bbref_id]
|
|
|
|
for player in matching_players:
|
|
player_assignments.append((player.id, sbaplayer_id))
|
|
|
|
elif resolution == 'CREATE_NEW':
|
|
# Create new SbaPlayer
|
|
name_parts = player_name.strip().split()
|
|
if len(name_parts) >= 2:
|
|
first_name = name_parts[0]
|
|
last_name = ' '.join(name_parts[1:]) # Handle multiple part last names
|
|
else:
|
|
first_name = player_name
|
|
last_name = ""
|
|
|
|
new_sba = NewSbaPlayer(
|
|
first_name=first_name,
|
|
last_name=last_name,
|
|
key_bbref=bbref_id,
|
|
temp_id=temp_id_counter
|
|
)
|
|
new_sbaplayers.append(new_sba)
|
|
|
|
# Find all players with same identity and assign to new SbaPlayer
|
|
if bbref_id:
|
|
matching_players = [p for p in all_players if p.bbref_id == bbref_id]
|
|
else:
|
|
matching_players = [p for p in all_players if p.name == player_name and not p.bbref_id]
|
|
|
|
for player in matching_players:
|
|
player_assignments.append((player.id, temp_id_counter))
|
|
|
|
temp_id_counter += 1
|
|
|
|
logger.info("Processing high-risk match decisions...")
|
|
|
|
# Process high-risk decisions
|
|
for decision in high_risk_decisions:
|
|
resolution = decision['resolution']
|
|
risk_type = decision['risk_type']
|
|
|
|
if resolution.startswith('MERGE_') and '_INTO_' in resolution:
|
|
# Parse merge instruction: MERGE_123_INTO_456
|
|
parts = resolution.replace('MERGE_', '').split('_INTO_')
|
|
from_id = int(parts[0])
|
|
to_id = int(parts[1])
|
|
sbaplayer_merges.append((from_id, to_id))
|
|
|
|
elif resolution.startswith('USE_SBA_') and decision['player_id']:
|
|
# Player ambiguous match resolved
|
|
sbaplayer_id = int(resolution.replace('USE_SBA_', ''))
|
|
player_id = decision['player_id']
|
|
player_name = decision['player_name']
|
|
|
|
# Find all players with same name (no bbref_id since these are ambiguous matches)
|
|
matching_players = [p for p in all_players if p.name == player_name and not p.bbref_id]
|
|
for player in matching_players:
|
|
player_assignments.append((player.id, sbaplayer_id))
|
|
|
|
logger.info(f"Generated {len(new_sbaplayers)} new SbaPlayers")
|
|
logger.info(f"Generated {len(player_assignments)} player assignments")
|
|
logger.info(f"Generated {len(sbaplayer_merges)} SbaPlayer merges")
|
|
|
|
return new_sbaplayers, player_assignments, sbaplayer_merges
|
|
|
|
def generate_new_sbaplayers_csv(new_sbaplayers):
|
|
"""Generate CSV file with new SbaPlayers to insert"""
|
|
output_file = '/mnt/NV2/Development/major-domo/database/new_sbaplayers_to_insert.csv'
|
|
|
|
with open(output_file, 'w', newline='') as csvfile:
|
|
writer = csv.writer(csvfile)
|
|
writer.writerow(['temp_id', 'first_name', 'last_name', 'key_bbref', 'key_fangraphs', 'key_mlbam', 'key_retro'])
|
|
|
|
for sba in new_sbaplayers:
|
|
writer.writerow([
|
|
sba.temp_id,
|
|
sba.first_name,
|
|
sba.last_name,
|
|
sba.key_bbref or '',
|
|
'', # key_fangraphs (empty)
|
|
'', # key_mlbam (empty)
|
|
'' # key_retro (empty)
|
|
])
|
|
|
|
logger.info(f"Generated new SbaPlayers CSV: {output_file}")
|
|
return output_file
|
|
|
|
def generate_player_assignments_csv(player_assignments, sbaplayer_merges, all_players, sbaplayers_by_id):
|
|
"""Generate CSV file with all player assignments"""
|
|
output_file = '/mnt/NV2/Development/major-domo/database/player_sbaplayer_assignments.csv'
|
|
|
|
# Create merge mapping for resolving final IDs
|
|
merge_mapping = {}
|
|
for from_id, to_id in sbaplayer_merges:
|
|
merge_mapping[from_id] = to_id
|
|
|
|
players_by_id = {p.id: p for p in all_players}
|
|
|
|
with open(output_file, 'w', newline='') as csvfile:
|
|
writer = csv.writer(csvfile)
|
|
writer.writerow([
|
|
'player_id', 'player_name', 'player_season', 'player_bbref_id',
|
|
'sbaplayer_id', 'sbaplayer_name', 'sbaplayer_bbref', 'assignment_type'
|
|
])
|
|
|
|
for player_id, sbaplayer_id in player_assignments:
|
|
player = players_by_id[player_id]
|
|
|
|
# Resolve final sbaplayer_id (in case of merges)
|
|
final_sbaplayer_id = merge_mapping.get(sbaplayer_id, sbaplayer_id)
|
|
|
|
# Get SbaPlayer info
|
|
if final_sbaplayer_id >= 90000:
|
|
# New SbaPlayer (temp ID)
|
|
sbaplayer_name = f"NEW_TEMP_{final_sbaplayer_id}"
|
|
sbaplayer_bbref = player.bbref_id or ''
|
|
assignment_type = 'new_sbaplayer'
|
|
else:
|
|
# Existing SbaPlayer
|
|
if final_sbaplayer_id in sbaplayers_by_id:
|
|
sba = sbaplayers_by_id[final_sbaplayer_id]
|
|
sbaplayer_name = f"{sba.first_name} {sba.last_name}"
|
|
sbaplayer_bbref = sba.key_bbref or ''
|
|
assignment_type = 'existing_sbaplayer'
|
|
else:
|
|
sbaplayer_name = f"UNKNOWN_SBA_{final_sbaplayer_id}"
|
|
sbaplayer_bbref = ''
|
|
assignment_type = 'error'
|
|
|
|
writer.writerow([
|
|
player.id,
|
|
player.name,
|
|
player.season,
|
|
player.bbref_id or '',
|
|
final_sbaplayer_id,
|
|
sbaplayer_name,
|
|
sbaplayer_bbref,
|
|
assignment_type
|
|
])
|
|
|
|
logger.info(f"Generated player assignments CSV: {output_file}")
|
|
return output_file
|
|
|
|
def generate_summary_report(new_sbaplayers, player_assignments, sbaplayer_merges):
|
|
"""Generate summary report"""
|
|
summary_file = '/mnt/NV2/Development/major-domo/database/processing_summary.txt'
|
|
|
|
with open(summary_file, 'w') as f:
|
|
f.write("MATCHING DECISIONS PROCESSING SUMMARY\n")
|
|
f.write("=" * 50 + "\n\n")
|
|
|
|
f.write(f"New SbaPlayers to create: {len(new_sbaplayers)}\n")
|
|
f.write(f"Player assignments to make: {len(player_assignments)}\n")
|
|
f.write(f"SbaPlayer merges to perform: {len(sbaplayer_merges)}\n\n")
|
|
|
|
f.write("FILES GENERATED:\n")
|
|
f.write("1. new_sbaplayers_to_insert.csv - New SbaPlayer records to create\n")
|
|
f.write("2. player_sbaplayer_assignments.csv - All player -> sbaplayer assignments\n\n")
|
|
|
|
f.write("SbaPlayer Merges:\n")
|
|
for from_id, to_id in sbaplayer_merges:
|
|
f.write(f" - Merge SbaPlayer {from_id} into {to_id}\n")
|
|
|
|
f.write(f"\nNEXT STEPS:\n")
|
|
f.write("1. Review the generated CSV files\n")
|
|
f.write("2. Execute SbaPlayer merges first (if any)\n")
|
|
f.write("3. Insert new SbaPlayers and get their real IDs\n")
|
|
f.write("4. Update all player records with sbaplayer_id assignments\n")
|
|
|
|
logger.info(f"Generated summary report: {summary_file}")
|
|
|
|
def main():
|
|
"""Main processing function"""
|
|
logger.info("Starting processing of matching decisions...")
|
|
|
|
# Load data
|
|
all_players, sbaplayers = load_cached_data()
|
|
sbaplayers_by_id = {sba.id: sba for sba in sbaplayers}
|
|
|
|
# Parse decisions
|
|
unmatched_decisions = parse_unmatched_decisions()
|
|
high_risk_decisions = parse_high_risk_decisions()
|
|
|
|
# Process decisions
|
|
new_sbaplayers, player_assignments, sbaplayer_merges = process_decisions(
|
|
all_players, sbaplayers, unmatched_decisions, high_risk_decisions
|
|
)
|
|
|
|
# Generate output files
|
|
new_sba_file = generate_new_sbaplayers_csv(new_sbaplayers)
|
|
assignments_file = generate_player_assignments_csv(
|
|
player_assignments, sbaplayer_merges, all_players, sbaplayers_by_id
|
|
)
|
|
generate_summary_report(new_sbaplayers, player_assignments, sbaplayer_merges)
|
|
|
|
logger.info("\n=== PROCESSING COMPLETE ===")
|
|
logger.info(f"New SbaPlayers CSV: {new_sba_file}")
|
|
logger.info(f"Player assignments CSV: {assignments_file}")
|
|
logger.info(f"Summary: processing_summary.txt")
|
|
|
|
if __name__ == "__main__":
|
|
main() |