#!/usr/bin/env python3
"""
Process manual review decisions and generate final assignment files.
"""

import csv
import json
import logging
from collections import defaultdict
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Set

# Logging configuration: records at INFO and above are sent both to the file
# 'matching.log' (relative to the current working directory) and to a
# StreamHandler using its default stream.
_log_handlers = [
    logging.FileHandler('matching.log'),
    logging.StreamHandler(),
]
logging.basicConfig(
    handlers=_log_handlers,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Logger used by every function in this module.
logger = logging.getLogger(f'{__name__}.process_manual_decisions')


@dataclass
class ManualDecision:
    """A single reviewed row of the manual-review CSV.

    Each attribute stores the raw string value of the CSV column that has the
    same name; no parsing or type conversion is done here.
    """

    # --- Columns describing the player group under review ---
    group_key: str          # rows with an empty group_key are skipped by the parser
    player_name: str        # used as the lookup key when assigning player records
    bbref_id: str           # presumably a Baseball-Reference ID - TODO confirm
    seasons_appeared: str   # kept verbatim; format not interpreted in this module
    sample_player_ids: str  # kept verbatim; format not interpreted in this module

    # --- Columns describing the suggested existing match ---
    potential_existing_sbaplayer_id: str
    potential_existing_sbaplayer_name: str
    potential_match_reason: str

    # --- Columns filled in by the reviewer ---
    # Non-empty: integer ID of the existing SbaPlayer to use.
    # Empty: the player needs a new SbaPlayer record.
    your_decision_sbaplayer_id: str
    # Free text; searched for "Three-way match" / "Two-way match" instructions.
    your_decision_notes: str


def load_cached_data(cache_dir='/tmp', seasons=range(1, 13)):
    """Load cached player and sbaplayer data.

    Args:
        cache_dir: Directory holding ``sbaplayers.json`` and the
            ``players_season_<n>.json`` files. Defaults to ``/tmp``.
        seasons: Iterable of season numbers to load. Defaults to 1..12.

    Returns:
        A ``(all_players, sbaplayers)`` tuple: the concatenated player records
        of every season file that was found, and the decoded content of
        ``sbaplayers.json``.

    Raises:
        FileNotFoundError: If ``sbaplayers.json`` is missing. A missing season
            file is only logged as a warning and skipped.
    """
    logger.info(f"Loading cached data from {cache_dir}/...")
    base = Path(cache_dir)

    # JSON is read explicitly as UTF-8 so player names with non-ASCII
    # characters decode the same way regardless of the platform locale.
    with open(base / 'sbaplayers.json', 'r', encoding='utf-8') as f:
        sbaplayers = json.load(f)
    logger.info(f"Loaded {len(sbaplayers)} SbaPlayer records")

    all_players = []
    for season in seasons:
        try:
            with open(base / f'players_season_{season}.json', 'r', encoding='utf-8') as f:
                season_players = json.load(f)
        except FileNotFoundError:
            logger.warning(f"Season {season} data not found")
        else:
            all_players.extend(season_players)

    logger.info(f"Loaded {len(all_players)} total player records")
    return all_players, sbaplayers


def load_previous_matches():
    """Load the automatically matched players from comprehensive analysis.

    NOTE: this is currently a placeholder. The report file is opened and read,
    but nothing is extracted from it, so the returned set is always empty.
    Re-running the matching logic, or saving the matched IDs separately, would
    be needed to populate it.

    Returns:
        An empty ``set``.
    """
    matched_players = set()

    try:
        with open('matching_report.txt', 'r') as report:
            # The text is read but not parsed yet (see the note above).
            report.read()
    except FileNotFoundError:
        logger.warning("matching_report.txt not found - will need to re-run comprehensive matching")
    else:
        logger.info("Loading previously matched players from existing analysis...")

    return matched_players


def parse_manual_decisions():
    """Parse manual decisions from the review CSV.

    Reads ``new_sbaplayers_for_review.csv`` from the current working directory
    and turns every row with a non-empty ``group_key`` into a ManualDecision.

    Returns:
        A list of ManualDecision objects, in file order.

    Raises:
        FileNotFoundError: If the review CSV does not exist.
        KeyError: If the CSV header lacks one of the expected columns.
    """
    # The export contains UTF-7 escape sequences (not HTML entities) for a few
    # ASCII characters; map each one back to the character it encodes.
    utf7_escapes = {
        '+AF8-': '_',
        '+ACI-': '"',
        '+AC0-': '-',
    }
    columns = (
        'group_key',
        'player_name',
        'bbref_id',
        'seasons_appeared',
        'sample_player_ids',
        'potential_existing_sbaplayer_id',
        'potential_existing_sbaplayer_name',
        'potential_match_reason',
        'your_decision_sbaplayer_id',
        'your_decision_notes',
    )

    # 'utf-8-sig' drops a leading BOM if present (otherwise the first header
    # would read '\ufeffgroup_key' and every row would be skipped); newline=''
    # leaves line endings untouched for the csv module.
    with open('new_sbaplayers_for_review.csv', 'r', encoding='utf-8-sig', newline='') as f:
        content = f.read()

    for escape, char in utf7_escapes.items():
        content = content.replace(escape, char)

    # Keep the line terminators so quoted fields that span several lines are
    # reassembled with their embedded newlines intact.
    reader = csv.DictReader(content.strip().splitlines(keepends=True))

    decisions = []
    for row in reader:
        if not row.get('group_key'):  # Skip empty rows
            continue
        # DictReader fills the missing trailing fields of a short row with
        # None; normalise those to '' so every attribute is really a str.
        decisions.append(ManualDecision(**{col: row[col] or '' for col in columns}))

    logger.info(f"Parsed {len(decisions)} manual decisions")
    return decisions


def determine_final_assignments(all_players, sbaplayers, decisions):
    """Determine final sbaplayer_id assignments for all players.

    Args:
        all_players: Player records. Not used here; kept so the call
            signature stays unchanged.
        sbaplayers: Existing SbaPlayer records (dicts with an ``'id'`` key).
            Only used to warn when a reviewer picked an ID that is not loaded.
        decisions: Reviewed rows; each needs the attributes ``player_name``,
            ``bbref_id``, ``your_decision_sbaplayer_id`` and
            ``your_decision_notes``.

    Returns:
        A tuple ``(existing_matches, new_players_needed, three_way_matches)``:
        * existing_matches: player_name -> chosen existing SbaPlayer ID (int)
        * new_players_needed: set of player names needing a new SbaPlayer
        * three_way_matches: player_name -> canonical name to consolidate under
    """
    known_sbaplayer_ids = {sp['id'] for sp in sbaplayers}

    existing_matches = {}       # player_name -> sbaplayer_id
    new_players_needed = set()  # player names that need new SbaPlayer records
    three_way_matches = {}      # player_name -> canonical name

    # The legitimate Francisco Mejia row; looked up once instead of once per
    # "Two-way match" note.
    mejia_decision = next(
        (d for d in decisions if d.bbref_id == "mejiafr01"), None
    )

    for decision in decisions:
        # Cells may be None (short CSV row) or contain only whitespace; both
        # mean "nothing entered".
        chosen_id = (decision.your_decision_sbaplayer_id or '').strip()
        notes = decision.your_decision_notes or ''

        if chosen_id:
            # User chose an existing SbaPlayer
            try:
                sbaplayer_id = int(chosen_id)
            except ValueError:
                logger.warning(f"Invalid SbaPlayer ID: {decision.your_decision_sbaplayer_id}")
            else:
                if sbaplayer_id not in known_sbaplayer_ids:
                    logger.warning(
                        f"SbaPlayer ID {sbaplayer_id} chosen for {decision.player_name} "
                        f"is not among the loaded SbaPlayer records"
                    )
                existing_matches[decision.player_name] = sbaplayer_id
                logger.info(f"Manual match: {decision.player_name} -> SbaPlayer ID {sbaplayer_id}")
        else:
            # User decided this needs a new SbaPlayer record
            new_players_needed.add(decision.player_name)

        # Three-way matches: the canonical name is given in the notes. Only
        # the one instruction present in the reviewed data is recognised.
        if "Three-way match" in notes and "use name Tom Eshelman" in notes:
            three_way_matches[decision.player_name] = "Tom Eshelman"

        # Two-way matches (corrupted data): join this record with the
        # legitimate mejiafr01 row, if there is one.
        if (
            "Two-way match" in notes
            and "join with bbref_id mejiafr01" in notes
            and mejia_decision is not None
        ):
            three_way_matches[decision.player_name] = mejia_decision.player_name

    logger.info(f"Found {len(existing_matches)} existing matches")
    logger.info(f"Found {len(new_players_needed)} players needing new records")
    logger.info(f"Found {len(three_way_matches)} three-way matches")

    return existing_matches, new_players_needed, three_way_matches


def _assignment_row(player, sbaplayer_id, match_type):
    """Build one output row linking a player record to an SbaPlayer ID."""
    return {
        'player_id': player['id'],
        'player_name': player['name'],
        'season': player['season'],
        'bbref_id': player.get('bbref_id', ''),
        'assigned_sbaplayer_id': sbaplayer_id,
        'match_type': match_type,
    }


def _new_sbaplayer_row(new_id, name, sample, notes):
    """Build one 'SbaPlayer to insert' row, taking key_bbref from *sample*."""
    return {
        'new_id': new_id,
        'name': name,
        'key_bbref': sample.get('bbref_id', ''),
        'key_fangraphs': '',
        'key_mlb': '',
        'notes': notes,
    }


def generate_assignment_files(all_players, existing_matches, new_players_needed,
                              three_way_matches, sbaplayers=None):
    """Generate the final assignment files.

    NOTE: this only covers the manual decisions; the comprehensive automatic
    matching still has to be integrated to obtain assignments for all players.

    Args:
        all_players: Player records (dicts with ``id``, ``name``, ``season``
            and optionally ``bbref_id``).
        existing_matches: player_name -> ID of an existing SbaPlayer.
        new_players_needed: Names that need a new SbaPlayer record.
        three_way_matches: name -> canonical name to consolidate under.
        sbaplayers: Existing SbaPlayer records (dicts with an ``id`` key), used
            to pick the first free ID. When omitted they are read from
            ``/tmp/sbaplayers.json``, the file load_cached_data() reads.

    Returns:
        ``(assignments, new_sbaplayers)``: the per-player assignment rows and
        the rows describing the SbaPlayer records to create.
    """
    logger.info("Generating assignment files...")

    if sbaplayers is None:
        with open('/tmp/sbaplayers.json', 'r', encoding='utf-8') as f:
            sbaplayers = json.load(f)
    # default=0 keeps this working when no SbaPlayer exists yet.
    next_sbaplayer_id = max((sp['id'] for sp in sbaplayers), default=0) + 1

    # Index the players by name once instead of scanning the list per name.
    players_by_name = defaultdict(list)
    for player in all_players:
        players_by_name[player['name']].append(player)

    assignments = []
    new_sbaplayers = []

    # 1. Reviewer picked an existing SbaPlayer.
    for player_name, sbaplayer_id in existing_matches.items():
        assignments.extend(
            _assignment_row(player, sbaplayer_id, 'manual_existing')
            for player in players_by_name.get(player_name, [])
        )

    # 2. Regular new players. Names taking part in a consolidation (as alias
    #    or as canonical name) are handled in step 3, so they are skipped here
    #    to avoid assigning the same record twice. Sorting makes the ID
    #    numbering reproducible from run to run.
    consolidated_names = set(three_way_matches) | set(three_way_matches.values())
    for player_name in sorted(new_players_needed):
        if player_name in consolidated_names:
            continue
        records = players_by_name.get(player_name, [])
        if not records:
            continue
        new_sbaplayers.append(_new_sbaplayer_row(
            next_sbaplayer_id,
            player_name,
            records[0],
            f'Created from manual review for {len(records)} player records',
        ))
        assignments.extend(
            _assignment_row(player, next_sbaplayer_id, 'manual_new')
            for player in records
        )
        next_sbaplayer_id += 1

    # 3. Consolidations: one target per canonical name, however many aliases
    #    point at it.
    aliases_by_canonical = {}
    for name, canonical_name in three_way_matches.items():
        aliases = aliases_by_canonical.setdefault(canonical_name, set())
        if name in new_players_needed and name != canonical_name:
            aliases.add(name)

    for canonical_name in sorted(aliases_by_canonical):
        aliases = sorted(aliases_by_canonical[canonical_name])

        if canonical_name in existing_matches:
            # The canonical player already maps to an existing SbaPlayer (its
            # own records were assigned in step 1); attach the aliases to it
            # instead of creating a duplicate record.
            target_id = existing_matches[canonical_name]
            for alias in aliases:
                assignments.extend(
                    _assignment_row(player, target_id, 'manual_consolidated')
                    for player in players_by_name.get(alias, [])
                )
            continue

        # Canonical name first, so its record supplies key_bbref when present.
        all_variations = [canonical_name] + aliases
        records = [
            player
            for variation in all_variations
            for player in players_by_name.get(variation, [])
        ]
        if not records:
            continue
        new_sbaplayers.append(_new_sbaplayer_row(
            next_sbaplayer_id,
            canonical_name,
            records[0],
            f'Consolidated from {len(all_variations)} name variations: {", ".join(all_variations)}',
        ))
        assignments.extend(
            _assignment_row(player, next_sbaplayer_id, 'manual_consolidated')
            for player in records
        )
        next_sbaplayer_id += 1

    return assignments, new_sbaplayers


def _write_dict_rows(path, rows):
    """Write *rows* (a list of dicts) to *path* as CSV with a header line.

    The column order comes from the keys of the first row. When *rows* is
    empty the file is still created (or truncated) and left empty.
    """
    # UTF-8 is set explicitly so names with non-ASCII characters are written
    # identically on every platform.
    with open(path, 'w', newline='', encoding='utf-8') as f:
        if rows:
            writer = csv.DictWriter(f, fieldnames=list(rows[0].keys()))
            writer.writeheader()
            writer.writerows(rows)


def save_assignment_files(assignments, new_sbaplayers):
    """Save the final assignment files.

    Args:
        assignments: Rows for ``player_sbaplayer_assignments.csv``.
        new_sbaplayers: Rows for ``new_sbaplayers_to_insert.csv``.

    Both files are written to the current working directory.
    """
    _write_dict_rows('player_sbaplayer_assignments.csv', assignments)
    logger.info(f"Saved {len(assignments)} player assignments to player_sbaplayer_assignments.csv")

    _write_dict_rows('new_sbaplayers_to_insert.csv', new_sbaplayers)
    logger.info(f"Saved {len(new_sbaplayers)} new SbaPlayer records to new_sbaplayers_to_insert.csv")


def main():
    """Main processing function.

    Runs the pipeline end to end: load the cached data, parse the manual
    decisions, derive the assignments, write the two CSV files and print a
    short summary.

    Raises:
        Exception: Any error raised by a pipeline step is logged together
            with its traceback and then re-raised unchanged.
    """
    logger.info("Starting manual decision processing...")

    try:
        # Load cached data
        all_players, sbaplayers = load_cached_data()

        # Parse manual decisions
        decisions = parse_manual_decisions()

        # Determine final assignments
        existing_matches, new_players_needed, three_way_matches = determine_final_assignments(
            all_players, sbaplayers, decisions
        )

        # Generate assignment files
        assignments, new_sbaplayers = generate_assignment_files(
            all_players, existing_matches, new_players_needed, three_way_matches
        )

        # Save files
        save_assignment_files(assignments, new_sbaplayers)

        logger.info("Manual decision processing completed successfully!")
        print("\nSUMMARY:")
        print(f"- {len(assignments)} player assignments generated")
        print(f"- {len(new_sbaplayers)} new SbaPlayer records to create")
        print("- Files saved: player_sbaplayer_assignments.csv, new_sbaplayers_to_insert.csv")

    except Exception as e:
        # logger.exception logs at ERROR level like logger.error, but also
        # records the traceback, so the log file shows where the run failed.
        logger.exception(f"Error processing manual decisions: {e}")
        raise


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()