paper-dynasty-card-creation/pd_cards/commands/retrosheet.py
Cal Corum 5b75a3d38f Implement CLI wrappers for live-series, retrosheet, scouting, upload
Migrated all major card creation workflows to pd-cards CLI:

live-series:
- update: Full FanGraphs/BBRef card generation with CLI options
- status: Show cardset status from database

retrosheet:
- process: Historical Retrosheet data processing
- arms: Generate outfield arm ratings from play-by-play
- validate: Check for position anomalies in cardsets
- defense: Fetch defensive stats from Baseball Reference

scouting:
- batters: Generate batting scouting reports
- pitchers: Generate pitching scouting reports
- all: Generate all reports at once

upload:
- s3: Upload card images to AWS S3
- check: Validate cards without uploading
- refresh: Re-generate and re-upload card images

Updated CLAUDE.md with comprehensive CLI documentation.
Legacy scripts remain available but CLI is now the primary interface.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-18 16:39:38 -06:00

379 lines
13 KiB
Python

"""
Retrosheet historical data processing commands.
Commands for generating cards from historical Retrosheet play-by-play data.
"""
import asyncio
from pathlib import Path
from typing import Optional
import typer
from rich.console import Console
# Typer application that groups the Retrosheet commands defined in this module;
# invoking it with no arguments prints the help text (no_args_is_help=True).
app = typer.Typer(no_args_is_help=True)
# Rich console shared by the commands in this module for all terminal output.
console = Console()
def _is_yyyymmdd(value: str) -> bool:
    """Return True when *value* consists of exactly eight ASCII digits."""
    return len(value) == 8 and value.isascii() and value.isdigit()


@app.command()
def process(
    year: int = typer.Argument(..., help="Season year to process (e.g., 2005)"),
    cardset_id: int = typer.Option(..., "--cardset-id", "-c", help="Target cardset ID"),
    description: str = typer.Option("Live", "--description", "-d", help="Player description ('Live' or 'Month PotM')"),
    start_date: Optional[str] = typer.Option(None, "--start", help="Start date YYYYMMDD (defaults to March 1)"),
    end_date: Optional[str] = typer.Option(None, "--end", help="End date YYYYMMDD (defaults to Oct 2)"),
    events_file: Optional[str] = typer.Option(None, "--events", "-e", help="Retrosheet events CSV filename"),
    data_input: Optional[str] = typer.Option(None, "--input", "-i", help="Data input directory path"),
    season_pct: float = typer.Option(1.0, "--season-pct", help="Season percentage (0.0-1.0)"),
    min_pa_vl: Optional[int] = typer.Option(None, "--min-pa-vl", help="Minimum PA vs LHP (default: 20 Live, 1 PotM)"),
    min_pa_vr: Optional[int] = typer.Option(None, "--min-pa-vr", help="Minimum PA vs RHP (default: 40 Live, 1 PotM)"),
    post_data: bool = typer.Option(True, "--post/--no-post", help="Post data to database"),
    dry_run: bool = typer.Option(False, "--dry-run", "-n", help="Preview without saving to database"),
):
    """
    Process Retrosheet data and create player cards.

    Generates batting and pitching cards from historical play-by-play data.

    Example:
    pd-cards retrosheet process 2005 --cardset-id 27 --description Live
    """
    # NOTE: the docstring above doubles as the CLI help text, so developer
    # notes live in comments instead.
    #
    # Flow: fill in per-year defaults -> echo the effective configuration ->
    # validate the date range -> (optionally stop for --dry-run) -> push the
    # configuration into the `retrosheet_data` module globals and run its
    # async `main()`.
    #
    # Exit codes: 0 on success or dry run, 1 on invalid dates, import failure,
    # or any exception raised while processing.
    console.print()
    console.print("=" * 70)
    console.print(f"[bold]RETROSHEET PROCESSING - {year}[/bold]")
    console.print("=" * 70)

    # Calculate defaults based on description: "Live" cardsets use real
    # plate-appearance minimums, anything else (e.g. PotM) accepts 1 PA.
    is_live = 'live' in description.lower()
    if min_pa_vl is None:
        min_pa_vl = 20 if is_live else 1
    if min_pa_vr is None:
        min_pa_vr = 40 if is_live else 1
    if start_date is None:
        start_date = f"{year}0301"
    if end_date is None:
        end_date = f"{year}1002"
    if events_file is None:
        events_file = f"retrosheets_events_{year}.csv"
    if data_input is None:
        data_input = f"data-input/{year} Live Cardset/"

    console.print(f"Cardset ID: {cardset_id}")
    console.print(f"Description: {description}")
    console.print(f"Date Range: {start_date} - {end_date}")
    console.print(f"Season %: {season_pct:.0%}")
    console.print(f"Min PA: vL={min_pa_vl}, vR={min_pa_vr}")
    console.print(f"Events: {events_file}")
    console.print(f"Input: {data_input}")
    console.print()

    # Validate the dates up front so a typo is reported as a usage error
    # (and so --dry-run's "Validation passed" is actually backed by a check)
    # instead of surfacing later as an int() traceback.
    for option_name, option_value in (("--start", start_date), ("--end", end_date)):
        if not _is_yyyymmdd(option_value):
            console.print(f"[red]Error: {option_name} must be in YYYYMMDD format, got {option_value!r}[/red]")
            raise typer.Exit(1)
    if int(start_date) > int(end_date):
        console.print(f"[red]Error: --start ({start_date}) is after --end ({end_date})[/red]")
        raise typer.Exit(1)

    if dry_run:
        console.print("[yellow]DRY RUN - no changes will be made[/yellow]")
        console.print()
        console.print("[green]Validation passed - ready to run[/green]")
        console.print()
        console.print("To run for real, remove --dry-run flag")
        raise typer.Exit(0)

    # Import and configure the retrosheet module
    try:
        import sys
        # Make the directory three levels above this file importable so the
        # top-level `retrosheet_data` module can be found.
        sys.path.insert(0, str(Path(__file__).parent.parent.parent))
        import retrosheet_data as rd

        # Configure the module's globals
        rd.CARDSET_ID = cardset_id
        rd.PLAYER_DESCRIPTION = description
        rd.START_DATE = int(start_date)
        rd.END_DATE = int(end_date)
        rd.SEASON_PCT = season_pct
        rd.MIN_PA_VL = min_pa_vl
        rd.MIN_PA_VR = min_pa_vr
        # The TBF minimums are set to the same values as the PA minimums.
        rd.MIN_TBF_VL = min_pa_vl
        rd.MIN_TBF_VR = min_pa_vr
        rd.POST_DATA = post_data
        rd.EVENTS_FILENAME = events_file
        rd.DATA_INPUT_FILE_PATH = data_input

        console.print("[bold]Starting Retrosheet processing...[/bold]")
        console.print()

        # Run the main function
        asyncio.run(rd.main())

        console.print()
        console.print("=" * 70)
        console.print("[bold green]✓ RETROSHEET PROCESSING COMPLETE[/bold green]")
        console.print("=" * 70)
    except ImportError as e:
        console.print(f"[red]Error importing modules: {e}[/red]")
        console.print("Make sure you're running from the card-creation directory")
        raise typer.Exit(1)
    except Exception as e:
        # Top-level CLI boundary: report, show the traceback, exit non-zero.
        console.print(f"[red]Error: {e}[/red]")
        import traceback
        traceback.print_exc()
        raise typer.Exit(1)
@app.command()
def arms(
    year: int = typer.Argument(..., help="Season year"),
    events_file: Path = typer.Option(..., "--events", "-e", help="Retrosheet events CSV file"),
    output: Optional[Path] = typer.Option(None, "--output", "-o", help="Output CSV file"),
    season_pct: float = typer.Option(1.0, "--season-pct", help="Season percentage for min sample"),
):
    """
    Generate outfield arm ratings from Retrosheet data.

    Analyzes play-by-play events to calculate OF arm strength ratings.

    Example:
    pd-cards retrosheet arms 2005 --events data-input/retrosheet/retrosheets_events_2005.csv
    """
    # NOTE: the docstring above doubles as the CLI help text, so developer
    # notes live in comments instead.
    #
    # Flow: read the events CSV -> compute ratings via
    # `calculate_of_arms_from_retrosheet` -> flatten to one CSV row per
    # rating key -> write the CSV -> print a per-position summary table.
    #
    # Exit codes: 1 when the events file is missing, on import failure, or on
    # any exception raised while calculating/writing.
    console.print()
    console.print("=" * 70)
    console.print(f"[bold]OUTFIELD ARM RATINGS - {year}[/bold]")
    console.print("=" * 70)

    if not events_file.exists():
        console.print(f"[red]Events file not found: {events_file}[/red]")
        raise typer.Exit(1)

    if output is None:
        output = Path(f"data-output/retrosheet_arm_ratings_{year}.csv")

    console.print(f"Events file: {events_file}")
    console.print(f"Output: {output}")
    console.print(f"Season %: {season_pct:.0%}")
    console.print()

    try:
        import sys
        # Make the directory three levels above this file importable so the
        # top-level `defenders` package can be found.
        sys.path.insert(0, str(Path(__file__).parent.parent.parent))
        import pandas as pd
        from defenders.retrosheet_arm_calculator import calculate_of_arms_from_retrosheet

        console.print("Loading Retrosheet events...")
        df_events = pd.read_csv(events_file)
        console.print(f" Loaded {len(df_events)} events")

        console.print("Calculating arm ratings...")
        arm_ratings = calculate_of_arms_from_retrosheet(df_events, season_pct=season_pct)

        # Convert to DataFrame for CSV output.
        # Column order is declared explicitly so that an empty result still
        # yields a frame with these columns; without it, sorting an empty,
        # column-less frame raises KeyError.
        output_columns = [
            'player_id',
            'position',
            'season',
            'arm_rating',
            'balls_fielded',
            'total_assists',
            'assist_rate',
            'z_score',
        ]
        rows = []
        for key, rating in arm_ratings.items():
            # The key is split on its last underscore into player id and
            # position — presumably "<player_id>_<position>"; verify against
            # the calculator if the key format ever changes.
            player_id, position = key.rsplit('_', 1)
            rows.append({
                'player_id': player_id,
                'position': position.upper(),
                'season': year,
                'arm_rating': rating.get('arm_rating', 0),
                'balls_fielded': rating.get('balls_fielded', 0),
                'total_assists': rating.get('total_assists', 0),
                'assist_rate': rating.get('assist_rate', 0),
                'z_score': rating.get('z_score', 0),
            })

        df_output = pd.DataFrame(rows, columns=output_columns)
        df_output = df_output.sort_values(['position', 'arm_rating'])

        if df_output.empty:
            console.print("[yellow]No arm ratings were produced from the events file[/yellow]")

        # Ensure output directory exists
        output.parent.mkdir(parents=True, exist_ok=True)
        df_output.to_csv(output, index=False)

        console.print()
        console.print(f"[green]✓ Saved {len(df_output)} arm ratings to {output}[/green]")

        # Show distribution summary
        from rich.table import Table
        table = Table(title="Arm Rating Distribution")
        table.add_column("Position")
        table.add_column("Count", justify="right")
        table.add_column("Avg Rating", justify="right")
        table.add_column("Min", justify="right")
        table.add_column("Max", justify="right")

        for pos in ['LF', 'CF', 'RF']:
            pos_df = df_output[df_output['position'] == pos]
            # Positions with no rows are left out of the table entirely.
            if len(pos_df) > 0:
                table.add_row(
                    pos,
                    str(len(pos_df)),
                    f"{pos_df['arm_rating'].mean():.1f}",
                    str(int(pos_df['arm_rating'].min())),
                    str(int(pos_df['arm_rating'].max())),
                )

        console.print()
        console.print(table)
    except ImportError as e:
        console.print(f"[red]Error importing modules: {e}[/red]")
        raise typer.Exit(1)
    except Exception as e:
        # Top-level CLI boundary: report, show the traceback, exit non-zero.
        console.print(f"[red]Error: {e}[/red]")
        import traceback
        traceback.print_exc()
        raise typer.Exit(1)
# Default for --api; kept as a constant so the command can tell when the
# user supplied a different value.
_VALIDATE_DEFAULT_API_URL = "https://pd.manticorum.com/api"

# A cardset with more than this many primary-DH players is reported as anomalous.
_MAX_EXPECTED_DH = 5


@app.command()
def validate(
    cardset_id: int = typer.Argument(..., help="Cardset ID to validate"),
    api_url: str = typer.Option(_VALIDATE_DEFAULT_API_URL, "--api", help="API URL"),
):
    """
    Validate positions for a cardset.

    Checks for anomalous DH counts and missing outfield positions.

    Example:
    pd-cards retrosheet validate 27
    """
    # NOTE: the docstring above doubles as the CLI help text, so developer
    # notes live in comments instead.
    #
    # Flow: fetch the cardset's players through `db_get` -> tally each
    # player's `pos_1` -> print the distribution -> report a DH count above
    # the threshold and any of LF/CF/RF that has zero players.
    #
    # Exit codes: 1 on import failure or any exception; finding anomalies
    # does not change the exit code.
    console.print()
    console.print("=" * 70)
    console.print(f"[bold]POSITION VALIDATION - Cardset {cardset_id}[/bold]")
    console.print("=" * 70)

    # `api_url` is not passed to anything below, so make that visible rather
    # than silently ignoring a value the user explicitly provided.
    if api_url != _VALIDATE_DEFAULT_API_URL:
        console.print("[yellow]Warning: --api value is currently ignored by this command[/yellow]")

    try:
        import sys
        from collections import Counter
        # Make the directory three levels above this file importable so the
        # top-level `db_calls` module can be found.
        sys.path.insert(0, str(Path(__file__).parent.parent.parent))
        from db_calls import db_get

        async def run_validation():
            # Get all players in cardset
            console.print(f"Fetching players from cardset {cardset_id}...")
            players = await db_get('players', params=[('cardset_id', cardset_id)])

            if players is None or players['count'] == 0:
                console.print(f"[yellow]No players found in cardset {cardset_id}[/yellow]")
                return

            console.print(f"Found {players['count']} players")
            console.print()

            # Count positions (players with an empty/missing pos_1 are skipped)
            pos_counts = Counter()
            dh_players = []
            for player in players['players']:
                pos_1 = player.get('pos_1', '')
                if not pos_1:
                    continue
                pos_counts[pos_1] += 1
                if pos_1 == 'DH':
                    dh_players.append(f"{player.get('p_name', 'Unknown')} (ID: {player['player_id']})")

            # Display position counts
            from rich.table import Table
            table = Table(title="Position Distribution")
            table.add_column("Position")
            table.add_column("Count", justify="right")
            for pos in sorted(pos_counts):
                count = pos_counts[pos]
                # Highlight the DH row when it exceeds the threshold.
                style = "red" if pos == 'DH' and count > _MAX_EXPECTED_DH else None
                table.add_row(pos, str(count), style=style)
            console.print(table)
            console.print()

            # Check for anomalies
            issues = []
            dh_count = pos_counts.get('DH', 0)
            # NOTE(review): the check fires at > 5 while the message says
            # "<5", so a count of exactly 5 is not reported — confirm which
            # bound is intended.
            if dh_count > _MAX_EXPECTED_DH:
                issues.append(f"[red]⚠ Anomalous DH count: {dh_count} (should be <5 for full-season cards)[/red]")
                console.print("DH Players:")
                # Only list the first ten to keep the output readable.
                for p in dh_players[:10]:
                    console.print(f" - {p}")
                if len(dh_players) > 10:
                    console.print(f" ... and {len(dh_players) - 10} more")
                console.print()

            for pos in ['LF', 'CF', 'RF']:
                if pos_counts.get(pos, 0) == 0:
                    issues.append(f"[red]⚠ Missing {pos} positions (indicates defensive calculation failures)[/red]")

            if issues:
                console.print("[bold]Issues Found:[/bold]")
                for issue in issues:
                    console.print(f" {issue}")
            else:
                console.print("[green]✓ No position anomalies detected[/green]")

        asyncio.run(run_validation())
    except ImportError as e:
        console.print(f"[red]Error importing modules: {e}[/red]")
        raise typer.Exit(1)
    except Exception as e:
        # Top-level CLI boundary: report, show the traceback (matching the
        # other commands in this module), exit non-zero.
        console.print(f"[red]Error: {e}[/red]")
        import traceback
        traceback.print_exc()
        raise typer.Exit(1)
@app.command()
def defense(
    year: int = typer.Argument(..., help="Season year to fetch defense stats for"),
    output_dir: Optional[Path] = typer.Option(None, "--output", "-o", help="Output directory for CSV files"),
):
    """
    Fetch and store defensive statistics from Baseball Reference.

    Downloads fielding stats for all positions and saves to CSV files.

    Example:
    pd-cards retrosheet defense 2005 --output "data-input/2005 Live Cardset/"
    """
    # NOTE: the docstring above doubles as the CLI help text, so developer
    # notes live in comments instead.
    #
    # Flow: for each position, call `get_bbref_fielding_df(position, year)`
    # and write the result to `<output_dir>/defense_<position>.csv`, pausing
    # between requests.
    #
    # Exit codes: 1 on import failure or any exception; files already written
    # before a failure are left in place.
    console.print()
    console.print("=" * 70)
    console.print(f"[bold]FETCH DEFENSIVE STATS - {year}[/bold]")
    console.print("=" * 70)

    if output_dir is None:
        output_dir = Path(f"data-input/{year} Live Cardset/")

    console.print(f"Output directory: {output_dir}")
    console.print()

    try:
        import sys
        # Make the directory three levels above this file importable so the
        # top-level `defenders` package can be found.
        sys.path.insert(0, str(Path(__file__).parent.parent.parent))
        import defenders.calcs_defense as cde

        positions = ['c', '1b', '2b', '3b', 'ss', 'lf', 'cf', 'rf', 'of', 'p']
        output_dir.mkdir(parents=True, exist_ok=True)

        async def fetch_defense():
            for index, position in enumerate(positions):
                # Rate limiting: wait between consecutive requests only, so
                # the command does not idle for 8 seconds after the last one.
                if index > 0:
                    await asyncio.sleep(8)

                console.print(f"Fetching {position.upper()} defensive stats...")
                pos_df = cde.get_bbref_fielding_df(position, year)
                output_file = output_dir / f"defense_{position}.csv"
                pos_df.to_csv(output_file)
                console.print(f" [green]✓ Saved {len(pos_df)} records to {output_file}[/green]")

            console.print()
            console.print(f"[green]✓ All defensive stats saved to {output_dir}[/green]")

        asyncio.run(fetch_defense())
    except ImportError as e:
        console.print(f"[red]Error importing modules: {e}[/red]")
        raise typer.Exit(1)
    except Exception as e:
        # Top-level CLI boundary: report, show the traceback, exit non-zero.
        console.print(f"[red]Error: {e}[/red]")
        import traceback
        traceback.print_exc()
        raise typer.Exit(1)