- Add --last-week-ratio, --last-twoweeks-ratio, --last-month-ratio flags - Auto-enable 0.2 recency bias for last 2 weeks on Live series after May 30 - Fix main() call to pass empty args list (legacy parameter required) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
396 lines
15 KiB
Python
396 lines
15 KiB
Python
"""
|
|
Retrosheet historical data processing commands.
|
|
|
|
Commands for generating cards from historical Retrosheet play-by-play data.
|
|
"""
|
|
|
|
import asyncio
|
|
from pathlib import Path
|
|
from typing import Optional
|
|
|
|
import typer
|
|
from rich.console import Console
|
|
|
|
# Typer application for this command group; each command below registers
# itself on it with @app.command(). no_args_is_help=True asks Typer to show
# the help text when the group is invoked without any arguments.
app = typer.Typer(no_args_is_help=True)
|
|
# Shared Rich console used by the commands in this module to print status
# and error messages (Rich markup such as [red]...[/red] is used throughout).
console = Console()
|
|
|
|
|
|
def _require_yyyymmdd(value: str, option_name: str) -> str:
    """Return *value* unchanged when it is exactly eight ASCII digits.

    Prints an error and exits with status 1 otherwise. Only the shape is
    checked (not calendar validity), because the value is later sliced as
    ``value[4:]`` and converted with ``int()``; anything that is not eight
    digits would either crash there or yield a meaningless MMDD.
    """
    if len(value) == 8 and value.isascii() and value.isdigit():
        return value
    console.print(f"[red]Invalid {option_name} date '{value}': expected YYYYMMDD[/red]")
    raise typer.Exit(1)


@app.command()
def process(
    year: int = typer.Argument(..., help="Season year to process (e.g., 2005)"),
    cardset_id: int = typer.Option(..., "--cardset-id", "-c", help="Target cardset ID"),
    description: str = typer.Option("Live", "--description", "-d", help="Player description ('Live' or 'Month PotM')"),
    start_date: Optional[str] = typer.Option(None, "--start", help="Start date YYYYMMDD (defaults to March 1)"),
    end_date: Optional[str] = typer.Option(None, "--end", help="End date YYYYMMDD (defaults to Oct 2)"),
    events_file: Optional[str] = typer.Option(None, "--events", "-e", help="Retrosheet events CSV filename"),
    data_input: Optional[str] = typer.Option(None, "--input", "-i", help="Data input directory path"),
    season_pct: float = typer.Option(1.0, "--season-pct", help="Season percentage (0.0-1.0)"),
    min_pa_vl: Optional[int] = typer.Option(None, "--min-pa-vl", help="Minimum PA vs LHP (default: 20 Live, 1 PotM)"),
    min_pa_vr: Optional[int] = typer.Option(None, "--min-pa-vr", help="Minimum PA vs RHP (default: 40 Live, 1 PotM)"),
    post_data: bool = typer.Option(True, "--post/--no-post", help="Post data to database"),
    dry_run: bool = typer.Option(False, "--dry-run", "-n", help="Preview without saving to database"),
    last_week_ratio: float = typer.Option(0.0, "--last-week-ratio", help="Recency bias: weight for last week's stats (0.0-1.0)"),
    last_twoweeks_ratio: Optional[float] = typer.Option(None, "--last-twoweeks-ratio", help="Recency bias: weight for last 2 weeks' stats (0.0-1.0, default: 0.2 for Live after May 30)"),
    last_month_ratio: float = typer.Option(0.0, "--last-month-ratio", help="Recency bias: weight for last month's stats (0.0-1.0)"),
):
    """
    Process Retrosheet data and create player cards.

    Generates batting and pitching cards from historical play-by-play data.

    Example:
        pd-cards retrosheet process 2005 --cardset-id 27 --description Live
    """
    # NOTE: the docstring above is rendered by Typer as the command's --help
    # text, so implementation notes live in comments instead.
    #
    # Flow: resolve option defaults -> validate dates -> print a summary ->
    # (dry run stops here) -> copy the settings onto the retrosheet_data
    # module's globals and run its async main().
    console.print()
    console.print("=" * 70)
    console.print(f"[bold]RETROSHEET PROCESSING - {year}[/bold]")
    console.print("=" * 70)

    # Calculate defaults based on description
    is_live = 'live' in description.lower()

    if min_pa_vl is None:
        min_pa_vl = 20 if is_live else 1
    if min_pa_vr is None:
        min_pa_vr = 40 if is_live else 1

    if start_date is None:
        start_date = f"{year}0301"
    if end_date is None:
        end_date = f"{year}1002"

    # Fail early with a readable message instead of an uncaught ValueError
    # from the int() conversions further down.
    start_date = _require_yyyymmdd(start_date, "--start")
    end_date = _require_yyyymmdd(end_date, "--end")

    if events_file is None:
        events_file = f"retrosheets_events_{year}.csv"

    if data_input is None:
        data_input = f"data-input/{year} Live Cardset/"

    # Auto-enable recency bias for Live series after May 30.
    # None means the flag was not given; an explicit value (including 0.0)
    # always wins over the automatic default.
    if last_twoweeks_ratio is None:
        end_mmdd = int(end_date[4:])  # Extract MMDD from YYYYMMDD
        if is_live and end_mmdd > 530:
            last_twoweeks_ratio = 0.2
            console.print("[cyan]Auto-enabled recency bias (end date > May 30)[/cyan]")
        else:
            last_twoweeks_ratio = 0.0

    console.print(f"Cardset ID: {cardset_id}")
    console.print(f"Description: {description}")
    console.print(f"Date Range: {start_date} - {end_date}")
    console.print(f"Season %: {season_pct:.0%}")
    console.print(f"Min PA: vL={min_pa_vl}, vR={min_pa_vr}")
    console.print(f"Events: {events_file}")
    console.print(f"Input: {data_input}")
    if last_week_ratio > 0 or last_twoweeks_ratio > 0 or last_month_ratio > 0:
        console.print(f"Recency Bias: week={last_week_ratio}, 2weeks={last_twoweeks_ratio}, month={last_month_ratio}")
    console.print()

    if dry_run:
        console.print("[yellow]DRY RUN - no changes will be made[/yellow]")
        console.print()
        console.print("[green]Validation passed - ready to run[/green]")
        console.print()
        console.print("To run for real, remove --dry-run flag")
        raise typer.Exit(0)

    # Import and configure the retrosheet module
    try:
        import sys
        # Make the directory three levels above this file importable so that
        # the top-level retrosheet_data module can be found.
        sys.path.insert(0, str(Path(__file__).parent.parent.parent))

        import retrosheet_data as rd

        # Configure the module's globals
        rd.CARDSET_ID = cardset_id
        rd.PLAYER_DESCRIPTION = description
        rd.START_DATE = int(start_date)
        rd.END_DATE = int(end_date)
        rd.SEASON_PCT = season_pct
        rd.MIN_PA_VL = min_pa_vl
        rd.MIN_PA_VR = min_pa_vr
        # The batters-faced minimums reuse the plate-appearance minimums.
        rd.MIN_TBF_VL = min_pa_vl
        rd.MIN_TBF_VR = min_pa_vr
        rd.POST_DATA = post_data
        rd.EVENTS_FILENAME = events_file
        rd.DATA_INPUT_FILE_PATH = data_input
        rd.LAST_WEEK_RATIO = last_week_ratio
        rd.LAST_TWOWEEKS_RATIO = last_twoweeks_ratio
        rd.LAST_MONTH_RATIO = last_month_ratio

        console.print("[bold]Starting Retrosheet processing...[/bold]")
        console.print()

        # Run the main function (args is legacy, pass empty list)
        asyncio.run(rd.main([]))

        console.print()
        console.print("=" * 70)
        console.print("[bold green]✓ RETROSHEET PROCESSING COMPLETE[/bold green]")
        console.print("=" * 70)

    except ImportError as e:
        console.print(f"[red]Error importing modules: {e}[/red]")
        console.print("Make sure you're running from the card-creation directory")
        raise typer.Exit(1)
    except Exception as e:
        # Top-level CLI boundary: report, show the traceback, exit non-zero.
        console.print(f"[red]Error: {e}[/red]")
        import traceback
        traceback.print_exc()
        raise typer.Exit(1)
|
|
|
|
|
|
@app.command()
def arms(
    year: int = typer.Argument(..., help="Season year"),
    events_file: Path = typer.Option(..., "--events", "-e", help="Retrosheet events CSV file"),
    output: Optional[Path] = typer.Option(None, "--output", "-o", help="Output CSV file"),
    season_pct: float = typer.Option(1.0, "--season-pct", help="Season percentage for min sample"),
):
    """
    Generate outfield arm ratings from Retrosheet data.

    Analyzes play-by-play events to calculate OF arm strength ratings.

    Example:
        pd-cards retrosheet arms 2005 --events data-input/retrosheet/retrosheets_events_2005.csv
    """
    # NOTE: the docstring above is rendered by Typer as the command's --help
    # text, so implementation notes live in comments instead.
    #
    # Flow: load the events CSV -> compute ratings -> flatten them into one
    # row per (player, position) -> write a CSV -> print a per-position summary.
    console.print()
    console.print("=" * 70)
    console.print(f"[bold]OUTFIELD ARM RATINGS - {year}[/bold]")
    console.print("=" * 70)

    if not events_file.exists():
        console.print(f"[red]Events file not found: {events_file}[/red]")
        raise typer.Exit(1)

    if output is None:
        output = Path(f"data-output/retrosheet_arm_ratings_{year}.csv")

    console.print(f"Events file: {events_file}")
    console.print(f"Output: {output}")
    console.print(f"Season %: {season_pct:.0%}")
    console.print()

    try:
        import sys
        # Make the directory three levels above this file importable so that
        # the top-level "defenders" package can be found.
        sys.path.insert(0, str(Path(__file__).parent.parent.parent))

        import pandas as pd
        from defenders.retrosheet_arm_calculator import calculate_of_arms_from_retrosheet

        console.print("Loading Retrosheet events...")
        df_events = pd.read_csv(events_file)
        console.print(f"  Loaded {len(df_events)} events")

        console.print("Calculating arm ratings...")
        arm_ratings = calculate_of_arms_from_retrosheet(df_events, season_pct=season_pct)

        # Convert to DataFrame for CSV output.
        # Keys are split on their LAST underscore into player id and position.
        stat_fields = ['arm_rating', 'balls_fielded', 'total_assists', 'assist_rate', 'z_score']
        rows = []
        for key, rating in arm_ratings.items():
            player_id, position = key.rsplit('_', 1)
            row = {'player_id': player_id, 'position': position.upper(), 'season': year}
            row.update({field: rating.get(field, 0) for field in stat_fields})
            rows.append(row)

        # Name the columns explicitly: with zero rows pandas would otherwise
        # build a column-less frame and the sort below would raise KeyError.
        columns = ['player_id', 'position', 'season', *stat_fields]
        df_output = pd.DataFrame(rows, columns=columns)
        df_output = df_output.sort_values(['position', 'arm_rating'])

        if df_output.empty:
            console.print("[yellow]No arm ratings were produced; writing a header-only CSV[/yellow]")

        # Ensure output directory exists
        output.parent.mkdir(parents=True, exist_ok=True)

        df_output.to_csv(output, index=False)
        console.print()
        console.print(f"[green]✓ Saved {len(df_output)} arm ratings to {output}[/green]")

        # Show distribution summary
        from rich.table import Table
        table = Table(title="Arm Rating Distribution")
        table.add_column("Position")
        table.add_column("Count", justify="right")
        table.add_column("Avg Rating", justify="right")
        table.add_column("Min", justify="right")
        table.add_column("Max", justify="right")

        for pos in ['LF', 'CF', 'RF']:
            pos_df = df_output[df_output['position'] == pos]
            if len(pos_df) > 0:
                pos_ratings = pos_df['arm_rating']
                table.add_row(
                    pos,
                    str(len(pos_df)),
                    f"{pos_ratings.mean():.1f}",
                    str(int(pos_ratings.min())),
                    str(int(pos_ratings.max()))
                )

        console.print()
        console.print(table)

    except ImportError as e:
        console.print(f"[red]Error importing modules: {e}[/red]")
        raise typer.Exit(1)
    except Exception as e:
        # Top-level CLI boundary: report, show the traceback, exit non-zero.
        console.print(f"[red]Error: {e}[/red]")
        import traceback
        traceback.print_exc()
        raise typer.Exit(1)
|
|
|
|
|
|
@app.command()
def validate(
    cardset_id: int = typer.Argument(..., help="Cardset ID to validate"),
    api_url: str = typer.Option("https://pd.manticorum.com/api", "--api", help="API URL"),
):
    """
    Validate positions for a cardset.

    Checks for anomalous DH counts and missing outfield positions.

    Example:
        pd-cards retrosheet validate 27
    """
    # NOTE: the docstring above is rendered by Typer as the command's --help
    # text, so implementation notes live in comments instead.
    #
    # NOTE(review): api_url is accepted but never used below; db_get is
    # called without it. Presumably db_calls carries its own base URL —
    # confirm, then either wire this option through or drop it.
    console.print()
    console.print("=" * 70)
    console.print(f"[bold]POSITION VALIDATION - Cardset {cardset_id}[/bold]")
    console.print("=" * 70)

    # More primary-DH players than this is reported as an anomaly.
    dh_limit = 5

    try:
        import sys
        from collections import Counter
        # Make the directory three levels above this file importable so that
        # the top-level db_calls module can be found.
        sys.path.insert(0, str(Path(__file__).parent.parent.parent))
        from db_calls import db_get

        async def run_validation():
            # Get all players in cardset
            console.print(f"Fetching players from cardset {cardset_id}...")
            players = await db_get('players', params=[('cardset_id', cardset_id)])

            if players is None or players['count'] == 0:
                console.print(f"[yellow]No players found in cardset {cardset_id}[/yellow]")
                return

            console.print(f"Found {players['count']} players")
            console.print()

            # Count positions (primary position only; blank pos_1 is skipped)
            pos_counts = Counter()
            dh_players = []

            for player in players['players']:
                pos_1 = player.get('pos_1', '')
                if not pos_1:
                    continue
                pos_counts[pos_1] += 1
                if pos_1 == 'DH':
                    # .get() so a record without an id cannot abort the whole
                    # validation while merely building a display label.
                    dh_players.append(
                        f"{player.get('p_name', 'Unknown')} (ID: {player.get('player_id', 'Unknown')})"
                    )

            # Display position counts
            from rich.table import Table
            table = Table(title="Position Distribution")
            table.add_column("Position")
            table.add_column("Count", justify="right")

            for pos in sorted(pos_counts):
                count = pos_counts[pos]
                style = "red" if pos == 'DH' and count > dh_limit else None
                table.add_row(pos, str(count), style=style)

            console.print(table)
            console.print()

            # Check for anomalies
            issues = []

            dh_count = pos_counts.get('DH', 0)
            if dh_count > dh_limit:
                # The message states the same bound the check enforces.
                issues.append(
                    f"[red]⚠ Anomalous DH count: {dh_count} (should be <={dh_limit} for full-season cards)[/red]"
                )
                console.print("DH Players:")
                for p in dh_players[:10]:
                    console.print(f"  - {p}")
                if len(dh_players) > 10:
                    console.print(f"  ... and {len(dh_players) - 10} more")
                console.print()

            for pos in ['LF', 'CF', 'RF']:
                if pos_counts.get(pos, 0) == 0:
                    issues.append(f"[red]⚠ Missing {pos} positions (indicates defensive calculation failures)[/red]")

            if issues:
                console.print("[bold]Issues Found:[/bold]")
                for issue in issues:
                    console.print(f"  {issue}")
            else:
                console.print("[green]✓ No position anomalies detected[/green]")

        asyncio.run(run_validation())

    except ImportError as e:
        console.print(f"[red]Error importing modules: {e}[/red]")
        raise typer.Exit(1)
    except Exception as e:
        # Top-level CLI boundary: report, show the traceback (as the other
        # commands in this module do), exit non-zero.
        console.print(f"[red]Error: {e}[/red]")
        import traceback
        traceback.print_exc()
        raise typer.Exit(1)
|
|
|
|
|
|
@app.command()
def defense(
    year: int = typer.Argument(..., help="Season year to fetch defense stats for"),
    output_dir: Optional[Path] = typer.Option(None, "--output", "-o", help="Output directory for CSV files"),
):
    """
    Fetch and store defensive statistics from Baseball Reference.

    Downloads fielding stats for all positions and saves to CSV files.

    Example:
        pd-cards retrosheet defense 2005 --output "data-input/2005 Live Cardset/"
    """
    # NOTE: the docstring above is rendered by Typer as the command's --help
    # text, so implementation notes live in comments instead.
    #
    # Writes one defense_<position>.csv per position into output_dir,
    # pausing between requests.
    console.print()
    console.print("=" * 70)
    console.print(f"[bold]FETCH DEFENSIVE STATS - {year}[/bold]")
    console.print("=" * 70)

    if output_dir is None:
        output_dir = Path(f"data-input/{year} Live Cardset/")

    console.print(f"Output directory: {output_dir}")
    console.print()

    try:
        import sys
        # Make the directory three levels above this file importable so that
        # the top-level "defenders" package can be found.
        sys.path.insert(0, str(Path(__file__).parent.parent.parent))

        import defenders.calcs_defense as cde

        positions = ['c', '1b', '2b', '3b', 'ss', 'lf', 'cf', 'rf', 'of', 'p']
        request_gap_seconds = 8  # Rate limiting

        output_dir.mkdir(parents=True, exist_ok=True)

        async def fetch_defense():
            for index, position in enumerate(positions):
                # Pause only BETWEEN requests; sleeping after the final
                # position would just delay completion.
                if index > 0:
                    await asyncio.sleep(request_gap_seconds)

                console.print(f"Fetching {position.upper()} defensive stats...")
                pos_df = cde.get_bbref_fielding_df(position, year)
                output_file = output_dir / f"defense_{position}.csv"
                pos_df.to_csv(output_file)
                console.print(f"  [green]✓ Saved {len(pos_df)} records to {output_file}[/green]")

            console.print()
            console.print(f"[green]✓ All defensive stats saved to {output_dir}[/green]")

        asyncio.run(fetch_defense())

    except ImportError as e:
        console.print(f"[red]Error importing modules: {e}[/red]")
        raise typer.Exit(1)
    except Exception as e:
        # Top-level CLI boundary: report, show the traceback, exit non-zero.
        console.print(f"[red]Error: {e}[/red]")
        import traceback
        traceback.print_exc()
        raise typer.Exit(1)
|