paper-dynasty-card-creation/pd_cards/commands/retrosheet.py
Cal Corum 0a17745389 Run black and ruff across entire codebase
Standardize formatting with black and apply ruff auto-fixes.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-08 14:24:33 -05:00

470 lines
15 KiB
Python

"""
Retrosheet historical data processing commands.
Commands for generating cards from historical Retrosheet play-by-play data.
"""
import asyncio
from pathlib import Path
from typing import Optional
import typer
from rich.console import Console
# Typer application object that the @app.command() functions in this module
# register on; no_args_is_help=True shows the help text when it is invoked
# with no arguments.
app = typer.Typer(no_args_is_help=True)
# Shared Rich console used for all user-facing output in this module.
console = Console()
def _validate_yyyymmdd(value: str, option_name: str) -> str:
    """Return *value* unchanged if it is a real calendar date in YYYYMMDD form.

    Prints an error and exits with status 1 otherwise. *option_name* is only
    used to tell the user which option carried the bad value.
    """
    from datetime import datetime

    try:
        # strptime alone accepts short forms such as "2005031", so the exact
        # 8-digit shape is enforced first.
        if len(value) != 8 or not value.isdigit():
            raise ValueError(value)
        datetime.strptime(value, "%Y%m%d")
    except ValueError:
        console.print(
            f"[red]Invalid {option_name} date '{value}': expected YYYYMMDD[/red]"
        )
        raise typer.Exit(1)
    return value


@app.command()
def process(
    year: int = typer.Argument(..., help="Season year to process (e.g., 2005)"),
    cardset_id: int = typer.Option(..., "--cardset-id", "-c", help="Target cardset ID"),
    description: str = typer.Option(
        "Live",
        "--description",
        "-d",
        help="Player description ('Live' or 'Month PotM')",
    ),
    start_date: Optional[str] = typer.Option(
        None, "--start", help="Start date YYYYMMDD (defaults to March 1)"
    ),
    end_date: Optional[str] = typer.Option(
        None, "--end", help="End date YYYYMMDD (defaults to Oct 2)"
    ),
    events_file: Optional[str] = typer.Option(
        None, "--events", "-e", help="Retrosheet events CSV filename"
    ),
    data_input: Optional[str] = typer.Option(
        None, "--input", "-i", help="Data input directory path"
    ),
    season_pct: float = typer.Option(
        1.0, "--season-pct", help="Season percentage (0.0-1.0)"
    ),
    min_pa_vl: Optional[int] = typer.Option(
        None, "--min-pa-vl", help="Minimum PA vs LHP (default: 20 Live, 1 PotM)"
    ),
    min_pa_vr: Optional[int] = typer.Option(
        None, "--min-pa-vr", help="Minimum PA vs RHP (default: 40 Live, 1 PotM)"
    ),
    post_data: bool = typer.Option(
        True, "--post/--no-post", help="Post data to database"
    ),
    dry_run: bool = typer.Option(
        False, "--dry-run", "-n", help="Preview without saving to database"
    ),
    last_week_ratio: float = typer.Option(
        0.0,
        "--last-week-ratio",
        help="Recency bias: weight for last week's stats (0.0-1.0)",
    ),
    last_twoweeks_ratio: Optional[float] = typer.Option(
        None,
        "--last-twoweeks-ratio",
        help="Recency bias: weight for last 2 weeks' stats (0.0-1.0, default: 0.2 for Live after May 30)",
    ),
    last_month_ratio: float = typer.Option(
        0.0,
        "--last-month-ratio",
        help="Recency bias: weight for last month's stats (0.0-1.0)",
    ),
) -> None:
    """
    Process Retrosheet data and create player cards.

    Generates batting and pitching cards from historical play-by-play data.

    Example:
        pd-cards retrosheet process 2005 --cardset-id 27 --description Live
    """
    # The docstring above is user-facing (Typer renders it as the command's
    # help text), so implementation notes live in comments instead.
    #
    # Flow: resolve defaults -> validate dates -> echo the effective
    # configuration -> (dry run stops here) -> copy the configuration onto the
    # retrosheet_data module's globals and run its async main().
    # Exits with status 1 on invalid dates, import failure, or any error
    # raised by the run; exits with status 0 after a dry run.
    console.print()
    console.print("=" * 70)
    console.print(f"[bold]RETROSHEET PROCESSING - {year}[/bold]")
    console.print("=" * 70)

    # Calculate defaults based on description
    is_live = "live" in description.lower()
    if min_pa_vl is None:
        min_pa_vl = 20 if is_live else 1
    if min_pa_vr is None:
        min_pa_vr = 40 if is_live else 1
    if start_date is None:
        start_date = f"{year}0301"
    if end_date is None:
        end_date = f"{year}1002"
    if events_file is None:
        events_file = f"retrosheets_events_{year}.csv"
    if data_input is None:
        data_input = f"data-input/{year} Live Cardset/"

    # Both dates are later sliced and converted with int(); reject malformed
    # values here so the user gets a clear message instead of a traceback, and
    # so the dry run's "Validation passed" message is actually earned.
    start_date = _validate_yyyymmdd(start_date, "--start")
    end_date = _validate_yyyymmdd(end_date, "--end")
    if int(start_date) > int(end_date):
        console.print(
            f"[red]Start date {start_date} is after end date {end_date}[/red]"
        )
        raise typer.Exit(1)

    # Auto-enable recency bias for Live series after May 30
    if last_twoweeks_ratio is None:
        end_mmdd = int(end_date[4:])  # Extract MMDD from YYYYMMDD
        if is_live and end_mmdd > 530:
            last_twoweeks_ratio = 0.2
            console.print("[cyan]Auto-enabled recency bias (end date > May 30)[/cyan]")
        else:
            last_twoweeks_ratio = 0.0

    console.print(f"Cardset ID: {cardset_id}")
    console.print(f"Description: {description}")
    console.print(f"Date Range: {start_date} - {end_date}")
    console.print(f"Season %: {season_pct:.0%}")
    console.print(f"Min PA: vL={min_pa_vl}, vR={min_pa_vr}")
    console.print(f"Events: {events_file}")
    console.print(f"Input: {data_input}")
    if last_week_ratio > 0 or last_twoweeks_ratio > 0 or last_month_ratio > 0:
        console.print(
            f"Recency Bias: week={last_week_ratio}, 2weeks={last_twoweeks_ratio}, month={last_month_ratio}"
        )
    console.print()

    if dry_run:
        console.print("[yellow]DRY RUN - no changes will be made[/yellow]")
        console.print()
        console.print("[green]Validation passed - ready to run[/green]")
        console.print()
        console.print("To run for real, remove --dry-run flag")
        raise typer.Exit(0)

    # Import the retrosheet module. Only the import itself is guarded by the
    # ImportError handler, so an ImportError raised later inside the run is
    # reported with a traceback rather than as a "wrong directory" problem.
    import sys

    # Three levels up from this file; the directory is put first on sys.path
    # so that retrosheet_data can be imported from it.
    sys.path.insert(0, str(Path(__file__).parent.parent.parent))
    try:
        import retrosheet_data as rd
    except ImportError as e:
        console.print(f"[red]Error importing modules: {e}[/red]")
        console.print("Make sure you're running from the card-creation directory")
        raise typer.Exit(1)

    try:
        # Configure the module's globals
        rd.CARDSET_ID = cardset_id
        rd.PLAYER_DESCRIPTION = description
        rd.START_DATE = int(start_date)
        rd.END_DATE = int(end_date)
        rd.SEASON_PCT = season_pct
        rd.MIN_PA_VL = min_pa_vl
        rd.MIN_PA_VR = min_pa_vr
        # The same minimums are applied to the TBF (pitcher-side) thresholds.
        rd.MIN_TBF_VL = min_pa_vl
        rd.MIN_TBF_VR = min_pa_vr
        rd.POST_DATA = post_data
        rd.EVENTS_FILENAME = events_file
        rd.DATA_INPUT_FILE_PATH = data_input
        rd.LAST_WEEK_RATIO = last_week_ratio
        rd.LAST_TWOWEEKS_RATIO = last_twoweeks_ratio
        rd.LAST_MONTH_RATIO = last_month_ratio

        console.print("[bold]Starting Retrosheet processing...[/bold]")
        console.print()

        # Run the main function (args is legacy, pass empty list)
        asyncio.run(rd.main([]))
    except Exception as e:
        console.print(f"[red]Error: {e}[/red]")
        import traceback

        traceback.print_exc()
        raise typer.Exit(1)

    console.print()
    console.print("=" * 70)
    console.print("[bold green]✓ RETROSHEET PROCESSING COMPLETE[/bold green]")
    console.print("=" * 70)
@app.command()
def arms(
    year: int = typer.Argument(..., help="Season year"),
    events_file: Path = typer.Option(
        ..., "--events", "-e", help="Retrosheet events CSV file"
    ),
    output: Optional[Path] = typer.Option(
        None, "--output", "-o", help="Output CSV file"
    ),
    season_pct: float = typer.Option(
        1.0, "--season-pct", help="Season percentage for min sample"
    ),
) -> None:
    """
    Generate outfield arm ratings from Retrosheet data.

    Analyzes play-by-play events to calculate OF arm strength ratings.

    Example:
        pd-cards retrosheet arms 2005 --events data-input/retrosheet/retrosheets_events_2005.csv
    """
    # The docstring above is user-facing (Typer renders it as the command's
    # help text), so implementation notes live in comments instead.
    #
    # Flow: load the events CSV -> run the arm calculator -> flatten its
    # result into one row per entry -> write the CSV -> print an LF/CF/RF
    # summary table. Exits with status 1 if the events file is missing, an
    # import fails, or any step raises.
    console.print()
    console.print("=" * 70)
    console.print(f"[bold]OUTFIELD ARM RATINGS - {year}[/bold]")
    console.print("=" * 70)

    if not events_file.exists():
        console.print(f"[red]Events file not found: {events_file}[/red]")
        raise typer.Exit(1)

    if output is None:
        output = Path(f"data-output/retrosheet_arm_ratings_{year}.csv")

    console.print(f"Events file: {events_file}")
    console.print(f"Output: {output}")
    console.print(f"Season %: {season_pct:.0%}")
    console.print()

    try:
        import sys

        # Three levels up from this file; the directory is put first on
        # sys.path so that the defenders package can be imported from it.
        sys.path.insert(0, str(Path(__file__).parent.parent.parent))
        import pandas as pd
        from defenders.retrosheet_arm_calculator import (
            calculate_of_arms_from_retrosheet,
        )

        console.print("Loading Retrosheet events...")
        df_events = pd.read_csv(events_file)
        console.print(f" Loaded {len(df_events)} events")

        console.print("Calculating arm ratings...")
        arm_ratings = calculate_of_arms_from_retrosheet(
            df_events, season_pct=season_pct
        )

        # Convert to DataFrame for CSV output. The column list is passed
        # explicitly so that an empty result still yields a frame with the
        # expected columns; without it, sorting an empty frame raises KeyError.
        output_columns = [
            "player_id",
            "position",
            "season",
            "arm_rating",
            "balls_fielded",
            "total_assists",
            "assist_rate",
            "z_score",
        ]
        rows = []
        for key, rating in arm_ratings.items():
            # Keys are split on the LAST underscore into (player_id, position);
            # presumably they look like "<player_id>_<pos>" - TODO confirm
            # against the calculator.
            player_id, position = key.rsplit("_", 1)
            rows.append(
                {
                    "player_id": player_id,
                    "position": position.upper(),
                    "season": year,
                    "arm_rating": rating.get("arm_rating", 0),
                    "balls_fielded": rating.get("balls_fielded", 0),
                    "total_assists": rating.get("total_assists", 0),
                    "assist_rate": rating.get("assist_rate", 0),
                    "z_score": rating.get("z_score", 0),
                }
            )

        df_output = pd.DataFrame(rows, columns=output_columns)
        df_output = df_output.sort_values(["position", "arm_rating"])
        if df_output.empty:
            console.print(
                "[yellow]No arm ratings were calculated; writing a header-only CSV[/yellow]"
            )

        # Ensure output directory exists
        output.parent.mkdir(parents=True, exist_ok=True)
        df_output.to_csv(output, index=False)

        console.print()
        console.print(
            f"[green]✓ Saved {len(df_output)} arm ratings to {output}[/green]"
        )

        # Show distribution summary
        from rich.table import Table

        table = Table(title="Arm Rating Distribution")
        table.add_column("Position")
        table.add_column("Count", justify="right")
        table.add_column("Avg Rating", justify="right")
        table.add_column("Min", justify="right")
        table.add_column("Max", justify="right")

        for pos in ["LF", "CF", "RF"]:
            pos_df = df_output[df_output["position"] == pos]
            # Positions with no rows are left out of the table.
            if len(pos_df) > 0:
                table.add_row(
                    pos,
                    str(len(pos_df)),
                    f"{pos_df['arm_rating'].mean():.1f}",
                    str(int(pos_df["arm_rating"].min())),
                    str(int(pos_df["arm_rating"].max())),
                )

        console.print()
        console.print(table)
    except ImportError as e:
        console.print(f"[red]Error importing modules: {e}[/red]")
        raise typer.Exit(1)
    except Exception as e:
        console.print(f"[red]Error: {e}[/red]")
        import traceback

        traceback.print_exc()
        raise typer.Exit(1)
@app.command()
def validate(
    cardset_id: int = typer.Argument(..., help="Cardset ID to validate"),
    api_url: str = typer.Option(
        "https://pd.manticorum.com/api", "--api", help="API URL"
    ),
) -> None:
    """
    Validate positions for a cardset.

    Checks for anomalous DH counts and missing outfield positions.

    Example:
        pd-cards retrosheet validate 27
    """
    # The docstring above is user-facing (Typer renders it as the command's
    # help text), so implementation notes live in comments instead.
    #
    # Flow: fetch the cardset's players -> count each player's primary
    # position (pos_1) -> print the distribution -> report issues.
    # Exits with status 1 only on import failure or an unexpected error;
    # reported issues do not change the exit status.
    #
    # NOTE(review): api_url is accepted but never read in this function;
    # confirm whether db_get should be pointed at it.
    console.print()
    console.print("=" * 70)
    console.print(f"[bold]POSITION VALIDATION - Cardset {cardset_id}[/bold]")
    console.print("=" * 70)

    try:
        import sys
        from collections import Counter

        # Three levels up from this file; the directory is put first on
        # sys.path so that db_calls can be imported from it.
        sys.path.insert(0, str(Path(__file__).parent.parent.parent))
        from db_calls import db_get

        async def run_validation():
            # Get all players in cardset
            console.print(f"Fetching players from cardset {cardset_id}...")
            players = await db_get("players", params=[("cardset_id", cardset_id)])

            if players is None or players["count"] == 0:
                console.print(
                    f"[yellow]No players found in cardset {cardset_id}[/yellow]"
                )
                return

            console.print(f"Found {players['count']} players")
            console.print()

            # Count positions
            pos_counts = Counter()
            dh_players = []
            for player in players["players"]:
                pos_1 = player.get("pos_1", "")
                if not pos_1:
                    # Players without a primary position are not counted.
                    continue
                pos_counts[pos_1] += 1
                if pos_1 == "DH":
                    # .get() for both fields so one incomplete record cannot
                    # abort the whole validation with a KeyError.
                    dh_players.append(
                        f"{player.get('p_name', 'Unknown')} (ID: {player.get('player_id', 'Unknown')})"
                    )

            # Display position counts
            from rich.table import Table

            table = Table(title="Position Distribution")
            table.add_column("Position")
            table.add_column("Count", justify="right")
            for pos in sorted(pos_counts):
                count = pos_counts[pos]
                style = "red" if pos == "DH" and count > 5 else None
                table.add_row(pos, str(count), style=style)
            console.print(table)
            console.print()

            # Check for anomalies
            issues = []
            dh_count = pos_counts.get("DH", 0)
            # NOTE(review): the check fires above 5 while the message says
            # "<5"; confirm which bound is intended.
            if dh_count > 5:
                issues.append(
                    f"[red]⚠ Anomalous DH count: {dh_count} (should be <5 for full-season cards)[/red]"
                )
                console.print("DH Players:")
                # Only the first ten are listed; the rest are summarized.
                for p in dh_players[:10]:
                    console.print(f" - {p}")
                if len(dh_players) > 10:
                    console.print(f" ... and {len(dh_players) - 10} more")
                console.print()

            for pos in ["LF", "CF", "RF"]:
                if pos_counts.get(pos, 0) == 0:
                    issues.append(
                        f"[red]⚠ Missing {pos} positions (indicates defensive calculation failures)[/red]"
                    )

            if issues:
                console.print("[bold]Issues Found:[/bold]")
                for issue in issues:
                    console.print(f" {issue}")
            else:
                console.print("[green]✓ No position anomalies detected[/green]")

        asyncio.run(run_validation())
    except ImportError as e:
        console.print(f"[red]Error importing modules: {e}[/red]")
        raise typer.Exit(1)
    except Exception as e:
        console.print(f"[red]Error: {e}[/red]")
        # Print the traceback as the other commands in this module do.
        import traceback

        traceback.print_exc()
        raise typer.Exit(1)
@app.command()
def defense(
    year: int = typer.Argument(..., help="Season year to fetch defense stats for"),
    output_dir: Optional[Path] = typer.Option(
        None, "--output", "-o", help="Output directory for CSV files"
    ),
) -> None:
    """
    Fetch and store defensive statistics from Baseball Reference.

    Downloads fielding stats for all positions and saves to CSV files.

    Example:
        pd-cards retrosheet defense 2005 --output "data-input/2005 Live Cardset/"
    """
    # The docstring above is user-facing (Typer renders it as the command's
    # help text), so implementation notes live in comments instead.
    #
    # Writes one defense_<position>.csv per position into output_dir, which
    # defaults to "data-input/<year> Live Cardset/". Exits with status 1 on
    # import failure or any error raised while fetching or writing.
    console.print()
    console.print("=" * 70)
    console.print(f"[bold]FETCH DEFENSIVE STATS - {year}[/bold]")
    console.print("=" * 70)

    if output_dir is None:
        output_dir = Path(f"data-input/{year} Live Cardset/")

    console.print(f"Output directory: {output_dir}")
    console.print()

    try:
        import sys

        # Three levels up from this file; the directory is put first on
        # sys.path so that the defenders package can be imported from it.
        sys.path.insert(0, str(Path(__file__).parent.parent.parent))
        import defenders.calcs_defense as cde

        positions = ["c", "1b", "2b", "3b", "ss", "lf", "cf", "rf", "of", "p"]
        rate_limit_seconds = 8
        output_dir.mkdir(parents=True, exist_ok=True)

        async def fetch_defense():
            for index, position in enumerate(positions):
                # Rate limiting: pause between requests only, so no time is
                # wasted sleeping after the final position has been saved.
                if index:
                    await asyncio.sleep(rate_limit_seconds)
                console.print(f"Fetching {position.upper()} defensive stats...")
                pos_df = cde.get_bbref_fielding_df(position, year)
                output_file = output_dir / f"defense_{position}.csv"
                pos_df.to_csv(output_file)
                console.print(
                    f" [green]✓ Saved {len(pos_df)} records to {output_file}[/green]"
                )
            console.print()
            console.print(f"[green]✓ All defensive stats saved to {output_dir}[/green]")

        asyncio.run(fetch_defense())
    except ImportError as e:
        console.print(f"[red]Error importing modules: {e}[/red]")
        raise typer.Exit(1)
    except Exception as e:
        console.print(f"[red]Error: {e}[/red]")
        import traceback

        traceback.print_exc()
        raise typer.Exit(1)