Standardize formatting with black and apply ruff auto-fixes. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
470 lines
15 KiB
Python
470 lines
15 KiB
Python
"""
|
|
Retrosheet historical data processing commands.
|
|
|
|
Commands for generating cards from historical Retrosheet play-by-play data.
|
|
"""
|
|
|
|
import asyncio
|
|
from pathlib import Path
|
|
from typing import Optional
|
|
|
|
import typer
|
|
from rich.console import Console
|
|
|
|
# Typer application object; each command below registers on it via @app.command().
app = typer.Typer(no_args_is_help=True)
|
|
# Shared Rich console that every command in this module prints through.
console = Console()
|
|
|
|
|
|
def _is_yyyymmdd(value: str) -> bool:
    """Return True when *value* consists of exactly eight ASCII digits."""
    return len(value) == 8 and all(ch in "0123456789" for ch in value)


@app.command()
def process(
    year: int = typer.Argument(..., help="Season year to process (e.g., 2005)"),
    cardset_id: int = typer.Option(..., "--cardset-id", "-c", help="Target cardset ID"),
    description: str = typer.Option(
        "Live",
        "--description",
        "-d",
        help="Player description ('Live' or 'Month PotM')",
    ),
    start_date: Optional[str] = typer.Option(
        None, "--start", help="Start date YYYYMMDD (defaults to March 1)"
    ),
    end_date: Optional[str] = typer.Option(
        None, "--end", help="End date YYYYMMDD (defaults to Oct 2)"
    ),
    events_file: Optional[str] = typer.Option(
        None, "--events", "-e", help="Retrosheet events CSV filename"
    ),
    data_input: Optional[str] = typer.Option(
        None, "--input", "-i", help="Data input directory path"
    ),
    season_pct: float = typer.Option(
        1.0, "--season-pct", help="Season percentage (0.0-1.0)"
    ),
    min_pa_vl: Optional[int] = typer.Option(
        None, "--min-pa-vl", help="Minimum PA vs LHP (default: 20 Live, 1 PotM)"
    ),
    min_pa_vr: Optional[int] = typer.Option(
        None, "--min-pa-vr", help="Minimum PA vs RHP (default: 40 Live, 1 PotM)"
    ),
    post_data: bool = typer.Option(
        True, "--post/--no-post", help="Post data to database"
    ),
    dry_run: bool = typer.Option(
        False, "--dry-run", "-n", help="Preview without saving to database"
    ),
    last_week_ratio: float = typer.Option(
        0.0,
        "--last-week-ratio",
        help="Recency bias: weight for last week's stats (0.0-1.0)",
    ),
    last_twoweeks_ratio: Optional[float] = typer.Option(
        None,
        "--last-twoweeks-ratio",
        help="Recency bias: weight for last 2 weeks' stats (0.0-1.0, default: 0.2 for Live after May 30)",
    ),
    last_month_ratio: float = typer.Option(
        0.0,
        "--last-month-ratio",
        help="Recency bias: weight for last month's stats (0.0-1.0)",
    ),
):
    """
    Process Retrosheet data and create player cards.

    Generates batting and pitching cards from historical play-by-play data.

    Example:
        pd-cards retrosheet process 2005 --cardset-id 27 --description Live
    """
    # NOTE: the docstring above is rendered by Typer as the command's --help
    # text, so implementation notes live in comments instead.
    #
    # Flow: resolve option defaults -> validate the date range -> print a
    # summary -> stop here on --dry-run -> otherwise copy the settings onto the
    # retrosheet_data module's globals and run its async main().
    # Exits with status 1 on an invalid date, an import failure or any error
    # raised while processing; exits with status 0 after a dry run.
    console.print()
    console.print("=" * 70)
    console.print(f"[bold]RETROSHEET PROCESSING - {year}[/bold]")
    console.print("=" * 70)

    # Calculate defaults based on description
    is_live = "live" in description.lower()

    if min_pa_vl is None:
        min_pa_vl = 20 if is_live else 1
    if min_pa_vr is None:
        min_pa_vr = 40 if is_live else 1

    if start_date is None:
        start_date = f"{year}0301"
    if end_date is None:
        end_date = f"{year}1002"

    # Both dates are sliced and converted with int() further down. Reject
    # malformed values here so the user gets a clear message instead of a raw
    # ValueError, and so --dry-run cannot report success for a range the real
    # run would choke on.
    for option_name, date_value in (("--start", start_date), ("--end", end_date)):
        if not _is_yyyymmdd(date_value):
            console.print(
                f"[red]Invalid {option_name} date '{date_value}': expected YYYYMMDD[/red]"
            )
            raise typer.Exit(1)

    if events_file is None:
        events_file = f"retrosheets_events_{year}.csv"

    if data_input is None:
        data_input = f"data-input/{year} Live Cardset/"

    # Auto-enable recency bias for Live series after May 30
    if last_twoweeks_ratio is None:
        end_mmdd = int(end_date[4:])  # Extract MMDD from YYYYMMDD
        if is_live and end_mmdd > 530:
            last_twoweeks_ratio = 0.2
            console.print("[cyan]Auto-enabled recency bias (end date > May 30)[/cyan]")
        else:
            last_twoweeks_ratio = 0.0

    console.print(f"Cardset ID: {cardset_id}")
    console.print(f"Description: {description}")
    console.print(f"Date Range: {start_date} - {end_date}")
    console.print(f"Season %: {season_pct:.0%}")
    console.print(f"Min PA: vL={min_pa_vl}, vR={min_pa_vr}")
    console.print(f"Events: {events_file}")
    console.print(f"Input: {data_input}")
    if last_week_ratio > 0 or last_twoweeks_ratio > 0 or last_month_ratio > 0:
        console.print(
            f"Recency Bias: week={last_week_ratio}, 2weeks={last_twoweeks_ratio}, month={last_month_ratio}"
        )
    console.print()

    if dry_run:
        console.print("[yellow]DRY RUN - no changes will be made[/yellow]")
        console.print()
        console.print("[green]Validation passed - ready to run[/green]")
        console.print()
        console.print("To run for real, remove --dry-run flag")
        raise typer.Exit(0)

    # Import and configure the retrosheet module
    try:
        import sys

        # Put the directory three levels above this file on the import path so
        # the top-level retrosheet_data module can be imported.
        sys.path.insert(0, str(Path(__file__).parent.parent.parent))

        import retrosheet_data as rd

        # Configure the module's globals
        rd.CARDSET_ID = cardset_id
        rd.PLAYER_DESCRIPTION = description
        rd.START_DATE = int(start_date)
        rd.END_DATE = int(end_date)
        rd.SEASON_PCT = season_pct
        rd.MIN_PA_VL = min_pa_vl
        rd.MIN_PA_VR = min_pa_vr
        # The TBF minimums are driven by the same values as the PA minimums.
        rd.MIN_TBF_VL = min_pa_vl
        rd.MIN_TBF_VR = min_pa_vr
        rd.POST_DATA = post_data
        rd.EVENTS_FILENAME = events_file
        rd.DATA_INPUT_FILE_PATH = data_input
        rd.LAST_WEEK_RATIO = last_week_ratio
        rd.LAST_TWOWEEKS_RATIO = last_twoweeks_ratio
        rd.LAST_MONTH_RATIO = last_month_ratio

        console.print("[bold]Starting Retrosheet processing...[/bold]")
        console.print()

        # Run the main function (args is legacy, pass empty list)
        asyncio.run(rd.main([]))

        console.print()
        console.print("=" * 70)
        console.print("[bold green]✓ RETROSHEET PROCESSING COMPLETE[/bold green]")
        console.print("=" * 70)

    except ImportError as e:
        console.print(f"[red]Error importing modules: {e}[/red]")
        console.print("Make sure you're running from the card-creation directory")
        raise typer.Exit(1)
    except Exception as e:
        console.print(f"[red]Error: {e}[/red]")
        import traceback

        traceback.print_exc()
        raise typer.Exit(1)
|
|
|
|
|
|
@app.command()
def arms(
    year: int = typer.Argument(..., help="Season year"),
    events_file: Path = typer.Option(
        ..., "--events", "-e", help="Retrosheet events CSV file"
    ),
    output: Optional[Path] = typer.Option(
        None, "--output", "-o", help="Output CSV file"
    ),
    season_pct: float = typer.Option(
        1.0, "--season-pct", help="Season percentage for min sample"
    ),
):
    """
    Generate outfield arm ratings from Retrosheet data.

    Analyzes play-by-play events to calculate OF arm strength ratings.

    Example:
        pd-cards retrosheet arms 2005 --events data-input/retrosheet/retrosheets_events_2005.csv
    """
    # NOTE: the docstring above is rendered by Typer as the command's --help
    # text, so implementation notes live in comments instead.
    #
    # Flow: check the events file exists -> load it with pandas -> hand the
    # frame to calculate_of_arms_from_retrosheet -> flatten the result into one
    # row per key -> write a CSV -> print a per-position summary table.
    # Exits with status 1 when the events file is missing, an import fails or
    # any error is raised while calculating.
    console.print()
    console.print("=" * 70)
    console.print(f"[bold]OUTFIELD ARM RATINGS - {year}[/bold]")
    console.print("=" * 70)

    if not events_file.exists():
        console.print(f"[red]Events file not found: {events_file}[/red]")
        raise typer.Exit(1)

    if output is None:
        output = Path(f"data-output/retrosheet_arm_ratings_{year}.csv")

    console.print(f"Events file: {events_file}")
    console.print(f"Output: {output}")
    console.print(f"Season %: {season_pct:.0%}")
    console.print()

    try:
        import sys

        # Put the directory three levels above this file on the import path so
        # the top-level defenders package can be imported.
        sys.path.insert(0, str(Path(__file__).parent.parent.parent))

        import pandas as pd
        from defenders.retrosheet_arm_calculator import (
            calculate_of_arms_from_retrosheet,
        )

        console.print("Loading Retrosheet events...")
        df_events = pd.read_csv(events_file)
        console.print(f" Loaded {len(df_events)} events")

        console.print("Calculating arm ratings...")
        arm_ratings = calculate_of_arms_from_retrosheet(
            df_events, season_pct=season_pct
        )

        # Convert to DataFrame for CSV output
        output_columns = [
            "player_id",
            "position",
            "season",
            "arm_rating",
            "balls_fielded",
            "total_assists",
            "assist_rate",
            "z_score",
        ]
        rows = []
        for key, rating in arm_ratings.items():
            # Keys are split on their last underscore into player id and
            # position; the position is upper-cased for the output.
            player_id, position = key.rsplit("_", 1)
            rows.append(
                {
                    "player_id": player_id,
                    "position": position.upper(),
                    "season": year,
                    "arm_rating": rating.get("arm_rating", 0),
                    "balls_fielded": rating.get("balls_fielded", 0),
                    "total_assists": rating.get("total_assists", 0),
                    "assist_rate": rating.get("assist_rate", 0),
                    "z_score": rating.get("z_score", 0),
                }
            )

        if not rows:
            console.print(
                "[yellow]No arm ratings were produced for the given events[/yellow]"
            )

        # Passing the columns explicitly keeps the frame well-formed when no
        # ratings came back; without them an empty frame has no "position" or
        # "arm_rating" column and sort_values raises KeyError.
        df_output = pd.DataFrame(rows, columns=output_columns)
        df_output = df_output.sort_values(["position", "arm_rating"])

        # Ensure output directory exists
        output.parent.mkdir(parents=True, exist_ok=True)

        df_output.to_csv(output, index=False)
        console.print()
        console.print(
            f"[green]✓ Saved {len(df_output)} arm ratings to {output}[/green]"
        )

        # Show distribution summary
        from rich.table import Table

        table = Table(title="Arm Rating Distribution")
        table.add_column("Position")
        table.add_column("Count", justify="right")
        table.add_column("Avg Rating", justify="right")
        table.add_column("Min", justify="right")
        table.add_column("Max", justify="right")

        for pos in ["LF", "CF", "RF"]:
            pos_df = df_output[df_output["position"] == pos]
            # Positions with no rows are left out of the table entirely.
            if len(pos_df) > 0:
                table.add_row(
                    pos,
                    str(len(pos_df)),
                    f"{pos_df['arm_rating'].mean():.1f}",
                    str(int(pos_df["arm_rating"].min())),
                    str(int(pos_df["arm_rating"].max())),
                )

        console.print()
        console.print(table)

    except ImportError as e:
        console.print(f"[red]Error importing modules: {e}[/red]")
        raise typer.Exit(1)
    except Exception as e:
        console.print(f"[red]Error: {e}[/red]")
        import traceback

        traceback.print_exc()
        raise typer.Exit(1)
|
|
|
|
|
|
@app.command()
def validate(
    cardset_id: int = typer.Argument(..., help="Cardset ID to validate"),
    api_url: str = typer.Option(
        "https://pd.manticorum.com/api", "--api", help="API URL"
    ),
):
    """
    Validate positions for a cardset.

    Checks for anomalous DH counts and missing outfield positions.

    Example:
        pd-cards retrosheet validate 27
    """
    # NOTE: the docstring above is rendered by Typer as the command's --help
    # text, so implementation notes live in comments instead.
    #
    # NOTE(review): api_url is accepted on the command line but is never read
    # in this function; db_get is called without it. Confirm whether it should
    # be forwarded to db_calls or the option retired.
    #
    # Exits with status 1 when an import fails or any error is raised while
    # validating. Reported position issues do not change the exit status.
    console.print()
    console.print("=" * 70)
    console.print(f"[bold]POSITION VALIDATION - Cardset {cardset_id}[/bold]")
    console.print("=" * 70)

    try:
        import sys

        # Put the directory three levels above this file on the import path so
        # the top-level db_calls module can be imported.
        sys.path.insert(0, str(Path(__file__).parent.parent.parent))
        from db_calls import db_get

        # Largest primary-DH count that is not reported as anomalous.
        max_dh_count = 5

        async def run_validation():
            # Get all players in cardset
            console.print(f"Fetching players from cardset {cardset_id}...")
            players = await db_get("players", params=[("cardset_id", cardset_id)])

            if players is None or players["count"] == 0:
                console.print(
                    f"[yellow]No players found in cardset {cardset_id}[/yellow]"
                )
                return

            console.print(f"Found {players['count']} players")
            console.print()

            # Count positions
            pos_counts = {}
            dh_players = []

            for player in players["players"]:
                pos_1 = player.get("pos_1", "")
                # Players with an empty or missing pos_1 are not counted.
                if pos_1:
                    pos_counts[pos_1] = pos_counts.get(pos_1, 0) + 1
                    if pos_1 == "DH":
                        # .get keeps one incomplete record from aborting the
                        # whole report with a KeyError.
                        dh_players.append(
                            f"{player.get('p_name', 'Unknown')} (ID: {player.get('player_id', 'Unknown')})"
                        )

            # Display position counts
            from rich.table import Table

            table = Table(title="Position Distribution")
            table.add_column("Position")
            table.add_column("Count", justify="right")

            for pos in sorted(pos_counts.keys()):
                count = pos_counts[pos]
                style = "red" if pos == "DH" and count > max_dh_count else None
                table.add_row(pos, str(count), style=style)

            console.print(table)
            console.print()

            # Check for anomalies
            issues = []

            dh_count = pos_counts.get("DH", 0)
            if dh_count > max_dh_count:
                # The message states the same bound the check enforces; it
                # previously read "<5" while the check allowed exactly 5.
                issues.append(
                    f"[red]⚠ Anomalous DH count: {dh_count} (should be <={max_dh_count} for full-season cards)[/red]"
                )
                console.print("DH Players:")
                for p in dh_players[:10]:
                    console.print(f" - {p}")
                if len(dh_players) > 10:
                    console.print(f" ... and {len(dh_players) - 10} more")
                console.print()

            for pos in ["LF", "CF", "RF"]:
                if pos_counts.get(pos, 0) == 0:
                    issues.append(
                        f"[red]⚠ Missing {pos} positions (indicates defensive calculation failures)[/red]"
                    )

            if issues:
                console.print("[bold]Issues Found:[/bold]")
                for issue in issues:
                    console.print(f" {issue}")
            else:
                console.print("[green]✓ No position anomalies detected[/green]")

        asyncio.run(run_validation())

    except ImportError as e:
        console.print(f"[red]Error importing modules: {e}[/red]")
        raise typer.Exit(1)
    except Exception as e:
        console.print(f"[red]Error: {e}[/red]")
        # Print the traceback as the other commands in this module do, so
        # failures here can be diagnosed too.
        import traceback

        traceback.print_exc()
        raise typer.Exit(1)
|
|
|
|
|
|
@app.command()
def defense(
    year: int = typer.Argument(..., help="Season year to fetch defense stats for"),
    output_dir: Optional[Path] = typer.Option(
        None, "--output", "-o", help="Output directory for CSV files"
    ),
):
    """
    Fetch and store defensive statistics from Baseball Reference.

    Downloads fielding stats for all positions and saves to CSV files.

    Example:
        pd-cards retrosheet defense 2005 --output "data-input/2005 Live Cardset/"
    """
    # NOTE: the docstring above is rendered by Typer as the command's --help
    # text, so implementation notes live in comments instead.
    #
    # Flow: resolve the output directory -> for each position code call
    # cde.get_bbref_fielding_df(position, year) and write the result to
    # defense_<position>.csv, pausing between requests.
    # Exits with status 1 when an import fails or any error is raised while
    # fetching or writing.
    console.print()
    console.print("=" * 70)
    console.print(f"[bold]FETCH DEFENSIVE STATS - {year}[/bold]")
    console.print("=" * 70)

    if output_dir is None:
        output_dir = Path(f"data-input/{year} Live Cardset/")

    console.print(f"Output directory: {output_dir}")
    console.print()

    try:
        import sys

        # Put the directory three levels above this file on the import path so
        # the top-level defenders package can be imported.
        sys.path.insert(0, str(Path(__file__).parent.parent.parent))

        import defenders.calcs_defense as cde

        positions = ["c", "1b", "2b", "3b", "ss", "lf", "cf", "rf", "of", "p"]
        # Pause between consecutive requests.
        request_delay_seconds = 8

        output_dir.mkdir(parents=True, exist_ok=True)

        async def fetch_defense():
            final_index = len(positions) - 1
            for index, position in enumerate(positions):
                console.print(f"Fetching {position.upper()} defensive stats...")
                pos_df = cde.get_bbref_fielding_df(position, year)
                output_file = output_dir / f"defense_{position}.csv"
                pos_df.to_csv(output_file)
                console.print(
                    f" [green]✓ Saved {len(pos_df)} records to {output_file}[/green]"
                )
                # Rate limiting: only needed between requests, so skip the
                # pause once the last position has been saved.
                if index < final_index:
                    await asyncio.sleep(request_delay_seconds)

            console.print()
            console.print(f"[green]✓ All defensive stats saved to {output_dir}[/green]")

        asyncio.run(fetch_defense())

    except ImportError as e:
        console.print(f"[red]Error importing modules: {e}[/red]")
        raise typer.Exit(1)
    except Exception as e:
        console.print(f"[red]Error: {e}[/red]")
        import traceback

        traceback.print_exc()
        raise typer.Exit(1)
|