"""
Retrosheet historical data processing commands.

Commands for generating cards from historical Retrosheet play-by-play data.
"""

import asyncio
import re
import sys
import traceback
from collections import Counter
from pathlib import Path
from typing import Optional

import typer
from rich.console import Console
from rich.markup import escape
from rich.table import Table

app = typer.Typer(no_args_is_help=True)
console = Console()

# Width of the "=" rule printed above and below each command banner.
_BANNER_WIDTH = 70

# `validate` reports a DH count strictly greater than this as anomalous.
_MAX_EXPECTED_DH = 5

# `validate` lists at most this many DH players individually.
_MAX_DH_LISTED = 10

# Exactly eight ASCII digits (YYYYMMDD).
_DATE_PATTERN = re.compile(r"[0-9]{8}")

# Directory three levels above this file; the legacy modules imported lazily
# by the commands (retrosheet_data, defenders, db_calls) are resolved from it.
_PROJECT_ROOT = Path(__file__).parent.parent.parent

# Column order of the CSV written by `arms`. Declared explicitly so that an
# empty result still yields a frame with these columns.
_ARM_RATING_COLUMNS = [
    'player_id',
    'position',
    'season',
    'arm_rating',
    'balls_fielded',
    'total_assists',
    'assist_rate',
    'z_score',
]


def _ensure_project_root_on_path() -> None:
    """Put the project root at the front of ``sys.path`` (without re-adding it)."""
    root = str(_PROJECT_ROOT)
    if not sys.path or sys.path[0] != root:
        sys.path.insert(0, root)


def _print_banner(title: str) -> None:
    """Print a blank line followed by ``title`` framed by two "=" rules."""
    rule = "=" * _BANNER_WIDTH
    console.print()
    console.print(rule)
    console.print(f"[bold]{title}[/bold]")
    console.print(rule)


def _print_error(exc: BaseException, *, prefix: str = "Error", show_traceback: bool = False) -> None:
    """Print ``exc`` in red; optionally follow it with the active traceback.

    The exception text is escaped so that square brackets inside it are not
    interpreted as Rich markup (which could itself raise while reporting).
    Must be called from inside an ``except`` block when ``show_traceback`` is
    true, because ``traceback.print_exc`` reads the exception being handled.
    """
    console.print(f"[red]{prefix}: {escape(str(exc))}[/red]")
    if show_traceback:
        traceback.print_exc()


def _parse_date_option(value: str, option_name: str) -> int:
    """Validate a YYYYMMDD string and return it as an integer.

    The check is intentionally lenient (eight digits, month 01-12, day 01-31)
    rather than a strict calendar check: the value is handed on as a plain
    integer, so a bound such as 20050931 is still accepted.

    Raises:
        typer.Exit: with code 1 after printing a message when ``value`` is
            not a plausible YYYYMMDD date.
    """
    plausible = False
    if _DATE_PATTERN.fullmatch(value):
        month, day = int(value[4:6]), int(value[6:8])
        plausible = 1 <= month <= 12 and 1 <= day <= 31
    if not plausible:
        console.print(
            f"[red]Invalid {option_name} value {escape(repr(value))}: "
            "expected a date in YYYYMMDD format[/red]"
        )
        raise typer.Exit(1)
    return int(value)


# NOTE: the docstrings of the command functions below are shown verbatim by
# `--help`, so implementation notes are kept in comments instead.


@app.command()
def process(
    year: int = typer.Argument(..., help="Season year to process (e.g., 2005)"),
    cardset_id: int = typer.Option(..., "--cardset-id", "-c", help="Target cardset ID"),
    description: str = typer.Option("Live", "--description", "-d", help="Player description ('Live' or 'Month PotM')"),
    start_date: Optional[str] = typer.Option(None, "--start", help="Start date YYYYMMDD (defaults to March 1)"),
    end_date: Optional[str] = typer.Option(None, "--end", help="End date YYYYMMDD (defaults to Oct 2)"),
    events_file: Optional[str] = typer.Option(None, "--events", "-e", help="Retrosheet events CSV filename"),
    data_input: Optional[str] = typer.Option(None, "--input", "-i", help="Data input directory path"),
    season_pct: float = typer.Option(1.0, "--season-pct", help="Season percentage (0.0-1.0)"),
    min_pa_vl: Optional[int] = typer.Option(None, "--min-pa-vl", help="Minimum PA vs LHP (default: 20 Live, 1 PotM)"),
    min_pa_vr: Optional[int] = typer.Option(None, "--min-pa-vr", help="Minimum PA vs RHP (default: 40 Live, 1 PotM)"),
    post_data: bool = typer.Option(True, "--post/--no-post", help="Post data to database"),
    dry_run: bool = typer.Option(False, "--dry-run", "-n", help="Preview without saving to database"),
    last_week_ratio: float = typer.Option(0.0, "--last-week-ratio", help="Recency bias: weight for last week's stats (0.0-1.0)"),
    last_twoweeks_ratio: Optional[float] = typer.Option(None, "--last-twoweeks-ratio", help="Recency bias: weight for last 2 weeks' stats (0.0-1.0, default: 0.2 for Live after May 30)"),
    last_month_ratio: float = typer.Option(0.0, "--last-month-ratio", help="Recency bias: weight for last month's stats (0.0-1.0)"),
):
    """
    Process Retrosheet data and create player cards.

    Generates batting and pitching cards from historical play-by-play data.

    Example:
        pd-cards retrosheet process 2005 --cardset-id 27 --description Live
    """
    # Flow: resolve defaults -> validate dates -> print the effective
    # configuration -> (dry run stops here) -> copy the configuration onto the
    # legacy `retrosheet_data` module's globals and run its async `main`.
    # Exits with code 1 on invalid dates, import failure, or any processing
    # error; exits with code 0 after a dry run.
    _print_banner(f"RETROSHEET PROCESSING - {year}")

    # Calculate defaults based on description
    is_live = 'live' in description.lower()

    if min_pa_vl is None:
        min_pa_vl = 20 if is_live else 1
    if min_pa_vr is None:
        min_pa_vr = 40 if is_live else 1
    if start_date is None:
        start_date = f"{year}0301"
    if end_date is None:
        end_date = f"{year}1002"
    if events_file is None:
        events_file = f"retrosheets_events_{year}.csv"
    if data_input is None:
        data_input = f"data-input/{year} Live Cardset/"

    # Validate before anything slices or converts the dates, so a typo gives
    # a clear message and the dry run's "Validation passed" is meaningful.
    start_int = _parse_date_option(start_date, "--start")
    end_int = _parse_date_option(end_date, "--end")
    if start_int > end_int:
        console.print(f"[red]Start date {start_date} is after end date {end_date}[/red]")
        raise typer.Exit(1)

    # Auto-enable recency bias for Live series after May 30
    if last_twoweeks_ratio is None:
        end_mmdd = end_int % 10_000  # MMDD part of YYYYMMDD
        if is_live and end_mmdd > 530:
            last_twoweeks_ratio = 0.2
            console.print("[cyan]Auto-enabled recency bias (end date > May 30)[/cyan]")
        else:
            last_twoweeks_ratio = 0.0

    console.print(f"Cardset ID: {cardset_id}")
    console.print(f"Description: {description}")
    console.print(f"Date Range: {start_date} - {end_date}")
    console.print(f"Season %: {season_pct:.0%}")
    console.print(f"Min PA: vL={min_pa_vl}, vR={min_pa_vr}")
    console.print(f"Events: {events_file}")
    console.print(f"Input: {data_input}")
    if last_week_ratio > 0 or last_twoweeks_ratio > 0 or last_month_ratio > 0:
        console.print(f"Recency Bias: week={last_week_ratio}, 2weeks={last_twoweeks_ratio}, month={last_month_ratio}")
    console.print()

    if dry_run:
        console.print("[yellow]DRY RUN - no changes will be made[/yellow]")
        console.print()
        console.print("[green]Validation passed - ready to run[/green]")
        console.print()
        console.print("To run for real, remove --dry-run flag")
        raise typer.Exit(0)

    # Import and configure the retrosheet module
    try:
        _ensure_project_root_on_path()
        import retrosheet_data as rd

        # Configure the module's globals
        rd.CARDSET_ID = cardset_id
        rd.PLAYER_DESCRIPTION = description
        rd.START_DATE = start_int
        rd.END_DATE = end_int
        rd.SEASON_PCT = season_pct
        rd.MIN_PA_VL = min_pa_vl
        rd.MIN_PA_VR = min_pa_vr
        # The pitcher (TBF) minimums reuse the batter (PA) minimums.
        rd.MIN_TBF_VL = min_pa_vl
        rd.MIN_TBF_VR = min_pa_vr
        rd.POST_DATA = post_data
        rd.EVENTS_FILENAME = events_file
        rd.DATA_INPUT_FILE_PATH = data_input
        rd.LAST_WEEK_RATIO = last_week_ratio
        rd.LAST_TWOWEEKS_RATIO = last_twoweeks_ratio
        rd.LAST_MONTH_RATIO = last_month_ratio

        console.print("[bold]Starting Retrosheet processing...[/bold]")
        console.print()

        # Run the main function (args is legacy, pass empty list)
        asyncio.run(rd.main([]))

        console.print()
        console.print("=" * _BANNER_WIDTH)
        console.print("[bold green]✓ RETROSHEET PROCESSING COMPLETE[/bold green]")
        console.print("=" * _BANNER_WIDTH)

    except ImportError as e:
        _print_error(e, prefix="Error importing modules")
        console.print("Make sure you're running from the card-creation directory")
        raise typer.Exit(1)
    except Exception as e:
        _print_error(e, show_traceback=True)
        raise typer.Exit(1)


@app.command()
def arms(
    year: int = typer.Argument(..., help="Season year"),
    events_file: Path = typer.Option(..., "--events", "-e", help="Retrosheet events CSV file"),
    output: Optional[Path] = typer.Option(None, "--output", "-o", help="Output CSV file"),
    season_pct: float = typer.Option(1.0, "--season-pct", help="Season percentage for min sample"),
):
    """
    Generate outfield arm ratings from Retrosheet data.

    Analyzes play-by-play events to calculate OF arm strength ratings.

    Example:
        pd-cards retrosheet arms 2005 --events data-input/retrosheet/retrosheets_events_2005.csv
    """
    # Reads the events CSV, delegates the rating calculation to
    # `defenders.retrosheet_arm_calculator`, writes one CSV row per
    # "<player_id>_<position>" key and prints a per-position summary.
    # Exits with code 1 when the events file is missing or on any error.
    _print_banner(f"OUTFIELD ARM RATINGS - {year}")

    if not events_file.exists():
        console.print(f"[red]Events file not found: {events_file}[/red]")
        raise typer.Exit(1)

    if output is None:
        output = Path(f"data-output/retrosheet_arm_ratings_{year}.csv")

    console.print(f"Events file: {events_file}")
    console.print(f"Output: {output}")
    console.print(f"Season %: {season_pct:.0%}")
    console.print()

    try:
        _ensure_project_root_on_path()
        import pandas as pd
        from defenders.retrosheet_arm_calculator import calculate_of_arms_from_retrosheet

        console.print("Loading Retrosheet events...")
        df_events = pd.read_csv(events_file)
        console.print(f" Loaded {len(df_events)} events")

        console.print("Calculating arm ratings...")
        arm_ratings = calculate_of_arms_from_retrosheet(df_events, season_pct=season_pct)

        # Convert to DataFrame for CSV output
        rows = []
        for key, rating in arm_ratings.items():
            # Split on the LAST underscore so player ids containing "_" survive.
            player_id, position = key.rsplit('_', 1)
            rows.append({
                'player_id': player_id,
                'position': position.upper(),
                'season': year,
                'arm_rating': rating.get('arm_rating', 0),
                'balls_fielded': rating.get('balls_fielded', 0),
                'total_assists': rating.get('total_assists', 0),
                'assist_rate': rating.get('assist_rate', 0),
                'z_score': rating.get('z_score', 0),
            })

        # Explicit columns keep the sort and the summary below working when
        # no ratings were produced (an empty frame would otherwise have no
        # 'position' column and raise KeyError).
        df_output = pd.DataFrame(rows, columns=_ARM_RATING_COLUMNS)
        df_output = df_output.sort_values(['position', 'arm_rating'])

        # Ensure output directory exists
        output.parent.mkdir(parents=True, exist_ok=True)
        df_output.to_csv(output, index=False)

        console.print()
        console.print(f"[green]✓ Saved {len(df_output)} arm ratings to {output}[/green]")

        # Show distribution summary
        table = Table(title="Arm Rating Distribution")
        table.add_column("Position")
        table.add_column("Count", justify="right")
        table.add_column("Avg Rating", justify="right")
        table.add_column("Min", justify="right")
        table.add_column("Max", justify="right")

        for pos in ['LF', 'CF', 'RF']:
            pos_df = df_output[df_output['position'] == pos]
            if len(pos_df) > 0:
                table.add_row(
                    pos,
                    str(len(pos_df)),
                    f"{pos_df['arm_rating'].mean():.1f}",
                    str(int(pos_df['arm_rating'].min())),
                    str(int(pos_df['arm_rating'].max())),
                )

        console.print()
        console.print(table)

    except ImportError as e:
        _print_error(e, prefix="Error importing modules")
        raise typer.Exit(1)
    except Exception as e:
        _print_error(e, show_traceback=True)
        raise typer.Exit(1)


@app.command()
def validate(
    cardset_id: int = typer.Argument(..., help="Cardset ID to validate"),
    api_url: str = typer.Option("https://pd.manticorum.com/api", "--api", help="API URL"),
):
    """
    Validate positions for a cardset.

    Checks for anomalous DH counts and missing outfield positions.

    Example:
        pd-cards retrosheet validate 27
    """
    # NOTE(review): `api_url` is accepted but never read in this function;
    # `db_get` is called without it - confirm whether it should be forwarded.
    # Exits with code 1 on import failure or any error while fetching; finding
    # position issues is reported but does not change the exit code.
    _print_banner(f"POSITION VALIDATION - Cardset {cardset_id}")

    try:
        _ensure_project_root_on_path()
        from db_calls import db_get

        async def run_validation():
            # Get all players in cardset
            console.print(f"Fetching players from cardset {cardset_id}...")
            players = await db_get('players', params=[('cardset_id', cardset_id)])

            if players is None or players['count'] == 0:
                console.print(f"[yellow]No players found in cardset {cardset_id}[/yellow]")
                return

            console.print(f"Found {players['count']} players")
            console.print()

            # Count primary positions; players without `pos_1` are skipped.
            pos_counts = Counter()
            dh_players = []
            for player in players['players']:
                pos_1 = player.get('pos_1', '')
                if not pos_1:
                    continue
                pos_counts[pos_1] += 1
                if pos_1 == 'DH':
                    dh_players.append(
                        f"{player.get('p_name', 'Unknown')} (ID: {player.get('player_id', 'Unknown')})"
                    )

            # Display position counts
            table = Table(title="Position Distribution")
            table.add_column("Position")
            table.add_column("Count", justify="right")

            for pos in sorted(pos_counts):
                count = pos_counts[pos]
                style = "red" if pos == 'DH' and count > _MAX_EXPECTED_DH else None
                table.add_row(pos, str(count), style=style)

            console.print(table)
            console.print()

            # Check for anomalies
            issues = []
            dh_count = pos_counts.get('DH', 0)
            if dh_count > _MAX_EXPECTED_DH:
                # The message states the same bound the check enforces.
                issues.append(
                    f"[red]⚠ Anomalous DH count: {dh_count} "
                    f"(should be <={_MAX_EXPECTED_DH} for full-season cards)[/red]"
                )
                console.print("DH Players:")
                for p in dh_players[:_MAX_DH_LISTED]:
                    console.print(f" - {p}")
                if len(dh_players) > _MAX_DH_LISTED:
                    console.print(f" ... and {len(dh_players) - _MAX_DH_LISTED} more")
                console.print()

            for pos in ['LF', 'CF', 'RF']:
                if pos_counts.get(pos, 0) == 0:
                    issues.append(f"[red]⚠ Missing {pos} positions (indicates defensive calculation failures)[/red]")

            if issues:
                console.print("[bold]Issues Found:[/bold]")
                for issue in issues:
                    console.print(f" {issue}")
            else:
                console.print("[green]✓ No position anomalies detected[/green]")

        asyncio.run(run_validation())

    except ImportError as e:
        _print_error(e, prefix="Error importing modules")
        raise typer.Exit(1)
    except Exception as e:
        _print_error(e)
        raise typer.Exit(1)


@app.command()
def defense(
    year: int = typer.Argument(..., help="Season year to fetch defense stats for"),
    output_dir: Optional[Path] = typer.Option(None, "--output", "-o", help="Output directory for CSV files"),
):
    """
    Fetch and store defensive statistics from Baseball Reference.

    Downloads fielding stats for all positions and saves to CSV files.

    Example:
        pd-cards retrosheet defense 2005 --output "data-input/2005 Live Cardset/"
    """
    # Writes one `defense_<position>.csv` per position into `output_dir`
    # (created if missing), pausing between requests. Exits with code 1 on
    # import failure or any error.
    _print_banner(f"FETCH DEFENSIVE STATS - {year}")

    if output_dir is None:
        output_dir = Path(f"data-input/{year} Live Cardset/")

    console.print(f"Output directory: {output_dir}")
    console.print()

    try:
        _ensure_project_root_on_path()
        import defenders.calcs_defense as cde

        positions = ['c', '1b', '2b', '3b', 'ss', 'lf', 'cf', 'rf', 'of', 'p']
        output_dir.mkdir(parents=True, exist_ok=True)

        async def fetch_defense():
            for position in positions:
                console.print(f"Fetching {position.upper()} defensive stats...")
                pos_df = cde.get_bbref_fielding_df(position, year)
                output_file = output_dir / f"defense_{position}.csv"
                pos_df.to_csv(output_file)
                console.print(f" [green]✓ Saved {len(pos_df)} records to {output_file}[/green]")
                # Kept after every request (including the last) so that
                # back-to-back invocations stay spaced out as well.
                await asyncio.sleep(8)  # Rate limiting

            console.print()
            console.print(f"[green]✓ All defensive stats saved to {output_dir}[/green]")

        asyncio.run(fetch_defense())

    except ImportError as e:
        _print_error(e, prefix="Error importing modules")
        raise typer.Exit(1)
    except Exception as e:
        _print_error(e, show_traceback=True)
        raise typer.Exit(1)