""" Retrosheet CSV Format Transformer This module transforms newer Retrosheet CSV formats into the legacy format expected by retrosheet_data.py. Includes smart caching to avoid redundant transformations. Author: Claude Code """ import os import logging from pathlib import Path import pandas as pd import numpy as np # Set up logging logger = logging.getLogger(f'{__name__}') def get_normalized_csv_path(source_path: str) -> str: """ Generate the cached/normalized CSV path from source path. Args: source_path: Path to the source CSV file Returns: Path to the normalized cache file """ source = Path(source_path) cache_name = f"{source.stem}_normalized{source.suffix}" return str(source.parent / cache_name) def needs_transformation(source_path: str, cache_path: str) -> bool: """ Check if transformation is needed based on file modification times. Args: source_path: Path to source CSV cache_path: Path to cached normalized CSV Returns: True if transformation needed, False if cache is valid """ if not os.path.exists(cache_path): logger.info(f"Cache file not found: {cache_path}") return True source_mtime = os.path.getmtime(source_path) cache_mtime = os.path.getmtime(cache_path) if source_mtime > cache_mtime: logger.info(f"Source file is newer than cache, transformation needed") return True logger.info(f"Using cached normalized file: {cache_path}") return False def transform_event_type(row: pd.Series) -> str: """ Derive event_type from boolean columns in new format. Priority order matches baseball scoring conventions. """ if row['hr'] == 1: return 'home run' elif row['triple'] == 1: return 'triple' elif row['double'] == 1: return 'double' elif row['single'] == 1: return 'single' elif row['walk'] == 1 or row['iw'] == 1: return 'walk' elif row['k'] == 1: return 'strikeout' elif row['hbp'] == 1: return 'hit by pitch' else: return 'generic out' def transform_batted_ball_type(row: pd.Series) -> str: """ Derive batted_ball_type from boolean columns. Returns 'f' (fly), 'G' (ground), 'l' (line), or empty string. """ if row['fly'] == 1: return 'f' elif row['ground'] == 1: return 'G' elif row['line'] == 1: return 'l' else: return '' def transform_hit_val(row: pd.Series) -> str: """ Derive hit_val from hit type columns. Returns '1', '2', '3', '4' for singles through home runs. """ if row['hr'] == 1: return '4' elif row['triple'] == 1: return '3' elif row['double'] == 1: return '2' elif row['single'] == 1: return '1' else: return '' def bool_to_tf(val) -> str: """Convert 1/0 or True/False to 't'/'f' strings.""" if pd.isna(val): return 'f' return 't' if val == 1 or val is True else 'f' def transform_retrosheet_csv(source_path: str) -> pd.DataFrame: """ Transform new Retrosheet CSV format to legacy format. Args: source_path: Path to source CSV file Returns: Transformed DataFrame in legacy format """ logger.info(f"Reading source CSV: {source_path}") df = pd.read_csv(source_path, low_memory=False) logger.info(f"Transforming {len(df)} rows to legacy format") # Create new dataframe with legacy column names transformed = pd.DataFrame() # Simple renames (with case conversion for handedness) transformed['game_id'] = df['gid'] transformed['batter_id'] = df['batter'] transformed['pitcher_id'] = df['pitcher'] transformed['batter_hand'] = df['bathand'].str.lower() # Convert R/L to r/l transformed['pitcher_hand'] = df['pithand'].str.lower() # Convert R/L to r/l transformed['hit_location'] = df['loc'].astype(str) # Ensure string type for .str operations # Derive event_type from multiple columns logger.info("Deriving event_type from hit/walk/strikeout columns") transformed['event_type'] = df.apply(transform_event_type, axis=1) # Derive batted_ball_type logger.info("Deriving batted_ball_type from fly/ground/line columns") transformed['batted_ball_type'] = df.apply(transform_batted_ball_type, axis=1).astype(str) # Derive hit_val logger.info("Deriving hit_val from hit type columns") transformed['hit_val'] = df.apply(transform_hit_val, axis=1).astype(str) # Boolean conversions to 't'/'f' format logger.info("Converting boolean columns to 't'/'f' format") transformed['batter_event'] = df['pa'].apply(bool_to_tf) transformed['ab'] = df['ab'].apply(bool_to_tf) transformed['bunt'] = df['bunt'].apply(bool_to_tf) transformed['tp'] = df['tp'].apply(bool_to_tf) # Combine gdp + othdp for double play indicator transformed['dp'] = (df['gdp'].fillna(0) + df['othdp'].fillna(0)).apply(lambda x: 't' if x > 0 else 'f') # Use batter_hand as result_batter_hand (assumption: most batters don't switch mid-AB) # This may need refinement if we have switch hitter data transformed['result_batter_hand'] = df['bathand'].str.lower() # Convert R/L to r/l # Add placeholder columns that may be referenced but aren't critical for stats # These can be populated if needed in the future transformed['event_id'] = range(1, len(df) + 1) transformed['batting_team'] = '' transformed['inning'] = df['inning'] if 'inning' in df.columns else '' transformed['outs'] = '' transformed['balls'] = '' transformed['strikes'] = '' transformed['pitch_seq'] = '' transformed['vis_score'] = '' transformed['home_score'] = '' transformed['result_batter_id'] = df['batter'] transformed['result_pitcher_id'] = df['pitcher'] transformed['result_pitcher_hand'] = df['pithand'] transformed['def_c'] = '' transformed['def_1b'] = '' transformed['def_2b'] = '' transformed['def_3b'] = '' transformed['def_ss'] = '' transformed['def_lf'] = '' transformed['def_cf'] = '' transformed['def_rf'] = '' transformed['run_1b'] = '' transformed['run_2b'] = '' transformed['run_3b'] = '' transformed['event_scoring'] = '' transformed['leadoff'] = '' transformed['pinch_hit'] = '' transformed['batt_def_pos'] = '' transformed['batt_lineup_pos'] = '' transformed['sac_hit'] = df['sh'].apply(bool_to_tf) if 'sh' in df.columns else 'f' transformed['sac_fly'] = df['sf'].apply(bool_to_tf) if 'sf' in df.columns else 'f' transformed['event_outs'] = '' transformed['rbi'] = '' transformed['wild_pitch'] = df['wp'].apply(bool_to_tf) if 'wp' in df.columns else 'f' transformed['passed_ball'] = df['pb'].apply(bool_to_tf) if 'pb' in df.columns else 'f' transformed['fielded_by'] = '' transformed['foul_ground'] = '' logger.info(f"Transformation complete: {len(transformed)} rows") return transformed def load_retrosheet_csv(source_path: str, force_transform: bool = False) -> pd.DataFrame: """ Load Retrosheet CSV, using cached normalized version if available. This is the main entry point for loading Retrosheet data. It handles: - Checking for cached normalized data - Transforming if needed - Saving transformed data for future use Args: source_path: Path to source Retrosheet CSV force_transform: If True, ignore cache and force transformation Returns: DataFrame in legacy format ready for retrosheet_data.py """ logger.info(f"Loading Retrosheet CSV: {source_path}") if not os.path.exists(source_path): raise FileNotFoundError(f"Source file not found: {source_path}") cache_path = get_normalized_csv_path(source_path) # Check if we need to transform if force_transform or needs_transformation(source_path, cache_path): # Transform the data df = transform_retrosheet_csv(source_path) # Save to cache logger.info(f"Saving normalized data to cache: {cache_path}") df.to_csv(cache_path, index=False) logger.info(f"Cache saved successfully") return df else: # Load from cache logger.info(f"Loading from cache: {cache_path}") # Explicitly set dtypes for string columns to ensure .str accessor works dtype_dict = { 'game_id': 'str', 'hit_val': 'str', 'hit_location': 'str', 'batted_ball_type': 'str' } return pd.read_csv(cache_path, dtype=dtype_dict, low_memory=False) if __name__ == '__main__': # Test the transformer import sys logging.basicConfig( level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' ) if len(sys.argv) > 1: test_file = sys.argv[1] else: test_file = 'data-input/retrosheet/retrosheets_events_2005.csv' print(f"\n{'='*60}") print(f"Testing Retrosheet Transformer") print(f"{'='*60}\n") df = load_retrosheet_csv(test_file) print(f"\nTransformed DataFrame Info:") print(f"Shape: {df.shape}") print(f"\nColumns: {list(df.columns)}") print(f"\nSample rows:") print(df.head(3)) print(f"\nEvent type distribution:") print(df['event_type'].value_counts()) print(f"\nBatted ball type distribution:") print(df['batted_ball_type'].value_counts())