CLAUDE: Add utility scripts for defense CSV column renaming

Added one-time utility scripts used to prepare 2005 defense CSV files for compatibility with retrosheet_data.py.

Scripts:
- rename_defense_columns.py: Renamed initial batch of defense columns
  - RF/9 → range_factor_per_nine
  - RF/G → range_factor_per_game
  - DP → DP_def, E → E_def, Ch → chances, Inn → Inn_def
  - CS% → caught_stealing_perc, PO → pickoffs
  - Name-additional → key_bbref
- rename_additional_defense_columns.py: Second batch of column renames
  - Fld% → fielding_perc
  - Rtot → tz_runs_total, Rtot/yr → tz_runs_total_per_season
  - Rtz → tz_runs_field, Rdp → tz_runs_infield
- undo_po_rename.py: Reverted PO → pickoffs for position players
  - Kept 'pickoffs' for defense_p.csv (pitchers)
  - Changed back to 'PO' for all other positions (c, 1b, 2b, etc.)
- test_retrosheet_integration.py: Integration test for retrosheet_transformer
  - Validates batting and pitching stats loading
  - Tests date range filtering
  - Verifies player counts

These scripts have already been executed and the defense files are properly formatted. Kept for historical reference and documentation.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-08 16:15:37 -06:00 · 2025-11-08 16:15:37 -06:00 · f9411c5e55
commit f9411c5e55
parent 4e9e8d351d
4 changed files with 244 additions and 0 deletions
--- a/scripts/rename_additional_defense_columns.py
+++ b/scripts/rename_additional_defense_columns.py
@ -0,0 +1,69 @@
+"""
+Script to rename additional defense CSV column headers.
+"""
+
+import pandas as pd
+import os
+
+# Source header -> replacement header (second batch of renames)
+COLUMN_MAPPING = {
+    'Fld%': 'fielding_perc',
+    'Rtot': 'tz_runs_total',
+    'Rtot/yr': 'tz_runs_total_per_season',
+    'Rtz': 'tz_runs_field',
+    'Rdp': 'tz_runs_infield'
+}
+
+# Directory with defense files (resolved against the current working directory)
+DATA_DIR = 'data-input/2005 Live Cardset/'
+
+# All defense files
+DEFENSE_FILES = [
+    'defense_c.csv',
+    'defense_1b.csv',
+    'defense_2b.csv',
+    'defense_3b.csv',
+    'defense_ss.csv',
+    'defense_lf.csv',
+    'defense_cf.csv',
+    'defense_rf.csv',
+    'defense_of.csv',
+    'defense_p.csv'
+]
+
+print("Renaming additional defense CSV columns...")
+print("Column mappings:")
+for old, new in COLUMN_MAPPING.items():
+    print(f"  {old} -> {new}")
+print()
+
+for filename in DEFENSE_FILES:
+    filepath = os.path.join(DATA_DIR, filename)
+
+    if not os.path.exists(filepath):
+        print(f"⚠ Skipping {filename} (not found)")
+        continue
+
+    # Read CSV
+    df = pd.read_csv(filepath)
+
+    # Restrict the mapping to the headers this file actually has
+    present = {
+        old_col: new_col
+        for old_col, new_col in COLUMN_MAPPING.items()
+        if old_col in df.columns
+    }
+
+    if not present:
+        # Leave the file untouched: a read/write round trip through pandas
+        # can reformat values even when no header changes.
+        print(f"  {filename}: No matching columns found")
+        continue
+
+    # Rename and save back in place
+    df.rename(columns=present).to_csv(filepath, index=False)
+    print(f"✓ {filename}: Renamed {len(present)} columns")
+    for old_col, new_col in present.items():
+        print(f"    {old_col} -> {new_col}")
+
+print("\n✓ All defense files processed successfully!")
--- a/scripts/rename_defense_columns.py
+++ b/scripts/rename_defense_columns.py
@ -0,0 +1,71 @@
+"""
+Script to rename defense CSV column headers to match expected format.
+"""
+
+import pandas as pd
+import os
+
+# Column mapping
+COLUMN_MAPPING = {
+    'RF/9': 'range_factor_per_nine',
+    'RF/G': 'range_factor_per_game',
+    'DP': 'DP_def',
+    'E': 'E_def',
+    'Ch': 'chances',
+    'Inn': 'Inn_def',
+    'CS%': 'caught_stealing_perc',
+    'PO': 'pickoffs',
+    'Name-additional': 'key_bbref'
+}
+
+# 'PO' is renamed in the pitcher file only: undo_po_rename.py had to revert
+# the 'PO' -> 'pickoffs' rename in every other defense file.
+PITCHER_FILE = 'defense_p.csv'
+PITCHER_ONLY_COLUMNS = {'PO'}
+
+# Directory with defense files
+DATA_DIR = 'data-input/2005 Live Cardset/'
+
+# Defense files to process
+DEFENSE_FILES = [
+    'defense_c.csv', 'defense_1b.csv', 'defense_2b.csv', 'defense_3b.csv',
+    'defense_ss.csv', 'defense_lf.csv', 'defense_cf.csv', 'defense_rf.csv',
+    'defense_of.csv', 'defense_p.csv'
+]
+
+print("Renaming defense CSV columns...")
+print("Column mappings:")
+for old, new in COLUMN_MAPPING.items():
+    scope = f" ({PITCHER_FILE} only)" if old in PITCHER_ONLY_COLUMNS else ""
+    print(f"  {old} -> {new}{scope}")
+print()
+
+for filename in DEFENSE_FILES:
+    filepath = os.path.join(DATA_DIR, filename)
+
+    if not os.path.exists(filepath):
+        print(f"⚠ Skipping {filename} (not found)")
+        continue
+
+    df = pd.read_csv(filepath)
+
+    # Headers to rename in this file; pitcher-only ones are kept elsewhere
+    present = {
+        old_col: new_col
+        for old_col, new_col in COLUMN_MAPPING.items()
+        if old_col in df.columns
+        and (filename == PITCHER_FILE or old_col not in PITCHER_ONLY_COLUMNS)
+    }
+
+    if not present:
+        # Nothing to change: skip the rewrite so the file stays as it was
+        print(f"  {filename}: No matching columns found")
+        continue
+
+    # Rename and save back in place
+    df.rename(columns=present).to_csv(filepath, index=False)
+    print(f"✓ {filename}: Renamed {len(present)} columns")
+    for old_col, new_col in present.items():
+        print(f"    {old_col} -> {new_col}")
+
+print("\n✓ All defense files processed successfully!")
--- a/scripts/test_retrosheet_integration.py
+++ b/scripts/test_retrosheet_integration.py
@ -0,0 +1,56 @@
+"""
+Test script to verify retrosheet_data.py works with the new transformer.
+"""
+
+import sys
+import logging
+from retrosheet_data import get_base_batting_df, get_base_pitching_df, RETRO_FILE_PATH, EVENTS_FILENAME
+
+# Set up logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+)
+
+file_path = f'{RETRO_FILE_PATH}{EVENTS_FILENAME}'
+
+# Test date range (full 2005 season)
+start_date = 20050403  # Opening Day 2005
+end_date = 20051003    # End of 2005 regular season
+
+
+def _run_check(loader, kind, players):
+    """Run one loader over the test window and print a summary.
+
+    loader is called as loader(file_path, start_date, end_date) and must
+    return (all_plays, stats).  Exits with status 1 if the call raises or
+    if either result is empty; otherwise returns the pair.
+    """
+    try:
+        all_plays, stats = loader(file_path, start_date, end_date)
+    except Exception as e:  # script boundary: log the traceback, then exit
+        logging.exception("Loading %s data raised", kind)
+        print(f"✗ Failed: {e}")
+        sys.exit(1)
+    if len(all_plays) == 0 or len(stats) == 0:  # len(), not truthiness: results look like DataFrames
+        print(f"✗ Failed: no {kind} plays or no qualified {players} loaded")
+        sys.exit(1)
+    print("✓ Success!")
+    print(f"  - Total plays loaded: {len(all_plays)}")
+    print(f"  - Qualified {players}: {len(stats)}")
+    print(f"\nSample {kind} stats:")
+    print(stats.head(3))
+    return all_plays, stats
+
+
+print(f"\n{'='*60}\nTesting Retrosheet Data Integration\n{'='*60}\n")
+print(f"Loading from: {file_path}")
+print("\nTest 1: Loading Batting Data")
+print(f"Date range: {start_date} to {end_date}")
+all_plays_batting, batting_stats = _run_check(get_base_batting_df, "batting", "batters")
+
+print(f"\n{'-'*60}\n")
+print("Test 2: Loading Pitching Data")
+all_plays_pitching, pitching_stats = _run_check(get_base_pitching_df, "pitching", "pitchers")
+
+print(f"\n{'='*60}\nAll tests passed! ✓\n{'='*60}\n")
--- a/scripts/undo_po_rename.py
+++ b/scripts/undo_po_rename.py
@ -0,0 +1,48 @@
+"""
+Script to undo PO -> pickoffs rename for position player defense files.
+Keeps the rename for defense_p.csv (pitchers).
+"""
+
+import pandas as pd
+import os
+
+# Folder that holds the defense CSVs
+DATA_DIR = 'data-input/2005 Live Cardset/'
+
+# Defense files for every position except pitcher (defense_p.csv is excluded)
+POSITION_DEFENSE_FILES = [
+    'defense_c.csv',
+    'defense_1b.csv',
+    'defense_2b.csv',
+    'defense_3b.csv',
+    'defense_ss.csv',
+    'defense_lf.csv',
+    'defense_cf.csv',
+    'defense_rf.csv',
+    'defense_of.csv'
+]
+
+print("Undoing PO -> pickoffs rename for position player defense files...")
+print("(Keeping pickoffs for defense_p.csv)")
+print()
+
+for csv_name in POSITION_DEFENSE_FILES:
+    csv_path = os.path.join(DATA_DIR, csv_name)
+    # Files that are absent are reported and skipped
+    if not os.path.exists(csv_path):
+        print(f"⚠ Skipping {csv_name} (not found)")
+        continue
+
+    frame = pd.read_csv(csv_path)
+
+    # Nothing to undo when the header is not there; the file is not rewritten
+    if 'pickoffs' not in frame.columns:
+        print(f"⚠ {csv_name}: Column 'pickoffs' not found")
+        continue
+
+    # Put the original header back and overwrite the file in place
+    frame.rename(columns={'pickoffs': 'PO'}).to_csv(csv_path, index=False)
+    print(f"✓ {csv_name}: Renamed pickoffs -> PO")
+
+print("\n✓ Position player defense files updated!")
+print("  defense_p.csv still has 'pickoffs' column")