From f9411c5e550df49be1f8877393ad63491806e2e7 Mon Sep 17 00:00:00 2001 From: Cal Corum Date: Sat, 8 Nov 2025 16:15:37 -0600 Subject: [PATCH] CLAUDE: Add utility scripts for defense CSV column renaming MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Added one-time utility scripts used to prepare 2005 defense CSV files for compatibility with retrosheet_data.py. Scripts: - rename_defense_columns.py: Renamed initial batch of defense columns - RF/9 → range_factor_per_nine - RF/G → range_factor_per_game - DP → DP_def, E → E_def, Ch → chances, Inn → Inn_def - CS% → caught_stealing_perc, PO → pickoffs - Name-additional → key_bbref - rename_additional_defense_columns.py: Second batch of column renames - Fld% → fielding_perc - Rtot → tz_runs_total, Rtot/yr → tz_runs_total_per_season - Rtz → tz_runs_field, Rdp → tz_runs_infield - undo_po_rename.py: Reverted the PO → pickoffs rename for position players - Kept 'pickoffs' for defense_p.csv (pitchers) - Changed back to 'PO' for all other positions (c, 1b, 2b, etc.) - test_retrosheet_integration.py: Integration test for retrosheet_data.py with the new transformer - Validates batting and pitching stats loading - Loads the full 2005 regular-season date range (20050403 to 20051003) - Prints play and qualified-player counts for manual review These scripts have already been executed and the defense files are properly formatted. Kept for historical reference and documentation. 
🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- scripts/rename_additional_defense_columns.py | 69 +++++++++++++++++++ scripts/rename_defense_columns.py | 71 ++++++++++++++++++++ scripts/test_retrosheet_integration.py | 56 +++++++++++++++ scripts/undo_po_rename.py | 48 +++++++++++++ 4 files changed, 244 insertions(+) create mode 100644 scripts/rename_additional_defense_columns.py create mode 100644 scripts/rename_defense_columns.py create mode 100644 scripts/test_retrosheet_integration.py create mode 100644 scripts/undo_po_rename.py diff --git a/scripts/rename_additional_defense_columns.py b/scripts/rename_additional_defense_columns.py new file mode 100644 index 0000000..eb1afe4 --- /dev/null +++ b/scripts/rename_additional_defense_columns.py @@ -0,0 +1,69 @@ +""" +Script to rename additional defense CSV column headers. +""" + +import pandas as pd +import os + +# Column mapping +COLUMN_MAPPING = { + 'Fld%': 'fielding_perc', + 'Rtot': 'tz_runs_total', + 'Rtot/yr': 'tz_runs_total_per_season', + 'Rtz': 'tz_runs_field', + 'Rdp': 'tz_runs_infield' +} + +# Directory with defense files +DATA_DIR = 'data-input/2005 Live Cardset/' + +# All defense files +DEFENSE_FILES = [ + 'defense_c.csv', + 'defense_1b.csv', + 'defense_2b.csv', + 'defense_3b.csv', + 'defense_ss.csv', + 'defense_lf.csv', + 'defense_cf.csv', + 'defense_rf.csv', + 'defense_of.csv', + 'defense_p.csv' +] + +print("Renaming additional defense CSV columns...") +print(f"Column mappings:") +for old, new in COLUMN_MAPPING.items(): + print(f" {old} -> {new}") +print() + +for filename in DEFENSE_FILES: + filepath = os.path.join(DATA_DIR, filename) + + if not os.path.exists(filepath): + print(f"⚠ Skipping {filename} (not found)") + continue + + # Read CSV + df = pd.read_csv(filepath) + + # Track which columns were renamed + renamed = [] + for old_col, new_col in COLUMN_MAPPING.items(): + if old_col in df.columns: + renamed.append(f"{old_col} -> {new_col}") + + # Rename columns 
+ df = df.rename(columns=COLUMN_MAPPING) + + # Save back + df.to_csv(filepath, index=False) + + if renamed: + print(f"✓ {filename}: Renamed {len(renamed)} columns") + for r in renamed: + print(f" {r}") + else: + print(f" {filename}: No matching columns found") + +print("\n✓ All defense files processed successfully!") diff --git a/scripts/rename_defense_columns.py b/scripts/rename_defense_columns.py new file mode 100644 index 0000000..1765533 --- /dev/null +++ b/scripts/rename_defense_columns.py @@ -0,0 +1,71 @@ +""" +Script to rename defense CSV column headers to match expected format. +""" + +import pandas as pd +import os + +# Column mapping +COLUMN_MAPPING = { + 'RF/9': 'range_factor_per_nine', + 'RF/G': 'range_factor_per_game', + 'DP': 'DP_def', + 'E': 'E_def', + 'Ch': 'chances', + 'Inn': 'Inn_def', + 'CS%': 'caught_stealing_perc', + 'PO': 'pickoffs', + 'Name-additional': 'key_bbref' +} + +# Directory with defense files +DATA_DIR = 'data-input/2005 Live Cardset/' + +# Defense files to process +DEFENSE_FILES = [ + 'defense_c.csv', + 'defense_1b.csv', + 'defense_2b.csv', + 'defense_3b.csv', + 'defense_ss.csv', + 'defense_lf.csv', + 'defense_cf.csv', + 'defense_rf.csv', + 'defense_of.csv', + 'defense_p.csv' +] + +print("Renaming defense CSV columns...") +print(f"Column mappings:") +for old, new in COLUMN_MAPPING.items(): + print(f" {old} -> {new}") +print() + +for filename in DEFENSE_FILES: + filepath = os.path.join(DATA_DIR, filename) + + if not os.path.exists(filepath): + print(f"⚠ Skipping {filename} (not found)") + continue + + # Read CSV + df = pd.read_csv(filepath) + + # Track which columns were renamed + renamed = [] + for old_col, new_col in COLUMN_MAPPING.items(): + if old_col in df.columns: + renamed.append(f"{old_col} -> {new_col}") + + # Rename columns + df = df.rename(columns=COLUMN_MAPPING) + + # Save back + df.to_csv(filepath, index=False) + + print(f"✓ {filename}: Renamed {len(renamed)} columns") + if renamed: + for r in renamed: + print(f" {r}") + 
+print("\n✓ All defense files processed successfully!") diff --git a/scripts/test_retrosheet_integration.py b/scripts/test_retrosheet_integration.py new file mode 100644 index 0000000..62f0bab --- /dev/null +++ b/scripts/test_retrosheet_integration.py @@ -0,0 +1,56 @@ +""" +Test script to verify retrosheet_data.py works with the new transformer. +""" + +import sys +import logging +from retrosheet_data import get_base_batting_df, get_base_pitching_df, RETRO_FILE_PATH, EVENTS_FILENAME + +# Set up logging +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' +) + +print(f"\n{'='*60}") +print(f"Testing Retrosheet Data Integration") +print(f"{'='*60}\n") + +file_path = f'{RETRO_FILE_PATH}{EVENTS_FILENAME}' +print(f"Loading from: {file_path}") + +# Test date range (full 2005 season) +start_date = 20050403 # Opening Day 2005 +end_date = 20051003 # End of 2005 regular season + +print(f"\nTest 1: Loading Batting Data") +print(f"Date range: {start_date} to {end_date}") + +try: + all_plays_batting, batting_stats = get_base_batting_df(file_path, start_date, end_date) + print(f"✓ Success!") + print(f" - Total plays loaded: {len(all_plays_batting)}") + print(f" - Qualified batters: {len(batting_stats)}") + print(f"\nSample batting stats:") + print(batting_stats.head(3)) +except Exception as e: + print(f"✗ Failed: {e}") + sys.exit(1) + +print(f"\n{'-'*60}\n") +print(f"Test 2: Loading Pitching Data") + +try: + all_plays_pitching, pitching_stats = get_base_pitching_df(file_path, start_date, end_date) + print(f"✓ Success!") + print(f" - Total plays loaded: {len(all_plays_pitching)}") + print(f" - Qualified pitchers: {len(pitching_stats)}") + print(f"\nSample pitching stats:") + print(pitching_stats.head(3)) +except Exception as e: + print(f"✗ Failed: {e}") + sys.exit(1) + +print(f"\n{'='*60}") +print(f"All tests passed! 
✓") +print(f"{'='*60}\n") diff --git a/scripts/undo_po_rename.py b/scripts/undo_po_rename.py new file mode 100644 index 0000000..cc21421 --- /dev/null +++ b/scripts/undo_po_rename.py @@ -0,0 +1,48 @@ +""" +Script to undo PO -> pickoffs rename for position player defense files. +Keeps the rename for defense_p.csv (pitchers). +""" + +import pandas as pd +import os + +# Directory with defense files +DATA_DIR = 'data-input/2005 Live Cardset/' + +# Position player defense files (NOT pitchers) +POSITION_DEFENSE_FILES = [ + 'defense_c.csv', + 'defense_1b.csv', + 'defense_2b.csv', + 'defense_3b.csv', + 'defense_ss.csv', + 'defense_lf.csv', + 'defense_cf.csv', + 'defense_rf.csv', + 'defense_of.csv' +] + +print("Undoing PO -> pickoffs rename for position player defense files...") +print("(Keeping pickoffs for defense_p.csv)") +print() + +for filename in POSITION_DEFENSE_FILES: + filepath = os.path.join(DATA_DIR, filename) + + if not os.path.exists(filepath): + print(f"⚠ Skipping {filename} (not found)") + continue + + # Read CSV + df = pd.read_csv(filepath) + + # Rename pickoffs back to PO if it exists + if 'pickoffs' in df.columns: + df = df.rename(columns={'pickoffs': 'PO'}) + df.to_csv(filepath, index=False) + print(f"✓ {filename}: Renamed pickoffs -> PO") + else: + print(f"⚠ {filename}: Column 'pickoffs' not found") + +print("\n✓ Position player defense files updated!") +print(" defense_p.csv still has 'pickoffs' column")