CLAUDE: Add utility scripts for defense CSV column renaming

Added one-time utility scripts used to prepare 2005 defense CSV files
for compatibility with retrosheet_data.py.

Scripts:
- rename_defense_columns.py: Renamed initial batch of defense columns
  - RF/9 → range_factor_per_nine
  - RF/G → range_factor_per_game
  - DP → DP_def, E → E_def, Ch → chances, Inn → Inn_def
  - CS% → caught_stealing_perc, PO → pickoffs
  - Name-additional → key_bbref

- rename_additional_defense_columns.py: Second batch of column renames
  - Fld% → fielding_perc
  - Rtot → tz_runs_total, Rtot/yr → tz_runs_total_per_season
  - Rtz → tz_runs_field, Rdp → tz_runs_infield

- undo_po_rename.py: Reverted PO → pickoffs for position players
  - Kept 'pickoffs' for defense_p.csv (pitchers)
  - Changed back to 'PO' for all other positions (c, 1b, 2b, etc.)

- test_retrosheet_integration.py: Integration test for retrosheet_transformer
  - Validates batting and pitching stats loading
  - Tests date range filtering
  - Verifies player counts

These scripts have already been executed and the defense files are
properly formatted. Kept for historical reference and documentation.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Cal Corum 2025-11-08 16:15:37 -06:00
parent 4e9e8d351d
commit f9411c5e55
4 changed files with 244 additions and 0 deletions

View File

@ -0,0 +1,69 @@
"""
Script to rename additional defense CSV column headers.
"""
import pandas as pd
import os
# Second batch of header renames: existing CSV header on the left, the
# replacement header on the right.
COLUMN_MAPPING = {
    'Fld%': 'fielding_perc',
    'Rtot': 'tz_runs_total',
    'Rtot/yr': 'tz_runs_total_per_season',
    'Rtz': 'tz_runs_field',
    'Rdp': 'tz_runs_infield'
}
# Directory with defense files
DATA_DIR = 'data-input/2005 Live Cardset/'
# All defense files
DEFENSE_FILES = [
    'defense_c.csv',
    'defense_1b.csv',
    'defense_2b.csv',
    'defense_3b.csv',
    'defense_ss.csv',
    'defense_lf.csv',
    'defense_cf.csv',
    'defense_rf.csv',
    'defense_of.csv',
    'defense_p.csv'
]


def rename_columns_in_file(filepath, mapping):
    """Rename CSV header columns in place and report what changed.

    Args:
        filepath: Path of the CSV file to update.
        mapping: Dict of ``{old_header: new_header}``.

    Returns:
        A list of ``"old -> new"`` strings, one per header that was present
        and renamed, in ``mapping`` order. The list is empty when no header
        matched, and in that case the file is not rewritten.
    """
    # Read every cell as text and keep blanks as empty strings so the rewrite
    # changes only the header row. Letting pandas infer dtypes would reformat
    # values on the way back out (e.g. ".987" -> "0.987", or an integer
    # column containing blanks being written as "5.0").
    df = pd.read_csv(filepath, dtype=str, keep_default_na=False)
    present = {old: new for old, new in mapping.items() if old in df.columns}
    if present:
        df.rename(columns=present).to_csv(filepath, index=False)
    return [f"{old} -> {new}" for old, new in present.items()]


def main():
    """Apply COLUMN_MAPPING to every file in DEFENSE_FILES under DATA_DIR."""
    print("Renaming additional defense CSV columns...")
    print("Column mappings:")
    for old, new in COLUMN_MAPPING.items():
        print(f" {old} -> {new}")
    print()
    for filename in DEFENSE_FILES:
        filepath = os.path.join(DATA_DIR, filename)
        if not os.path.exists(filepath):
            print(f"⚠ Skipping {filename} (not found)")
            continue
        renamed = rename_columns_in_file(filepath, COLUMN_MAPPING)
        if renamed:
            print(f"{filename}: Renamed {len(renamed)} columns")
            for entry in renamed:
                print(f" {entry}")
        else:
            print(f" {filename}: No matching columns found")
    print("\n✓ All defense files processed successfully!")


# Guard the entry point so importing this module never touches the data files.
if __name__ == "__main__":
    main()

View File

@ -0,0 +1,71 @@
"""
Script to rename defense CSV column headers to match expected format.
"""
import pandas as pd
import os
# First batch of header renames: existing CSV header on the left, the
# replacement header on the right.
COLUMN_MAPPING = {
    'RF/9': 'range_factor_per_nine',
    'RF/G': 'range_factor_per_game',
    'DP': 'DP_def',
    'E': 'E_def',
    'Ch': 'chances',
    'Inn': 'Inn_def',
    'CS%': 'caught_stealing_perc',
    'PO': 'pickoffs',
    'Name-additional': 'key_bbref'
}
# Headers whose rename is only valid in the pitcher file. Applying
# 'PO' -> 'pickoffs' to the position-player files was a mistake that had to be
# reverted afterwards by undo_po_rename.py, so it is restricted here.
PITCHER_FILE = 'defense_p.csv'
PITCHER_ONLY_COLUMNS = frozenset({'PO'})
# Directory with defense files
DATA_DIR = 'data-input/2005 Live Cardset/'
# Defense files to process
DEFENSE_FILES = [
    'defense_c.csv',
    'defense_1b.csv',
    'defense_2b.csv',
    'defense_3b.csv',
    'defense_ss.csv',
    'defense_lf.csv',
    'defense_cf.csv',
    'defense_rf.csv',
    'defense_of.csv',
    'defense_p.csv'
]


def mapping_for_file(filename):
    """Return the header renames that apply to ``filename``.

    Args:
        filename: File name (or path) of a defense CSV.

    Returns:
        A new dict: all of COLUMN_MAPPING for the pitcher file, and
        COLUMN_MAPPING minus PITCHER_ONLY_COLUMNS for every other file.
    """
    if os.path.basename(filename) == PITCHER_FILE:
        return dict(COLUMN_MAPPING)
    return {
        old: new
        for old, new in COLUMN_MAPPING.items()
        if old not in PITCHER_ONLY_COLUMNS
    }


def rename_columns_in_file(filepath, mapping):
    """Rename CSV header columns in place and report what changed.

    Args:
        filepath: Path of the CSV file to update.
        mapping: Dict of ``{old_header: new_header}``.

    Returns:
        A list of ``"old -> new"`` strings, one per header that was present
        and renamed, in ``mapping`` order. The list is empty when no header
        matched, and in that case the file is not rewritten.
    """
    # Read every cell as text and keep blanks as empty strings so the rewrite
    # changes only the header row. Letting pandas infer dtypes would reformat
    # values on the way back out (e.g. ".987" -> "0.987", or an integer
    # column containing blanks being written as "5.0").
    df = pd.read_csv(filepath, dtype=str, keep_default_na=False)
    present = {old: new for old, new in mapping.items() if old in df.columns}
    if present:
        df.rename(columns=present).to_csv(filepath, index=False)
    return [f"{old} -> {new}" for old, new in present.items()]


def main():
    """Apply the per-file column mapping to every file in DEFENSE_FILES."""
    print("Renaming defense CSV columns...")
    print("Column mappings:")
    for old, new in COLUMN_MAPPING.items():
        note = f" ({PITCHER_FILE} only)" if old in PITCHER_ONLY_COLUMNS else ""
        print(f" {old} -> {new}{note}")
    print()
    for filename in DEFENSE_FILES:
        filepath = os.path.join(DATA_DIR, filename)
        if not os.path.exists(filepath):
            print(f"⚠ Skipping {filename} (not found)")
            continue
        renamed = rename_columns_in_file(filepath, mapping_for_file(filename))
        print(f"{filename}: Renamed {len(renamed)} columns")
        for entry in renamed:
            print(f" {entry}")
    print("\n✓ All defense files processed successfully!")


# Guard the entry point so importing this module never touches the data files.
if __name__ == "__main__":
    main()

View File

@ -0,0 +1,56 @@
"""
Test script to verify retrosheet_data.py works with the new transformer.
"""
import sys
import logging
from retrosheet_data import get_base_batting_df, get_base_pitching_df, RETRO_FILE_PATH, EVENTS_FILENAME
def run_check(loader, file_path, start_date, end_date, player_label, stats_label):
    """Run one loader and print a short summary of what it returned.

    Args:
        loader: Callable invoked as ``loader(file_path, start_date, end_date)``
            and expected to return a ``(plays, stats)`` pair, where ``stats``
            supports ``len()`` and ``.head()``.
        file_path: Path passed through to ``loader``.
        start_date: Start of the date range, passed through to ``loader``.
        end_date: End of the date range, passed through to ``loader``.
        player_label: Plural noun used in the "Qualified ..." line.
        stats_label: Adjective used in the "Sample ... stats:" line.

    Returns:
        True when loading and printing completed; False when any exception
        was raised, in which case the error message is printed instead.
    """
    try:
        all_plays, stats = loader(file_path, start_date, end_date)
        print("✓ Success!")
        print(f" - Total plays loaded: {len(all_plays)}")
        print(f" - Qualified {player_label}: {len(stats)}")
        print(f"\nSample {stats_label} stats:")
        print(stats.head(3))
    except Exception as e:  # script boundary: report the failure and stop
        print(f"✗ Failed: {e}")
        return False
    return True


def main():
    """Load batting and pitching data for the 2005 date range.

    Returns:
        Process exit code: 0 when both loads succeed, 1 on the first failure.
    """
    # Set up logging
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    )
    print(f"\n{'='*60}")
    print("Testing Retrosheet Data Integration")
    print(f"{'='*60}\n")
    file_path = f'{RETRO_FILE_PATH}{EVENTS_FILENAME}'
    print(f"Loading from: {file_path}")
    # Test date range (full 2005 season)
    start_date = 20050403  # Opening Day 2005
    end_date = 20051003  # End of 2005 regular season
    print("\nTest 1: Loading Batting Data")
    print(f"Date range: {start_date} to {end_date}")
    if not run_check(get_base_batting_df, file_path, start_date, end_date,
                     "batters", "batting"):
        return 1
    print(f"\n{'-'*60}\n")
    print("Test 2: Loading Pitching Data")
    if not run_check(get_base_pitching_df, file_path, start_date, end_date,
                     "pitchers", "pitching"):
        return 1
    print(f"\n{'='*60}")
    print("All tests passed! ✓")
    print(f"{'='*60}\n")
    return 0


# Keep the work behind the entry-point guard: this file is named test_*.py,
# so a test runner may import it, and a module-level sys.exit() would abort
# that import.
if __name__ == "__main__":
    sys.exit(main())

48
scripts/undo_po_rename.py Normal file
View File

@ -0,0 +1,48 @@
"""
Script to undo PO -> pickoffs rename for position player defense files.
Keeps the rename for defense_p.csv (pitchers).
"""
import pandas as pd
import os
# Directory with defense files
DATA_DIR = 'data-input/2005 Live Cardset/'
# Position player defense files (NOT pitchers)
POSITION_DEFENSE_FILES = [
    'defense_c.csv',
    'defense_1b.csv',
    'defense_2b.csv',
    'defense_3b.csv',
    'defense_ss.csv',
    'defense_lf.csv',
    'defense_cf.csv',
    'defense_rf.csv',
    'defense_of.csv'
]


def restore_po_column(filepath):
    """Rename the 'pickoffs' header back to 'PO' in one CSV file.

    Args:
        filepath: Path of the CSV file to update in place.

    Returns:
        True when a 'pickoffs' column was found and renamed; False when the
        column is absent, in which case the file is not rewritten.
    """
    # Read every cell as text and keep blanks as empty strings so the rewrite
    # changes only the header row. Letting pandas infer dtypes would reformat
    # values on the way back out (e.g. ".987" -> "0.987", or an integer
    # column containing blanks being written as "5.0").
    df = pd.read_csv(filepath, dtype=str, keep_default_na=False)
    if 'pickoffs' not in df.columns:
        return False
    df.rename(columns={'pickoffs': 'PO'}).to_csv(filepath, index=False)
    return True


def main():
    """Undo the PO -> pickoffs rename in every position-player file."""
    print("Undoing PO -> pickoffs rename for position player defense files...")
    print("(Keeping pickoffs for defense_p.csv)")
    print()
    for filename in POSITION_DEFENSE_FILES:
        filepath = os.path.join(DATA_DIR, filename)
        if not os.path.exists(filepath):
            print(f"⚠ Skipping {filename} (not found)")
            continue
        if restore_po_column(filepath):
            print(f"{filename}: Renamed pickoffs -> PO")
        else:
            print(f"{filename}: Column 'pickoffs' not found")
    print("\n✓ Position player defense files updated!")
    print(" defense_p.csv still has 'pickoffs' column")


# Guard the entry point so importing this module never touches the data files.
if __name__ == "__main__":
    main()