paper-dynasty-card-creation/test_retrosheet_arms.py
2025-11-23 01:28:33 -06:00

254 lines
8.7 KiB
Python

"""
Validation script for Retrosheet-based OF arm ratings
This script:
1. Loads 2005 Retrosheet events data
2. Calculates arm ratings using the new Retrosheet method
3. Compares against known strong/weak arms from 2005
4. Generates a detailed report
Usage:
python test_retrosheet_arms.py
"""
import pandas as pd
from defenders.retrosheet_arm_calculator import (
calculate_of_arms_from_retrosheet,
calculate_position_baselines,
calculate_player_arm_rating
)
def main():
    """Print the 2005 Retrosheet outfield arm-rating validation report.

    Reads the 2005 events CSV, asks ``calculate_of_arms_from_retrosheet`` for
    a ``{player_id: rating}`` mapping, and then writes these sections to
    stdout: rating distribution, the 20 lowest ratings, the 15 highest
    ratings, a spot check of hand-picked players, a per-position summary and
    a per-player breakdown for every rating of -3 or lower.

    Lower ratings are presented as the stronger arms throughout (the "top"
    list is the low end of the ascending sort).
    """
    heavy_rule = "=" * 80
    light_rule = "-" * 80

    print(heavy_rule)
    print("RETROSHEET ARM RATING VALIDATION - 2005 SEASON")
    print(heavy_rule)
    print()

    # --- Input -----------------------------------------------------------
    print("Loading Retrosheet events data...")
    df_events = pd.read_csv(
        'data-input/retrosheet/retrosheets_events_2005.csv',
        low_memory=False,
    )
    print(f"Loaded {len(df_events):,} events")
    print()

    # --- Ratings ---------------------------------------------------------
    print("Calculating arm ratings for all outfielders...")
    arm_ratings = calculate_of_arms_from_retrosheet(df_events, season_pct=1.0)
    print(f"Calculated ratings for {len(arm_ratings)} outfielders")
    print()

    # --- Distribution: one line per distinct rating, with a text bar -----
    print("ARM RATING DISTRIBUTION")
    print(light_rule)
    all_ratings = pd.Series(list(arm_ratings.values()))
    total_rated = len(all_ratings)
    for value in sorted(all_ratings.unique()):
        n_players = (all_ratings == value).sum()
        share = n_players / total_rated * 100
        bar = '*' * int(share / 2)  # one star per two percentage points
        print(f"Rating {value:+2d}: {n_players:3d} players ({share:5.1f}%) {bar}")
    print()

    # Ascending by rating: the front of the list is the elite end and the
    # back of the list is the weak end.
    ranked = sorted(arm_ratings.items(), key=lambda entry: entry[1])

    print("TOP 20 ARM RATINGS (Elite Arms)")
    print(light_rule)
    for rank, (player_id, rating) in enumerate(ranked[:20], start=1):
        arm = get_player_arm_stats(df_events, player_id)
        print(f"{rank:2d}. {player_id:12s} Rating: {rating:+2d} "
              f"Assists: {arm['assists']:2d} "
              f"Home Throws: {arm['home_throws']:2d} "
              f"Assist Rate: {arm['assist_rate']:.2f}% "
              f"Balls: {arm['balls_fielded']:3d}")
    print()

    print("BOTTOM 15 ARM RATINGS (Weak Arms)")
    print(light_rule)
    for rank, (player_id, rating) in enumerate(ranked[-15:], start=1):
        arm = get_player_arm_stats(df_events, player_id)
        print(f"{rank:2d}. {player_id:12s} Rating: {rating:+2d} "
              f"Assists: {arm['assists']:2d} "
              f"Assist Rate: {arm['assist_rate']:.2f}% "
              f"Balls: {arm['balls_fielded']:3d}")
    print()

    # --- Spot check against hand-picked players --------------------------
    print("VALIDATION: Known Strong Arms (2005)")
    print(light_rule)
    known_strong_arms = [
        ('suzui001', 'Ichiro Suzuki', 'RF', 'Gold Glove, legendary arm'),
        ('crawc002', 'Carl Crawford', 'LF', 'Multiple stolen base deterrent'),
        ('edmoj001', 'Jim Edmonds', 'CF', 'Gold Glove, strong arm'),
        ('guerv001', 'Vladimir Guerrero', 'RF', 'Feared arm'),
    ]
    for player_id, name, pos, notes in known_strong_arms:
        if player_id not in arm_ratings:
            print(f"{name:20s} ({pos}): NOT FOUND IN DATA - Check player_id")
            continue
        rating = arm_ratings[player_id]
        arm = get_player_arm_stats(df_events, player_id)
        print(f"{name:20s} ({pos}): Rating {rating:+2d} "
              f"Assists: {arm['assists']:2d} "
              f"Note: {notes}")
    print()

    # --- Per-position summary --------------------------------------------
    print("ARM RATINGS BY POSITION")
    print(light_rule)
    by_position = get_position_breakdown(df_events, arm_ratings)
    for pos, summary in by_position.items():
        print(f"\n{pos}:")
        print(f" Total Players: {summary['count']}")
        print(f" Average Rating: {summary['avg_rating']:.2f}")
        print(f" Std Dev: {summary['std_rating']:.2f}")
        print(f" Elite Arms (≤-3): {summary['elite_count']}")
        print(f" Weak Arms (≥+2): {summary['weak_count']}")
    print()

    # --- Per-player detail for elite arms --------------------------------
    print("DETAILED PLAYER REPORT (Elite Arms Only)")
    print(light_rule)
    # `ranked` is already a stable ascending sort, so filtering it yields the
    # elite players in the same order as filtering first and sorting after.
    elite = [entry for entry in ranked if entry[1] <= -3]
    for player_id, rating in elite:
        print(f"\nPlayer: {player_id} (Rating: {rating:+2d})")
        breakdown = get_detailed_player_stats(df_events, player_id)
        for label, value in breakdown.items():
            print(f" {label:20s}: {value}")

    print("\n" + heavy_rule)
    print("VALIDATION COMPLETE")
    print(heavy_rule)
def get_player_arm_stats(df_events, player_id):
    """Total a player's arm numbers over the three outfield positions.

    For each of positions 7, 8 and 9 the rows whose ``f<N>`` column equals
    ``player_id`` are selected, and from those rows:

    * ``assists``       - rows with ``a<N> > 0``
    * ``balls_fielded`` - rows with ``po<N> > 0`` or ``a<N> > 0``
    * ``home_throws``   - assist rows where ``brout1``, ``brout2`` or
      ``brout3`` equals the position number (presumably a runner retired by
      this fielder; nothing here checks which base the out was made at, so
      confirm the meaning against the event-file field definitions)

    Args:
        df_events: Events DataFrame holding the ``f``/``a``/``po`` columns
            for positions 7-9 and ``brout1``..``brout3``.
        player_id: Identifier to look up in the ``f7``/``f8``/``f9`` columns.

    Returns:
        dict with integer ``assists``, ``home_throws`` and ``balls_fielded``
        plus ``assist_rate`` (assists per ball fielded as a percentage;
        ``0.0`` when no balls were fielded).
    """
    total_assists = 0
    total_home = 0
    total_balls = 0

    for digit in ('7', '8', '9'):
        rows = df_events[df_events[f'f{digit}'] == player_id]
        if len(rows) == 0:
            # Player never appeared at this position; nothing to add.
            continue

        position_number = int(digit)
        made_assist = rows[f'a{digit}'] > 0
        made_putout = rows[f'po{digit}'] > 0
        runner_out = (
            (rows['brout1'] == position_number)
            | (rows['brout2'] == position_number)
            | (rows['brout3'] == position_number)
        )

        total_assists += len(rows[made_assist])
        total_home += len(rows[made_assist & runner_out])
        total_balls += len(rows[made_putout | made_assist])

    if total_balls > 0:
        rate = total_assists / total_balls * 100
    else:
        rate = 0.0

    return {
        'assists': total_assists,
        'home_throws': total_home,
        'balls_fielded': total_balls,
        'assist_rate': rate,
    }
def get_detailed_player_stats(df_events, player_id, min_balls_fielded=50):
    """Break a player's arm-related plays down by outfield position.

    For each of LF (7), CF (8) and RF (9) the rows whose ``f<N>`` column
    equals ``player_id`` are selected. A position is reported only when the
    player has rows there and fielded at least ``min_balls_fielded`` balls
    (rows with ``po<N> > 0`` or ``a<N> > 0``).

    Args:
        df_events: Events DataFrame holding the ``f``/``a``/``po`` columns
            for positions 7-9 plus ``brout1``, ``brout2``, ``brout3`` and
            ``brout_b``.
        player_id: Identifier to look up in the ``f7``/``f8``/``f9`` columns.
        min_balls_fielded: Minimum balls fielded at a position for it to be
            included. Defaults to 50, the threshold this report has always
            used.

    Returns:
        dict keyed by ``"<POS> <Label>"`` with, per reported position:
        ``Balls Fielded``, ``Total Assists``, ``Throwouts`` (assist rows
        where any of ``brout1``/``brout2``/``brout3``/``brout_b`` equals the
        position number), ``Home Throws`` (same, without ``brout_b``),
        ``Batter Extra Outs`` (assist rows where ``brout_b`` equals the
        position number) and ``Assist Rate`` (a ``"12.34%"`` string, or
        ``"N/A"`` when no balls were fielded). Empty when no position
        qualifies.

    NOTE(review): "Home Throws" is computed purely from the brout1-3
    columns; nothing here checks the base the out was recorded at. Confirm
    the label against the event-file field definitions.
    """
    detailed = {}
    for pos_name, a_col, po_col, f_col in [
        ('LF', 'a7', 'po7', 'f7'),
        ('CF', 'a8', 'po8', 'f8'),
        ('RF', 'a9', 'po9', 'f9'),
    ]:
        player_plays = df_events[df_events[f_col] == player_id]
        if len(player_plays) == 0:
            continue

        made_assist = player_plays[a_col] > 0
        balls_fielded = len(
            player_plays[(player_plays[po_col] > 0) | made_assist]
        )
        if balls_fielded < min_balls_fielded:
            continue

        # Build each mask once; the three counts below are combinations of
        # the same conditions.
        fielder_num = int(a_col[-1])
        runner_out = (
            (player_plays['brout1'] == fielder_num)
            | (player_plays['brout2'] == fielder_num)
            | (player_plays['brout3'] == fielder_num)
        )
        batter_out = player_plays['brout_b'] == fielder_num

        assists = len(player_plays[made_assist])
        throwouts = len(player_plays[made_assist & (runner_out | batter_out)])
        home_throws = len(player_plays[made_assist & runner_out])
        batter_extra = len(player_plays[made_assist & batter_out])

        detailed[f'{pos_name} Balls Fielded'] = balls_fielded
        detailed[f'{pos_name} Total Assists'] = assists
        detailed[f'{pos_name} Throwouts'] = throwouts
        detailed[f'{pos_name} Home Throws'] = home_throws
        detailed[f'{pos_name} Batter Extra Outs'] = batter_extra
        # Reachable only when the caller lowers the threshold to 0.
        detailed[f'{pos_name} Assist Rate'] = (
            f"{assists/balls_fielded*100:.2f}%" if balls_fielded > 0 else "N/A"
        )
    return detailed
def get_position_breakdown(df_events, arm_ratings, min_balls_fielded=50):
    """Summarise arm ratings separately for LF, CF and RF.

    A player counts toward a position when they appear in that position's
    ``f<N>`` column, have an entry in ``arm_ratings``, and fielded at least
    ``min_balls_fielded`` balls there (rows with ``po<N> > 0`` or
    ``a<N> > 0``). A player who qualifies at several positions contributes
    the same rating to each of them.

    Args:
        df_events: Events DataFrame holding the ``f``/``a``/``po`` columns
            for positions 7-9.
        arm_ratings: Mapping of player id to arm rating.
        min_balls_fielded: Minimum balls fielded at a position to qualify.
            Defaults to 50, the threshold this report has always used.

    Returns:
        dict keyed by ``'LF'``/``'CF'``/``'RF'`` (positions with no
        qualifying player are omitted). Each value holds ``count``,
        ``avg_rating``, ``std_rating``, ``elite_count`` (ratings <= -3) and
        ``weak_count`` (ratings >= 2). ``std_rating`` is pandas' default
        sample standard deviation, so it is NaN when only one player
        qualifies.
    """
    positions = {}
    for pos_name, f_col in [('LF', 'f7'), ('CF', 'f8'), ('RF', 'f9')]:
        pos_digit = f_col[-1]
        a_col = f'a{pos_digit}'
        po_col = f'po{pos_digit}'

        # Count balls fielded for every fielder at this position in a single
        # pass instead of re-filtering the whole events frame per fielder.
        handled = (df_events[po_col] > 0) | (df_events[a_col] > 0)
        balls_by_fielder = df_events.loc[handled, f_col].value_counts()

        # Iterate in first-appearance order, as before, so the ratings list
        # is built in the same order.
        pos_ratings = [
            arm_ratings[fielder]
            for fielder in df_events[f_col].dropna().unique()
            if fielder in arm_ratings
            and balls_by_fielder.get(fielder, 0) >= min_balls_fielded
        ]

        if pos_ratings:
            ratings = pd.Series(pos_ratings)
            positions[pos_name] = {
                'count': len(pos_ratings),
                'avg_rating': ratings.mean(),
                'std_rating': ratings.std(),
                'elite_count': sum(1 for r in pos_ratings if r <= -3),
                'weak_count': sum(1 for r in pos_ratings if r >= 2),
            }
    return positions
# Run the validation report only when this file is executed as a script,
# not when it is imported.
if __name__ == '__main__':
    main()