# Listing metadata (from the code viewer this file was copied from): 254 lines, 8.7 KiB, Python
"""
|
|
Validation script for Retrosheet-based OF arm ratings
|
|
|
|
This script:
|
|
1. Loads 2005 Retrosheet events data
|
|
2. Calculates arm ratings using the new Retrosheet method
|
|
3. Compares against known strong/weak arms from 2005
|
|
4. Generates a detailed report
|
|
|
|
Usage:
|
|
python test_retrosheet_arms.py
|
|
"""
|
|
|
|
import pandas as pd

from defenders.retrosheet_arm_calculator import (
    calculate_of_arms_from_retrosheet,
    calculate_position_baselines,
    calculate_player_arm_rating,
)
|
|
|
|
|
|
def main(events_file='data-input/retrosheet/retrosheets_events_2005.csv'):
    """Run the arm-rating validation and print the full report to stdout.

    The report has these sections, in order: rating distribution, the 20
    lowest ratings, the 15 highest ratings, a spot check of known strong
    arms, a per-position breakdown, and a detailed dump for every player
    rated -3 or lower.  Lower ratings are treated as stronger arms
    throughout (the report labels <= -3 "elite" and >= +2 "weak").

    Args:
        events_file: Path of the Retrosheet events CSV to load.  Defaults
            to the 2005 season file used by the original script.
    """
    heavy_rule = "=" * 80
    light_rule = "-" * 80
    elite_cutoff = -3  # ratings at or below this are reported as elite

    print(heavy_rule)
    print("RETROSHEET ARM RATING VALIDATION - 2005 SEASON")
    print(heavy_rule)
    print()

    # Load Retrosheet events
    print("Loading Retrosheet events data...")
    df_events = pd.read_csv(events_file, low_memory=False)
    print(f"Loaded {len(df_events):,} events")
    print()

    # Calculate arm ratings
    # NOTE(review): the result is used like a dict of player_id -> integer
    # rating (.items(), .values(), .get()); confirm against the calculator.
    print("Calculating arm ratings for all outfielders...")
    arm_ratings = calculate_of_arms_from_retrosheet(df_events, season_pct=1.0)
    print(f"Calculated ratings for {len(arm_ratings)} outfielders")
    print()

    # Distribution summary: one line per distinct rating with a text histogram
    print("ARM RATING DISTRIBUTION")
    print(light_rule)
    ratings_series = pd.Series(list(arm_ratings.values()))
    total_rated = len(ratings_series)
    for rating in sorted(ratings_series.unique()):
        count = (ratings_series == rating).sum()
        pct = count / total_rated * 100
        stars = '*' * int(pct / 2)  # one star per two percentage points
        print(f"Rating {rating:+2d}: {count:3d} players ({pct:5.1f}%) {stars}")
    print()

    # Show top and bottom performers.  Sorting ascending puts the most
    # negative (strongest) ratings first and the weakest last.
    print("TOP 20 ARM RATINGS (Elite Arms)")
    print(light_rule)
    sorted_players = sorted(arm_ratings.items(), key=lambda item: item[1])

    for i, (player_id, rating) in enumerate(sorted_players[:20], 1):
        player_stats = get_player_arm_stats(df_events, player_id)
        print(f"{i:2d}. {player_id:12s} Rating: {rating:+2d} "
              f"Assists: {player_stats['assists']:2d} "
              f"Home Throws: {player_stats['home_throws']:2d} "
              f"Assist Rate: {player_stats['assist_rate']:.2f}% "
              f"Balls: {player_stats['balls_fielded']:3d}")
    print()

    print("BOTTOM 15 ARM RATINGS (Weak Arms)")
    print(light_rule)
    for i, (player_id, rating) in enumerate(sorted_players[-15:], 1):
        player_stats = get_player_arm_stats(df_events, player_id)
        print(f"{i:2d}. {player_id:12s} Rating: {rating:+2d} "
              f"Assists: {player_stats['assists']:2d} "
              f"Assist Rate: {player_stats['assist_rate']:.2f}% "
              f"Balls: {player_stats['balls_fielded']:3d}")
    print()

    # Known strong arms to validate
    print("VALIDATION: Known Strong Arms (2005)")
    print(light_rule)
    known_strong_arms = [
        ('suzui001', 'Ichiro Suzuki', 'RF', 'Gold Glove, legendary arm'),
        ('crawc002', 'Carl Crawford', 'LF', 'Multiple stolen base deterrent'),
        ('edmoj001', 'Jim Edmonds', 'CF', 'Gold Glove, strong arm'),
        ('guerv001', 'Vladimir Guerrero', 'RF', 'Feared arm'),
    ]

    for player_id, name, pos, notes in known_strong_arms:
        # None (not a magic string) marks a player missing from the ratings.
        rating = arm_ratings.get(player_id)
        if rating is None:
            print(f"{name:20s} ({pos}): NOT FOUND IN DATA - Check player_id")
            continue
        stats = get_player_arm_stats(df_events, player_id)
        print(f"{name:20s} ({pos}): Rating {rating:+2d} "
              f"Assists: {stats['assists']:2d} "
              f"Note: {notes}")
    print()

    # Position breakdown
    print("ARM RATINGS BY POSITION")
    print(light_rule)
    position_stats = get_position_breakdown(df_events, arm_ratings)
    for pos, stats in position_stats.items():
        print(f"\n{pos}:")
        print(f" Total Players: {stats['count']}")
        print(f" Average Rating: {stats['avg_rating']:.2f}")
        print(f" Std Dev: {stats['std_rating']:.2f}")
        print(f" Elite Arms (≤-3): {stats['elite_count']}")
        print(f" Weak Arms (≥+2): {stats['weak_count']}")
    print()

    # Detailed player report, strongest arms first
    print("DETAILED PLAYER REPORT (Elite Arms Only)")
    print(light_rule)
    elite_players = sorted(
        ((p, r) for p, r in arm_ratings.items() if r <= elite_cutoff),
        key=lambda item: item[1],
    )

    for player_id, rating in elite_players:
        print(f"\nPlayer: {player_id} (Rating: {rating:+2d})")
        detailed_stats = get_detailed_player_stats(df_events, player_id)
        for key, value in detailed_stats.items():
            print(f" {key:20s}: {value}")

    print("\n" + heavy_rule)
    print("VALIDATION COMPLETE")
    print(heavy_rule)
|
|
|
|
|
|
def get_player_arm_stats(df_events, player_id):
    """Get basic arm stats for a player across all OF positions.

    Rows are attributed to the player through the fielder columns ``f7``,
    ``f8`` and ``f9``; the matching ``aN`` / ``poN`` columns are read as
    per-event assist and putout counts for that fielder slot.

    Args:
        df_events: Events DataFrame with columns f7-f9, a7-a9, po7-po9 and
            brout1-brout3.
        player_id: Value to match in the fielder columns.

    Returns:
        dict with int counts ``assists``, ``home_throws`` and
        ``balls_fielded`` (summed over the three slots) and float
        ``assist_rate`` = assists / balls_fielded * 100, or 0.0 when the
        player fielded nothing.
    """
    stats = {
        'assists': 0,
        'home_throws': 0,
        'balls_fielded': 0,
        'assist_rate': 0.0,
    }

    for fielder_num in (7, 8, 9):
        a_col = f'a{fielder_num}'
        po_col = f'po{fielder_num}'
        f_col = f'f{fielder_num}'

        player_plays = df_events[df_events[f_col] == player_id]
        if player_plays.empty:
            continue

        # Build each boolean mask once and count True values, instead of
        # materialising a filtered frame per statistic.
        has_assist = player_plays[a_col] > 0
        has_putout = player_plays[po_col] > 0
        # NOTE(review): brout1-3 presumably hold the fielder number credited
        # with retiring the runner from each base - confirm against the
        # events schema.  The 'home_throws' name is kept for callers.
        runner_out = (
            (player_plays['brout1'] == fielder_num)
            | (player_plays['brout2'] == fielder_num)
            | (player_plays['brout3'] == fielder_num)
        )

        # int() keeps the totals plain Python ints for ':d' formatting.
        stats['assists'] += int(has_assist.sum())
        stats['home_throws'] += int((has_assist & runner_out).sum())
        stats['balls_fielded'] += int((has_putout | has_assist).sum())

    if stats['balls_fielded'] > 0:
        stats['assist_rate'] = stats['assists'] / stats['balls_fielded'] * 100

    return stats
|
|
|
|
|
|
def get_detailed_player_stats(df_events, player_id, min_balls=50):
    """Get detailed breakdown of arm-related plays, per outfield position.

    Args:
        df_events: Events DataFrame with columns f7-f9, a7-a9, po7-po9,
            brout1-brout3 and brout_b.
        player_id: Value to match in the fielder columns.
        min_balls: Minimum balls fielded (events with a putout or an
            assist) at a position for that position to be reported.
            Defaults to 50, the cutoff the script originally hard-coded.

    Returns:
        dict keyed by ``"<POS> <Stat>"`` (POS is LF, CF or RF) holding int
        counts plus a formatted ``"<POS> Assist Rate"`` string.  Positions
        where the player never appears, or falls below ``min_balls``, are
        omitted; the dict is empty if no position qualifies.
    """
    detailed = {}

    for pos_name, fielder_num in (('LF', 7), ('CF', 8), ('RF', 9)):
        a_col = f'a{fielder_num}'
        po_col = f'po{fielder_num}'
        f_col = f'f{fielder_num}'

        player_plays = df_events[df_events[f_col] == player_id]
        if player_plays.empty:
            continue

        has_assist = player_plays[a_col] > 0
        balls_fielded = int(((player_plays[po_col] > 0) | has_assist).sum())
        if balls_fielded < min_balls:
            continue

        # NOTE(review): brout1-3 / brout_b presumably hold the fielder number
        # credited with retiring the runner from each base / the batter -
        # confirm against the events schema.
        runner_out = (
            (player_plays['brout1'] == fielder_num)
            | (player_plays['brout2'] == fielder_num)
            | (player_plays['brout3'] == fielder_num)
        )
        batter_out = player_plays['brout_b'] == fielder_num

        assists = int(has_assist.sum())
        throwouts = int((has_assist & (runner_out | batter_out)).sum())
        home_throws = int((has_assist & runner_out).sum())
        batter_extra = int((has_assist & batter_out).sum())

        detailed[f'{pos_name} Balls Fielded'] = balls_fielded
        detailed[f'{pos_name} Total Assists'] = assists
        detailed[f'{pos_name} Throwouts'] = throwouts
        detailed[f'{pos_name} Home Throws'] = home_throws
        detailed[f'{pos_name} Batter Extra Outs'] = batter_extra
        # The zero guard matters when min_balls is 0 and nothing was fielded.
        detailed[f'{pos_name} Assist Rate'] = (
            f"{assists/balls_fielded*100:.2f}%" if balls_fielded > 0 else "N/A"
        )

    return detailed
|
|
|
|
|
|
def get_position_breakdown(df_events, arm_ratings, min_balls=50):
    """Get statistical breakdown of arm ratings by outfield position.

    A player counts toward a position when they appear in that position's
    fielder column, have an entry in ``arm_ratings``, and fielded at least
    ``min_balls`` balls there (events with a putout or an assist).

    Args:
        df_events: Events DataFrame with columns f7-f9, a7-a9 and po7-po9.
        arm_ratings: Mapping of player id to numeric arm rating.
        min_balls: Qualification cutoff per position.  Defaults to 50, the
            value the script originally hard-coded.

    Returns:
        dict keyed by 'LF' / 'CF' / 'RF' (positions with no qualifying
        player are omitted).  Each value is a dict with ``count``,
        ``avg_rating``, ``std_rating`` (pandas sample std, so NaN for a
        single player), ``elite_count`` (rating <= -3) and ``weak_count``
        (rating >= 2).
    """
    positions = {}

    for pos_name, fielder_num in (('LF', 7), ('CF', 8), ('RF', 9)):
        f_col = f'f{fielder_num}'
        a_col = f'a{fielder_num}'
        po_col = f'po{fielder_num}'

        # Count balls fielded for every fielder in one pass, rather than
        # re-scanning the whole events frame once per fielder.
        fielded = (df_events[po_col] > 0) | (df_events[a_col] > 0)
        balls_by_fielder = df_events.loc[fielded, f_col].value_counts().to_dict()

        # Walk fielders in first-appearance order, as the original loop did.
        pos_ratings = [
            arm_ratings[fielder]
            for fielder in df_events[f_col].dropna().unique()
            if fielder in arm_ratings
            and balls_by_fielder.get(fielder, 0) >= min_balls
        ]
        if not pos_ratings:
            continue

        ratings = pd.Series(pos_ratings)
        positions[pos_name] = {
            'count': len(pos_ratings),
            'avg_rating': ratings.mean(),
            'std_rating': ratings.std(),
            'elite_count': sum(1 for r in pos_ratings if r <= -3),
            'weak_count': sum(1 for r in pos_ratings if r >= 2),
        }

    return positions
|
|
|
|
|
|
if __name__ == '__main__':
    # Run the validation report only when executed as a script, not on import.
    main()
|