# Standardize formatting with black and apply ruff auto-fixes.
# Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
# 266 lines, 8.9 KiB, Python
"""
|
|
Validation script for Retrosheet-based outfield (OF) arm ratings.

This script:
1. Loads 2005 Retrosheet events data
2. Calculates arm ratings using the new Retrosheet method
3. Compares the results against known strong arms from 2005
4. Generates a detailed report

Usage:
    python test_retrosheet_arms.py
|
|
"""
|
|
|
|
import pandas as pd
|
|
from defenders.retrosheet_arm_calculator import (
|
|
calculate_of_arms_from_retrosheet,
|
|
)
|
|
|
|
|
|
def main():
    """Run the 2005 outfield arm-rating validation and print a text report.

    Reads the Retrosheet events CSV from a fixed relative path, computes one
    rating per player with ``calculate_of_arms_from_retrosheet`` and prints,
    in order: the rating distribution, the 20 lowest and 15 highest ratings,
    a spot check of four hand-picked player ids, a per-position summary, and
    a detailed breakdown for every player rated -3 or lower.

    The report labels low (negative) ratings as elite arms and high ratings
    as weak arms, so the "top" list holds the smallest values.
    """
    heavy_rule = "=" * 80
    light_rule = "-" * 80

    print(heavy_rule)
    print("RETROSHEET ARM RATING VALIDATION - 2005 SEASON")
    print(heavy_rule)
    print()

    # --- Load the play-by-play events -------------------------------------
    print("Loading Retrosheet events data...")
    events_file = "data-input/retrosheet/retrosheets_events_2005.csv"
    df_events = pd.read_csv(events_file, low_memory=False)
    print(f"Loaded {len(df_events):,} events")
    print()

    # --- Rate every outfielder --------------------------------------------
    print("Calculating arm ratings for all outfielders...")
    arm_ratings = calculate_of_arms_from_retrosheet(df_events, season_pct=1.0)
    print(f"Calculated ratings for {len(arm_ratings)} outfielders")
    print()

    # --- Histogram of rating values ---------------------------------------
    print("ARM RATING DISTRIBUTION")
    print(light_rule)
    all_ratings = pd.Series(list(arm_ratings.values()))
    for value in sorted(all_ratings.unique()):
        n_players = (all_ratings == value).sum()
        share = n_players / len(all_ratings) * 100
        # One star per two percentage points.
        bar = "*" * int(share / 2)
        print(f"Rating {value:+2d}: {n_players:3d} players ({share:5.1f}%) {bar}")
    print()

    # --- Both ends of the ranking -----------------------------------------
    print("TOP 20 ARM RATINGS (Elite Arms)")
    print(light_rule)
    # Ascending order: the most negative (elite) ratings come first.
    by_rating = sorted(arm_ratings.items(), key=lambda entry: entry[1])

    for rank, (player_id, rating) in enumerate(by_rating[:20], 1):
        arm = get_player_arm_stats(df_events, player_id)
        line = (
            f"{rank:2d}. {player_id:12s} Rating: {rating:+2d} "
            f"Assists: {arm['assists']:2d} "
            f"Home Throws: {arm['home_throws']:2d} "
            f"Assist Rate: {arm['assist_rate']:.2f}% "
            f"Balls: {arm['balls_fielded']:3d}"
        )
        print(line)
    print()

    print("BOTTOM 15 ARM RATINGS (Weak Arms)")
    print(light_rule)
    for rank, (player_id, rating) in enumerate(by_rating[-15:], 1):
        arm = get_player_arm_stats(df_events, player_id)
        line = (
            f"{rank:2d}. {player_id:12s} Rating: {rating:+2d} "
            f"Assists: {arm['assists']:2d} "
            f"Assist Rate: {arm['assist_rate']:.2f}% "
            f"Balls: {arm['balls_fielded']:3d}"
        )
        print(line)
    print()

    # --- Spot check against hand-picked players ---------------------------
    print("VALIDATION: Known Strong Arms (2005)")
    print(light_rule)
    known_strong_arms = [
        ("suzui001", "Ichiro Suzuki", "RF", "Gold Glove, legendary arm"),
        ("crawc002", "Carl Crawford", "LF", "Multiple stolen base deterrent"),
        ("edmoj001", "Jim Edmonds", "CF", "Gold Glove, strong arm"),
        ("guerv001", "Vladimir Guerrero", "RF", "Feared arm"),
    ]

    for player_id, name, pos, notes in known_strong_arms:
        rating = arm_ratings.get(player_id, "NOT FOUND")
        if rating == "NOT FOUND":
            print(f"{name:20s} ({pos}): NOT FOUND IN DATA - Check player_id")
            continue

        arm = get_player_arm_stats(df_events, player_id)
        print(
            f"{name:20s} ({pos}): Rating {rating:+2d} "
            f"Assists: {arm['assists']:2d} "
            f"Note: {notes}"
        )
    print()

    # --- Per-position summary ---------------------------------------------
    print("ARM RATINGS BY POSITION")
    print(light_rule)
    position_stats = get_position_breakdown(df_events, arm_ratings)
    for pos, summary in position_stats.items():
        print(f"\n{pos}:")
        print(f" Total Players: {summary['count']}")
        print(f" Average Rating: {summary['avg_rating']:.2f}")
        print(f" Std Dev: {summary['std_rating']:.2f}")
        print(f" Elite Arms (≤-3): {summary['elite_count']}")
        print(f" Weak Arms (≥+2): {summary['weak_count']}")
    print()

    # --- Detailed lines for the elite group -------------------------------
    print("DETAILED PLAYER REPORT (Elite Arms Only)")
    print(light_rule)
    elite_players = sorted(
        (entry for entry in arm_ratings.items() if entry[1] <= -3),
        key=lambda entry: entry[1],
    )

    for player_id, rating in elite_players:
        print(f"\nPlayer: {player_id} (Rating: {rating:+2d})")
        breakdown = get_detailed_player_stats(df_events, player_id)
        for label, value in breakdown.items():
            print(f" {label:20s}: {value}")

    print("\n" + heavy_rule)
    print("VALIDATION COMPLETE")
    print(heavy_rule)
|
|
|
|
|
|
def get_player_arm_stats(df_events, player_id):
    """Total a player's arm-related event counts over LF, CF and RF.

    For each outfield position number ``n`` in 7, 8, 9 the rows where column
    ``f<n>`` equals ``player_id`` are selected, and within them:

    * ``assists``       - rows with ``a<n> > 0``
    * ``balls_fielded`` - rows with ``po<n> > 0`` or ``a<n> > 0``
    * ``home_throws``   - assist rows where ``brout1``, ``brout2`` or
      ``brout3`` equals ``n``

    Returns a dict with those three summed counts plus ``assist_rate``
    (assists per ball fielded, as a percentage; 0.0 when nothing was fielded).
    """
    total_assists = 0
    total_home = 0
    total_balls = 0

    for fielder_num in (7, 8, 9):
        at_position = df_events[df_events[f"f{fielder_num}"] == player_id]
        if len(at_position) == 0:
            continue

        has_assist = at_position[f"a{fielder_num}"] > 0
        has_putout = at_position[f"po{fielder_num}"] > 0
        runner_out = (
            (at_position["brout1"] == fielder_num)
            | (at_position["brout2"] == fielder_num)
            | (at_position["brout3"] == fielder_num)
        )

        # Summing a boolean mask counts the rows it selects.
        total_assists += int(has_assist.sum())
        total_home += int((has_assist & runner_out).sum())
        total_balls += int((has_putout | has_assist).sum())

    rate = 0.0
    if total_balls > 0:
        rate = total_assists / total_balls * 100

    return {
        "assists": total_assists,
        "home_throws": total_home,
        "balls_fielded": total_balls,
        "assist_rate": rate,
    }
|
|
|
|
|
|
def get_detailed_player_stats(df_events, player_id, min_balls=50):
    """Build a per-position breakdown of a player's arm-related plays.

    For each outfield position (LF=7, CF=8, RF=9) the rows where ``f<n>``
    equals ``player_id`` are examined. A position is reported only if the
    player has at least ``min_balls`` rows there with ``po<n> > 0`` or
    ``a<n> > 0``.

    Args:
        df_events: Events frame with columns ``f7``-``f9``, ``po7``-``po9``,
            ``a7``-``a9``, ``brout1``-``brout3`` and ``brout_b``.
        player_id: Value to match in the ``f<n>`` columns.
        min_balls: Minimum balls fielded at a position for it to be
            included. Defaults to 50, the previously hard-coded cut-off.

    Returns:
        Dict keyed by ``"<POS> <label>"`` in insertion order, with labels
        ``Balls Fielded``, ``Total Assists``, ``Throwouts``, ``Home Throws``,
        ``Batter Extra Outs`` (ints) and ``Assist Rate`` (a ``"x.xx%"``
        string, or ``"N/A"`` when no balls were fielded, which can only
        happen with ``min_balls <= 0``). Empty if no position qualifies.
    """
    detailed = {}

    for pos_name, fielder_num in (("LF", 7), ("CF", 8), ("RF", 9)):
        player_plays = df_events[df_events[f"f{fielder_num}"] == player_id]
        if len(player_plays) == 0:
            continue

        made_assist = player_plays[f"a{fielder_num}"] > 0
        made_putout = player_plays[f"po{fielder_num}"] > 0
        balls_fielded = int((made_putout | made_assist).sum())

        # Skip positions with too small a sample to be meaningful.
        if balls_fielded < min_balls:
            continue

        # Rows where one of the three runner-out columns names this fielder.
        runner_out = (
            (player_plays["brout1"] == fielder_num)
            | (player_plays["brout2"] == fielder_num)
            | (player_plays["brout3"] == fielder_num)
        )
        # Rows where the batter-out column names this fielder.
        batter_out = player_plays["brout_b"] == fielder_num

        assists = int(made_assist.sum())
        # A row can match both masks, so OR them before counting.
        throwouts = int((made_assist & (runner_out | batter_out)).sum())
        home_throws = int((made_assist & runner_out).sum())
        batter_extra = int((made_assist & batter_out).sum())

        if balls_fielded > 0:
            assist_rate = f"{assists/balls_fielded*100:.2f}%"
        else:
            assist_rate = "N/A"

        detailed[f"{pos_name} Balls Fielded"] = balls_fielded
        detailed[f"{pos_name} Total Assists"] = assists
        detailed[f"{pos_name} Throwouts"] = throwouts
        detailed[f"{pos_name} Home Throws"] = home_throws
        detailed[f"{pos_name} Batter Extra Outs"] = batter_extra
        detailed[f"{pos_name} Assist Rate"] = assist_rate

    return detailed
|
|
|
|
|
|
def get_position_breakdown(df_events, arm_ratings, min_balls=50):
    """Summarise arm ratings separately for LF, CF and RF.

    A fielder counts toward a position when they appear in that position's
    ``f<n>`` column, have an entry in ``arm_ratings``, and have at least
    ``min_balls`` rows there with ``po<n> > 0`` or ``a<n> > 0``.

    Args:
        df_events: Events frame with columns ``f7``-``f9``, ``po7``-``po9``
            and ``a7``-``a9``.
        arm_ratings: Mapping of player id to numeric rating.
        min_balls: Minimum balls fielded at a position to qualify there.
            Defaults to 50, the previously hard-coded cut-off.

    Returns:
        Dict keyed by position name (``"LF"``, ``"CF"``, ``"RF"``); positions
        with no qualifying fielder are omitted. Each value holds ``count``,
        ``avg_rating``, ``std_rating`` (pandas sample std, NaN for a single
        player), ``elite_count`` (ratings <= -3) and ``weak_count``
        (ratings >= 2).
    """
    positions = {}

    for pos_name, fielder_num in (("LF", 7), ("CF", 8), ("RF", 9)):
        f_col = f"f{fielder_num}"

        # Count qualifying rows for every fielder in one pass, not by
        # re-filtering the whole frame once per fielder.
        involved = (df_events[f"po{fielder_num}"] > 0) | (
            df_events[f"a{fielder_num}"] > 0
        )
        balls_by_fielder = df_events.loc[involved, f_col].value_counts().to_dict()

        # Iterate in first-appearance order so the rating list (and hence the
        # floating-point mean/std) is built in the same order as before.
        pos_ratings = [
            arm_ratings[fielder]
            for fielder in df_events[f_col].dropna().unique()
            if fielder in arm_ratings
            and balls_by_fielder.get(fielder, 0) >= min_balls
        ]
        if not pos_ratings:
            continue

        ratings = pd.Series(pos_ratings)
        positions[pos_name] = {
            "count": len(pos_ratings),
            "avg_rating": ratings.mean(),
            "std_rating": ratings.std(),
            "elite_count": sum(1 for r in pos_ratings if r <= -3),
            "weak_count": sum(1 for r in pos_ratings if r >= 2),
        }

    return positions
|
|
|
|
|
|
# Run the validation report only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()
|