paper-dynasty-card-creation/retrosheet_data.py
2024-10-17 16:31:17 -05:00

103 lines
3.8 KiB
Python

import asyncio
import datetime
import logging
import os
import sys
from typing import Literal

import pandas as pd
import pybaseball as pb
# Read the clock once so the year/month/day pieces always come from the same
# instant (separate now() calls could straddle midnight and mix two days).
_today = datetime.datetime.now()
# Date stamp used in the log file name; month and day are not zero-padded
# (e.g. '2024-1-5'), which keeps the existing file-name pattern.
date = f'{_today.year}-{_today.month}-{_today.day}'
log_level = logging.INFO

# basicConfig opens the log file right away, so make sure the target directory
# exists instead of failing at import time when it is missing.
os.makedirs('logs', exist_ok=True)
logging.basicConfig(
    filename=f'logs/{date}.log',
    format='%(asctime)s - retrosheet_data - %(levelname)s - %(message)s',
    level=log_level
)

# Location and names of the Retrosheet CSV inputs (relative to the working directory).
FILE_PATH = 'data-input/retrosheet/'
EVENTS_FILENAME = 'retrosheets_events_1998_short.csv'  # Removed last few columns which were throwing dtype errors
PERSONNEL_FILENAME = 'retrosheets_personnel.csv'
def get_events_by_date(start_date: int, end_date: int) -> pd.DataFrame:
    """Load the events CSV and keep the plays whose date lies in [start_date, end_date].

    A ``date`` column is added by taking the characters of ``game_id`` from
    position 7 up to (but not including) the last one and casting them to int.
    Both bounds are inclusive and are compared against that integer
    (presumably month * 100 + day -- confirm against the game_id layout).
    """
    csv_path = f'{FILE_PATH}{EVENTS_FILENAME}'
    events = pd.read_csv(csv_path, dtype={'game_id': 'str'})

    # game_id is read as text so it can be sliced by character position.
    events['date'] = events['game_id'].str[7:-1].astype(int)

    in_window = (events['date'] >= start_date) & (events['date'] <= end_date)
    return events[in_window]
def get_result_series(plays: pd.DataFrame, event_type: str, pitcher_hand: Literal['r', 'l'], col_name: str) -> pd.Series:
    """Count, per batter, the plays of one event type against one pitcher hand.

    Args:
        plays: Play rows with ``event_type``, ``pitcher_hand`` and ``batter_id`` columns.
        event_type: Value of ``event_type`` to keep.
        pitcher_hand: Value of ``pitcher_hand`` to keep.
        col_name: Name given to the returned series.

    Returns:
        Integer series indexed by ``batter_id``; batters without a matching
        play do not appear in it.
    """
    wanted = (plays['event_type'] == event_type) & (plays['pitcher_hand'] == pitcher_hand)
    per_batter = plays.loc[wanted].groupby('batter_id')['event_type'].count()
    return per_batter.astype(int).rename(col_name)
def _count_flagged_plays(plays: pd.DataFrame, flag_col: str, pitcher_hand: str, col_name: str) -> pd.Series:
    """Count, per batter, the plays whose ``flag_col`` equals 't' against one pitcher hand."""
    flagged = (plays[flag_col] == 't') & (plays['pitcher_hand'] == pitcher_hand)
    per_batter = plays[flagged].groupby('batter_id')['event_type'].count()
    return per_batter.astype(int).rename(col_name)


def get_batting_stats_by_date(start_date: int, end_date: int) -> pd.DataFrame:
    """Build per-batter counting stats, split by pitcher hand, for a date window.

    Args:
        start_date: Inclusive lower bound passed to ``get_events_by_date``.
        end_date: Inclusive upper bound passed to ``get_events_by_date``.

    Returns:
        DataFrame indexed by ``batter_id`` with one row per batter seen in the
        window and the columns PAvL, PAvR, ABvL, ABvR followed by HR/1B/2B/3B/
        BB/SO/HBP, each as ``vR`` then ``vL``. Missing counts are filled with 0.
    """
    all_plays = get_events_by_date(start_date, end_date)

    # Column-less frame indexed by every batter in the window, so batters with
    # no qualifying plays still get a row (filled with 0 at the end).
    bs = pd.DataFrame()
    bs['batter_id'] = all_plays['batter_id'].unique()
    bs = bs.set_index('batter_id')

    # Gather every stat column first and align them in a single concat below,
    # rather than re-copying the growing frame once per column.
    columns = [bs]

    # Plate appearances and at-bats, selected through the 't'-valued flag columns
    for flag_col, vs_hand, col_name in [
        ('batter_event', 'l', 'PAvL'),
        ('batter_event', 'r', 'PAvR'),
        ('ab', 'l', 'ABvL'),
        ('ab', 'r', 'ABvR'),
    ]:
        columns.append(_count_flagged_plays(all_plays, flag_col, vs_hand, col_name))

    # Basic counting stats
    for event_type, vs_hand, col_name in [
        ('home run', 'r', 'HRvR'),
        ('home run', 'l', 'HRvL'),
        ('single', 'r', '1BvR'),
        ('single', 'l', '1BvL'),
        ('double', 'r', '2BvR'),
        ('double', 'l', '2BvL'),
        ('triple', 'r', '3BvR'),
        ('triple', 'l', '3BvL'),
        ('walk', 'r', 'BBvR'),
        ('walk', 'l', 'BBvL'),
        ('strikeout', 'r', 'SOvR'),
        ('strikeout', 'l', 'SOvL'),
        ('hit by pitch', 'r', 'HBPvR'),
        ('hit by pitch', 'l', 'HBPvL'),
    ]:
        columns.append(get_result_series(all_plays, event_type, vs_hand, col_name))

    # Bespoke queries

    # Align all series on batter_id; batters missing from a series get NaN there.
    bs = pd.concat(columns, axis=1)

    # fill na to 0 following counting stats
    return bs.fillna(0)
# def get_batting_stat_range(start_month: int, start_day: int, end_month: int, end_day: int):
# return get_batting_stats_by_date(
# start_date=start_month * 100 + start_day,
# end_date=end_month * 100 + end_day
# )
async def main(args):
    """Compute the batting stats for dates 101 through 430 and save them to batting_stats.csv.

    ``args`` is accepted but not used. Progress messages and the elapsed time
    of the calculation, the save, and the whole run are printed to stdout.
    """
    print('Running the calcs...')
    started_at = datetime.datetime.now()
    batting = get_batting_stats_by_date(start_date=101, end_date=430)
    calc_done_at = datetime.datetime.now()

    print('Saving to csv...')
    batting.to_csv('batting_stats.csv')
    saved_at = datetime.datetime.now()

    calc_secs = (calc_done_at - started_at).total_seconds()
    save_secs = (saved_at - calc_done_at).total_seconds()
    total_secs = (saved_at - started_at).total_seconds()
    print(f'Done!\n\nCalc time: {calc_secs}s\nSave time: {save_secs}s\nTotal: {total_secs}s')
if __name__ == '__main__':
    # Only when executed as a script: hand the command-line arguments
    # (without the program name) to the async entry point.
    cli_args = sys.argv[1:]
    asyncio.run(main(cli_args))