103 lines
3.8 KiB
Python
103 lines
3.8 KiB
Python
import asyncio
import datetime
import logging
import os
import sys
from typing import Literal

import pandas as pd
import pybaseball as pb
|
|
|
|
# Module-level configuration: today's date string, file logging, and the
# locations of the Retrosheet input files.

# Take one timestamp so year, month and day all come from the same instant;
# three separate now() calls could straddle midnight and mix two dates.
_today = datetime.datetime.now()
date = f'{_today.year}-{_today.month}-{_today.day}'  # components are not zero-padded

log_level = logging.INFO

# basicConfig opens the log file when it installs its file handler, which
# fails if the logs/ directory does not exist yet, so create it first.
os.makedirs('logs', exist_ok=True)
logging.basicConfig(
    filename=f'logs/{date}.log',
    format='%(asctime)s - retrosheet_data - %(levelname)s - %(message)s',
    level=log_level,
)

FILE_PATH = 'data-input/retrosheet/'
EVENTS_FILENAME = 'retrosheets_events_1998_short.csv'  # Removed last few columns which were throwing dtype errors
PERSONNEL_FILENAME = 'retrosheets_personnel.csv'
|
|
|
|
|
|
def get_events_by_date(start_date: int, end_date: int) -> pd.DataFrame:
    """Load the events CSV and keep the plays whose date lies in a range.

    The file at ``FILE_PATH + EVENTS_FILENAME`` is read with ``game_id`` forced
    to string. A ``date`` column is then derived from ``game_id`` by taking the
    characters from index 7 up to (but excluding) the last one and parsing
    them as an integer.
    NOTE(review): presumably that integer is month * 100 + day (e.g. 430 for
    April 30) -- confirm against the game_id layout of the input file.

    Args:
        start_date: Lowest ``date`` value to keep (inclusive).
        end_date: Highest ``date`` value to keep (inclusive).

    Returns:
        The rows of the events table, including the derived ``date`` column,
        whose ``date`` falls within ``[start_date, end_date]``.
    """
    csv_path = f'{FILE_PATH}{EVENTS_FILENAME}'
    events = pd.read_csv(csv_path, dtype={'game_id': 'str'})

    # Slice the date digits out of the game id and make them comparable as ints.
    events['date'] = events['game_id'].str[7:-1].astype(int)

    # between() is inclusive on both ends, matching ">= start and <= end".
    in_range = events['date'].between(start_date, end_date)
    return events[in_range]
|
|
|
|
|
|
def get_result_series(plays: pd.DataFrame, event_type: str, pitcher_hand: Literal['r', 'l'], col_name: str) -> pd.Series:
    """Count, per batter, the plays of one event type against one pitcher hand.

    Args:
        plays: Play-by-play table with ``batter_id``, ``event_type`` and
            ``pitcher_hand`` columns.
        event_type: Value of ``event_type`` to keep.
        pitcher_hand: Value of ``pitcher_hand`` to keep.
        col_name: Name given to the returned Series.

    Returns:
        An integer Series indexed by ``batter_id`` and named ``col_name``.
        Batters with no matching play do not appear in it.
    """
    is_event = plays['event_type'] == event_type
    is_hand = plays['pitcher_hand'] == pitcher_hand
    matching = plays[is_event & is_hand]

    # Number of non-null event_type entries in each batter's group.
    per_batter = matching.groupby('batter_id')['event_type'].count()
    return per_batter.astype(int).rename(col_name)
|
|
|
|
|
|
def _count_flagged_plays(plays: pd.DataFrame, flag_col: str, pitcher_hand: Literal['r', 'l'], col_name: str) -> pd.Series:
    """Count, per batter, the plays where ``flag_col`` is 't' against one pitcher hand."""
    flagged = plays[(plays[flag_col] == 't') & (plays['pitcher_hand'] == pitcher_hand)]
    return flagged.groupby('batter_id')['event_type'].count().astype(int).rename(col_name)


def get_batting_stats_by_date(start_date: int, end_date: int) -> pd.DataFrame:
    """Build per-batter counting stats, split by pitcher hand, for a date range.

    Plays are loaded through ``get_events_by_date(start_date, end_date)``. The
    result has one row per distinct ``batter_id`` in those plays and these
    columns, in order: PAvL, PAvR, ABvL, ABvR, then HR, 1B, 2B, 3B, BB, SO and
    HBP, each as a ``vR`` column followed by a ``vL`` column.

    Args:
        start_date: Lower date bound passed to ``get_events_by_date``.
        end_date: Upper date bound passed to ``get_events_by_date``.

    Returns:
        A DataFrame indexed by ``batter_id``. A batter with no plays of a given
        kind gets 0 in that column.
        NOTE(review): columns that needed filling are upcast by pandas to
        float, so counts may appear as e.g. 3.0 -- confirm whether consumers
        expect integers before changing this.
    """
    all_plays = get_events_by_date(start_date, end_date)

    # Empty frame whose index lists every batter seen, so that batters missing
    # from an individual stat still get a row.
    bs = pd.DataFrame()
    bs['batter_id'] = all_plays['batter_id'].unique()
    bs = bs.set_index('batter_id')

    # Plate appearances and at-bats are rows flagged 't' in a boolean-like column.
    flag_columns = [
        ('batter_event', 'l', 'PAvL'),
        ('batter_event', 'r', 'PAvR'),
        ('ab', 'l', 'ABvL'),
        ('ab', 'r', 'ABvR'),
    ]

    # Basic counting stats
    event_columns = [
        ('home run', 'r', 'HRvR'),
        ('home run', 'l', 'HRvL'),
        ('single', 'r', '1BvR'),
        ('single', 'l', '1BvL'),
        ('double', 'r', '2BvR'),
        ('double', 'l', '2BvL'),
        ('triple', 'r', '3BvR'),
        ('triple', 'l', '3BvL'),
        ('walk', 'r', 'BBvR'),
        ('walk', 'l', 'BBvL'),
        ('strikeout', 'r', 'SOvR'),
        ('strikeout', 'l', 'SOvL'),
        ('hit by pitch', 'r', 'HBPvR'),
        ('hit by pitch', 'l', 'HBPvL'),
    ]

    stat_series = [
        _count_flagged_plays(all_plays, flag_col, vs_hand, col_name)
        for flag_col, vs_hand, col_name in flag_columns
    ]
    stat_series += [
        get_result_series(all_plays, event_type, vs_hand, col_name)
        for event_type, vs_hand, col_name in event_columns
    ]

    # One concat aligns every series on batter_id at once instead of
    # re-copying the growing frame for each added column.
    bs = pd.concat([bs, *stat_series], axis=1)

    # Bespoke queries

    # fill na to 0 following counting stats
    bs = bs.fillna(0)

    return bs
|
|
|
|
|
|
# def get_batting_stat_range(start_month: int, start_day: int, end_month: int, end_day: int):
#     return get_batting_stats_by_date(
#         start_date=start_month * 100 + start_day,
#         end_date=end_month * 100 + end_day
#     )
|
|
|
|
|
|
async def main(args):
    """Compute batting stats for dates 101 through 430, save them, and print timings.

    The stats come from ``get_batting_stats_by_date`` and are written to
    ``batting_stats.csv`` in the current working directory. Progress messages
    and the elapsed calc/save/total times are printed to stdout.

    Args:
        args: Accepted but not used by this function.
    """
    print('Running the calcs...')
    started_at = datetime.datetime.now()
    stats = get_batting_stats_by_date(start_date=101, end_date=430)
    calc_done_at = datetime.datetime.now()

    print('Saving to csv...')
    stats.to_csv('batting_stats.csv')
    finished_at = datetime.datetime.now()

    # Elapsed wall-clock seconds for each phase and for the whole run.
    calc_secs = (calc_done_at - started_at).total_seconds()
    save_secs = (finished_at - calc_done_at).total_seconds()
    total_secs = (finished_at - started_at).total_seconds()
    print(f'Done!\n\nCalc time: {calc_secs}s\nSave time: {save_secs}s\nTotal: {total_secs}s')
|
|
|
|
|
|
if __name__ == '__main__':
    # Script entry point: hand the command-line arguments (program name
    # dropped) to the async main() and run it to completion.
    cli_args = sys.argv[1:]
    asyncio.run(main(cli_args))
|