paper-dynasty-card-creation/retrosheet_data.py
Cal Corum b3102201c8 Added Devil Rays to club and franchise lists
Fixed bphr fraction bug
Removed player post limit
2024-10-25 12:24:08 -05:00

867 lines
38 KiB
Python

import asyncio
import datetime
import logging
import sys
from typing import Literal
import pandas as pd
import pybaseball as pb
from pybaseball import cache
import urllib
from creation_helpers import get_args, CLUB_LIST, FRANCHISE_LIST
from batters.stat_prep import DataMismatchError
from db_calls import DB_URL, db_get, db_patch, db_post, db_put
import batters.calcs_batter as cba
import defenders.calcs_defense as cde
# Turn on pybaseball's cache (``cache`` is imported from pybaseball above).
cache.enable()
# Date stamp (year-month-day, no zero padding) used to name the log file.
date = f'{datetime.datetime.now().year}-{datetime.datetime.now().month}-{datetime.datetime.now().day}'
log_level = logging.INFO
# Log to logs/<date>.log.
# NOTE(review): presumably the logs/ directory must already exist — confirm.
logging.basicConfig(
    filename=f'logs/{date}.log',
    format='%(asctime)s - retrosheet_data - %(levelname)s - %(message)s',
    level=log_level
)
# Input locations: Retrosheet event/personnel files and the per-cardset input folder
# (running.csv and defense_<pos>.csv are read from DATA_INPUT_FILE_PATH).
RETRO_FILE_PATH = 'data-input/retrosheet/'
EVENTS_FILENAME = 'retrosheets_events_1998_short.csv' # Removed last few columns which were throwing dtype errors
PERSONNEL_FILENAME = 'retrosheets_personnel.csv'
DATA_INPUT_FILE_PATH = 'data-input/1998 Season Cardset/'
# Prefix of the card image URL patched onto newly created players.
CARD_BASE_URL = f'{DB_URL}/v2/players/'
start_time = datetime.datetime.now()
# Date string appended to the card image URL as the value of its ``d`` query parameter.
RELEASE_DIRECTORY = f'{start_time.year}-{start_time.month}-{start_time.day}'
# Minimum plate appearances vs left- and right-handed pitchers for a batter to be kept
# (applied in get_base_batting_df).
MIN_PA_VL = 20
MIN_PA_VR = 40
# Pitcher-side thresholds mirror the batter ones; not referenced in the visible part of this file.
MIN_TBF_VL = MIN_PA_VL
MIN_TBF_VR = MIN_PA_VR
# Cardset used for every database query and player payload in this module.
CARDSET_ID = 20
# Value sent as the ``description`` field of new players.
PLAYER_DESCRIPTION = 'Live'
async def store_defense_to_csv(season: int):
    """Save one fielding CSV per position for ``season``.

    For each position code the table returned by ``cde.get_bbref_fielding_df``
    is written to ``<DATA_INPUT_FILE_PATH>defense_<position>.csv``. The
    coroutine sleeps for 8 seconds after each position.
    """
    position_codes = ('c', '1b', '2b', '3b', 'ss', 'lf', 'cf', 'rf', 'of', 'p')
    for pos_code in position_codes:
        fielding = cde.get_bbref_fielding_df(pos_code, season)
        destination = f'{DATA_INPUT_FILE_PATH}defense_{pos_code}.csv'
        fielding.to_csv(destination)
        await asyncio.sleep(8)
def get_events_by_date(file_path: str, start_date: int, end_date: int) -> pd.DataFrame:
    """Load the events CSV and keep only plays dated within ``[start_date, end_date]``.

    A ``date`` column is added to the loaded frame: it is ``game_id`` with its
    first three characters and its last character removed, converted to an
    integer. Both bounds are inclusive and are compared against that integer.
    """
    events = pd.read_csv(file_path, dtype={'game_id': 'str'})
    events['date'] = events['game_id'].str[3:-1].astype(int)

    on_or_after_start = events['date'] >= start_date
    on_or_before_end = events['date'] <= end_date
    return events[on_or_after_start & on_or_before_end]
def get_result_series(plays: pd.DataFrame, event_type: str, pitcher_hand: Literal['r', 'l'], col_name: str) -> pd.Series:
    """Count, per batter, the plays of one event type against one pitcher hand.

    Returns an integer Series indexed by ``batter_id`` and named ``col_name``.
    Batters with no matching plays do not appear in the result.
    """
    wanted_event = plays.event_type == event_type
    wanted_hand = plays.pitcher_hand == pitcher_hand
    matching = plays[wanted_event & wanted_hand]

    per_batter = matching.groupby('batter_id').count()
    return per_batter['event_type'].astype(int).rename(col_name)
def get_run_stat_df(input_path: str):
    """Read ``running.csv`` from ``input_path`` and return the baserunning columns.

    ``input_path`` is prefixed to the file name as-is, so it must end with a
    path separator. A ``Player-additional`` or ``Name-additional`` column is
    renamed to ``key_bbref``; the frame is then cut down to the running-stat
    columns, missing values become 0, and ``key_bbref`` becomes the index.
    """
    wanted_columns = [
        'key_bbref', 'Tm', 'ROE', 'XI', 'RS%', 'SBO', 'SB', 'CS', 'SB%', 'SB2', 'CS2',
        'SB3', 'CS3', 'SBH', 'CSH', 'PO', 'PCS', 'OOB', 'OOB1', 'OOB2', 'OOB3', 'OOBHm',
        'BT', 'XBT%', '1stS', '1stS2', '1stS3', '1stD', '1stD3', '1stDH', '2ndS', '2ndS3',
        '2ndSH',
    ]

    running = pd.read_csv(f'{input_path}running.csv')

    # The player-id column goes by either of these names in the input file.
    for id_column in ('Player-additional', 'Name-additional'):
        if id_column in running:
            running = running.rename(columns={id_column: 'key_bbref'})

    trimmed = running[wanted_columns].fillna(0)
    return trimmed.set_index('key_bbref')
def get_player_ids(plays: pd.DataFrame, which: Literal['batters', 'pitchers']) -> pd.DataFrame:
    """Return one row per player appearing in ``plays`` with names and cross-site ids.

    The unique ``batter_id`` / ``pitcher_id`` values are joined to the Retrosheet
    personnel file for ``last_name`` and ``use_name``, then each id is sent to
    ``pb.playerid_reverse_lookup`` to obtain ``key_mlbam``, ``key_retro``,
    ``key_bbref`` and ``key_fangraphs``. The result is indexed by the id column.
    For batters a ``bat_hand`` column ('L', 'R' or 'S') is added.

    If a lookup comes back empty a message is printed; the following
    ``.loc[0]`` then fails, so an unknown id stops the run.
    """
    id_key = 'batter_id' if which == 'batters' else 'pitcher_id'
    personnel = pd.read_csv(f'{RETRO_FILE_PATH}{PERSONNEL_FILENAME}')

    ids_in_plays = pd.Series(plays[id_key].unique()).to_frame('id')
    # Right join: keep every id seen in the plays, even without a personnel row.
    players = pd.merge(
        left=personnel,
        right=ids_in_plays,
        how='right',
        left_on='id',
        right_on='id'
    )
    players = players.rename(columns={'id': id_key})

    def lookup_other_ids(row):
        retro_id = row[id_key]
        found = pb.playerid_reverse_lookup([retro_id], key_type='retro')
        if len(found.values) == 0:
            print(f'Could not find id {retro_id} in pybaseball lookup')
        return found.loc[0][['key_mlbam', 'key_retro', 'key_bbref', 'key_fangraphs']]

    players = players[[id_key, 'last_name', 'use_name']]

    lookup_started = datetime.datetime.now()
    other_ids = players.apply(lookup_other_ids, axis=1)
    lookup_seconds = (datetime.datetime.now() - lookup_started).total_seconds()
    print(f'ID lookup: {lookup_seconds:.2f}s')

    players = pd.merge(
        left=players,
        right=other_ids,
        left_on=id_key,
        right_on='key_retro'
    )
    players = players.set_index(id_key)

    def infer_bat_hand(row):
        """Classify a batter from the sides recorded in ``result_batter_hand``."""
        own_plays = plays.batter_id == row['key_retro']

        def side_counts(pitcher_hand):
            subset = plays[own_plays & (plays.pitcher_hand == pitcher_hand)]
            return subset.groupby('result_batter_hand').count()['game_id'].astype(int)

        def plays_from(counts, side):
            return counts[side] if side in counts else 0

        vs_lefties = side_counts('l')
        vs_righties = side_counts('r')
        as_lefty = plays_from(vs_lefties, 'l') + plays_from(vs_righties, 'l')
        as_righty = plays_from(vs_lefties, 'r') + plays_from(vs_righties, 'r')

        # Only ever seen from one side.
        if as_lefty == 0 and as_righty > 0:
            return 'R'
        if as_lefty > 0 and as_righty == 0:
            return 'L'

        # Seen from both sides (or not at all): ten or more plays makes a
        # switch hitter, otherwise the more common side wins and ties go to 'R'.
        if as_lefty + as_righty >= 10:
            return 'S'
        return 'L' if as_lefty > as_righty else 'R'

    if which == 'batters':
        players['bat_hand'] = players.apply(infer_bat_hand, axis=1)

    return players
def get_base_batting_df(all_plays: pd.DataFrame) -> pd.DataFrame:
    """Return qualifying batters with their PA and AB totals split by pitcher hand.

    Starts from ``get_player_ids(all_plays, 'batters')`` and appends ``PA_vL``,
    ``PA_vR`` (plays flagged ``batter_event == 't'``) and ``AB_vL``, ``AB_vR``
    (plays flagged ``ab == 't'``). Rows with any missing value are dropped, as
    are batters below ``MIN_PA_VL`` / ``MIN_PA_VR`` plate appearances.
    """
    batters = get_player_ids(all_plays, 'batters')

    # (flag column, pitcher hand, output column), appended in this order
    split_specs = (
        ('batter_event', 'l', 'PA_vL'),
        ('batter_event', 'r', 'PA_vR'),
        ('ab', 'l', 'AB_vL'),
        ('ab', 'r', 'AB_vR'),
    )
    for flag_column, hand, column_name in split_specs:
        flagged = all_plays[(all_plays[flag_column] == 't') & (all_plays.pitcher_hand == hand)]
        counts = flagged.groupby('batter_id').count()['event_type'].astype(int).rename(column_name)
        batters = pd.concat([batters, counts], axis=1)

    complete_rows = batters.dropna()
    return complete_rows.query(f'PA_vL >= {MIN_PA_VL} & PA_vR >= {MIN_PA_VR}')
def get_batting_stats_by_date(retro_file_path, start_date: int, end_date: int) -> pd.DataFrame:
    """Build per-batter batting stats, split by pitcher hand, for a date range.

    Steps (each one prints how long it took):
      1. load the events between ``start_date`` and ``end_date`` (inclusive);
      2. build the base frame of qualifying batters with PA/AB by pitcher hand;
      3. count basic events (HR, 1B, 2B, 3B, BB, SO, HBP) per hand;
      4. count batted-ball types, double/triple plays, bunts, infield hits and
         pull/center/oppo hit locations;
      5. derive rate stats (AVG, OBP, SLG, HR/FB, FB%/GB%/LD%, Hard%/Med%/Soft%,
         IFH%, Pull%/Cent%/Oppo%), rounded to 5 decimals.

    Every ``_vL`` / ``_vR`` pair of calculated fields is produced by the same
    expression, so the two sides cannot drift apart.

    Args:
        retro_file_path: path of the Retrosheet events CSV.
        start_date: first date to include, as the integer found in ``game_id``.
        end_date: last date to include, same format.

    Returns:
        DataFrame with one row per qualifying batter; missing values are 0.
    """
    hands = ('L', 'R')  # column suffixes; the pitcher_hand values are the lowercase forms

    start = datetime.datetime.now()
    all_plays = get_events_by_date(retro_file_path, start_date, end_date)
    print(f'Pull events: {(datetime.datetime.now() - start).total_seconds():.2f}s')

    start = datetime.datetime.now()
    batting_stats = get_base_batting_df(all_plays)
    print(f'Get base dataframe: {(datetime.datetime.now() - start).total_seconds():.2f}s')

    # Only plays by qualifying batters are needed from here on.
    start = datetime.datetime.now()
    all_player_ids = batting_stats['key_retro']
    all_plays = all_plays[all_plays['batter_id'].isin(all_player_ids)]
    print(f'Shrink all_plays: {(datetime.datetime.now() - start).total_seconds():.2f}s')

    # Basic counting stats
    start = datetime.datetime.now()
    for event_type, vs_hand, col_name in [
        ('home run', 'r', 'HR_vR'),
        ('home run', 'l', 'HR_vL'),
        ('single', 'r', '1B_vR'),
        ('single', 'l', '1B_vL'),
        ('double', 'r', '2B_vR'),
        ('double', 'l', '2B_vL'),
        ('triple', 'r', '3B_vR'),
        ('triple', 'l', '3B_vL'),
        ('walk', 'r', 'BB_vR'),
        ('walk', 'l', 'BB_vL'),
        ('strikeout', 'r', 'SO_vR'),
        ('strikeout', 'l', 'SO_vL'),
        ('hit by pitch', 'r', 'HBP_vR'),
        ('hit by pitch', 'l', 'HBP_vL')
    ]:
        # Assignment aligns on the batter_id index; batters without the event get NaN for now.
        batting_stats[col_name] = get_result_series(all_plays, event_type, vs_hand, col_name)
    print(f'Count basic stats: {(datetime.datetime.now() - start).total_seconds():.2f}s')

    # Bespoke counting stats
    start = datetime.datetime.now()

    def count_plays(mask):
        """Number of plays selected by ``mask`` (counted on the event_type column)."""
        return all_plays[mask].count()['event_type'].astype(int)

    def batted_ball_counter(ball_type, p_hand):
        """Row function counting one batted-ball type against one pitcher hand."""
        def counter(row):
            return count_plays(
                (all_plays.batter_id == row['key_retro'])
                & (all_plays.batted_ball_type == ball_type)
                & (all_plays.pitcher_hand == p_hand)
            )
        return counter

    def gdp_counter(p_hand):
        """Row function counting double plus triple plays on batter events."""
        def counter(row):
            batter_events = (
                (all_plays.batter_id == row['key_retro'])
                & (all_plays.batter_event == 't')
                & (all_plays.pitcher_hand == p_hand)
            )
            dp = count_plays(batter_events & (all_plays.dp == 't'))
            tp = count_plays(batter_events & (all_plays.tp == 't'))
            return dp + tp
        return counter

    def get_bunt(row):
        return count_plays((all_plays.batter_id == row['key_retro']) & (all_plays.bunt == 't'))

    # NOTE(review): ground balls are matched with an uppercase 'G' while fly balls
    # and line drives use lowercase codes — kept as-is, confirm against the data.
    for col_name, ball_type, p_hand in [
        ('FB_vL', 'f', 'l'),
        ('FB_vR', 'f', 'r'),
        ('GB_vL', 'G', 'l'),
        ('GB_vR', 'G', 'r'),
        ('LD_vL', 'l', 'l'),
        ('LD_vR', 'l', 'r'),
    ]:
        batting_stats[col_name] = batting_stats.apply(batted_ball_counter(ball_type, p_hand), axis=1)
    batting_stats['GDP_vL'] = batting_stats.apply(gdp_counter('l'), axis=1)
    batting_stats['GDP_vR'] = batting_stats.apply(gdp_counter('r'), axis=1)
    batting_stats['Bunts'] = batting_stats.apply(get_bunt, axis=1)
    print(f'Custom counting stats: {(datetime.datetime.now() - start).total_seconds():.2f}s')

    # Infield Hit %: hits (hit_val 1-3) located at positions 1-6, excluding 'D' locations
    for hand in hands:
        infield_hits = all_plays[
            (all_plays.hit_val.str.contains('1|2|3'))
            & (all_plays.pitcher_hand == hand.lower())
            & (all_plays.hit_location.str.contains('1|2|3|4|5|6'))
            & (~all_plays.hit_location.str.contains('D', na=False))
        ]
        batting_stats[f'ifh_v{hand}'] = infield_hits.groupby('batter_id').count()['event_type'].astype(int)

    def pull_counter(p_hand):
        """Row function counting balls hit to the batter's pull side.

        'S' batters are treated like 'R' against left-handed pitchers and like
        'L' against right-handed pitchers.
        """
        def counter(row):
            if p_hand == 'l':
                pull_loc = '5|7' if row['bat_hand'] != 'L' else '3|9'
            else:
                pull_loc = '5|7' if row['bat_hand'] == 'R' else '3|9'
            return count_plays(
                (all_plays.batter_id == row['key_retro'])
                & (all_plays.pitcher_hand == p_hand)
                & (all_plays.hit_location.str.contains(pull_loc))
            )
        return counter

    # Bespoke Queries
    batting_stats['pull_vL'] = batting_stats.apply(pull_counter('l'), axis=1)
    batting_stats['pull_vR'] = batting_stats.apply(pull_counter('r'), axis=1)

    # NOTE(review): 'oppo' uses locations 5|7 for every batter, which is the same
    # set counted as pull for right-handed hitters above. Left unchanged here;
    # confirm whether it should mirror pull_loc per batter.
    for prefix, locations in [('center', '1|4|6|8'), ('oppo', '5|7')]:
        for hand in hands:
            located = all_plays[
                (all_plays.pitcher_hand == hand.lower())
                & (all_plays.hit_location.str.contains(locations))
            ]
            batting_stats[f'{prefix}_v{hand}'] = located.groupby('batter_id').count()['event_type'].astype(int)

    # fill na to 0 following counting stats
    batting_stats = batting_stats.fillna(0)

    # Calculated Fields
    start = datetime.datetime.now()

    def col(stat, hand):
        return batting_stats[f'{stat}_v{hand}']

    for hand in hands:
        batting_stats[f'H_v{hand}'] = col('1B', hand) + col('2B', hand) + col('3B', hand) + col('HR', hand)
    for hand in hands:
        batting_stats[f'AVG_v{hand}'] = (col('H', hand) / col('AB', hand)).round(5)
    for hand in hands:
        batting_stats[f'OBP_v{hand}'] = (
            (col('H', hand) + col('BB', hand) + col('HBP', hand)) / col('PA', hand)
        ).round(5)
    for hand in hands:
        batting_stats[f'SLG_v{hand}'] = (
            (col('1B', hand) + col('2B', hand) * 2 + col('3B', hand) * 3 + col('HR', hand) * 4)
            / col('AB', hand)
        ).round(5)
    for hand in hands:
        batting_stats[f'HR/FB_v{hand}'] = (col('HR', hand) / col('FB', hand)).round(5)
    for ball_type in ('FB', 'GB', 'LD'):
        for hand in hands:
            batted_balls = col('FB', hand) + col('GB', hand) + col('LD', hand)
            batting_stats[f'{ball_type}%_v{hand}'] = (col(ball_type, hand) / batted_balls).round(5)
    for hand in hands:
        batting_stats[f'Hard%_v{hand}'] = (0.2 + col('SLG', hand) - col('AVG', hand)).round(5)

    def med_pct(hand):
        """Row function for Med%: min(0.9 - Hard%, 1.5 * (SLG - AVG)), floored at 0.1."""
        def calc(row):
            high = 0.9 - row[f'Hard%_v{hand}']
            low = (row[f'SLG_v{hand}'] - row[f'AVG_v{hand}']) * 1.5
            return round(max(min(high, low), 0.1), 5)
        return calc

    for hand in hands:
        batting_stats[f'Med%_v{hand}'] = batting_stats.apply(med_pct(hand), axis=1)
    for hand in hands:
        batting_stats[f'Soft%_v{hand}'] = (1 - col('Hard%', hand) - col('Med%', hand)).round(5)
    for hand in hands:
        batting_stats[f'IFH%_v{hand}'] = (col('ifh', hand) / col('H', hand)).round(5)
    # Numerator and denominator both use the same hand; Cent%_vR previously
    # divided the vs-left center count by the vs-right total.
    for pct_name, count_name in (('Pull', 'pull'), ('Cent', 'center')):
        for hand in hands:
            located_total = col('pull', hand) + col('center', hand) + col('oppo', hand)
            batting_stats[f'{pct_name}%_v{hand}'] = (col(count_name, hand) / located_total).round(5)
    for hand in hands:
        batting_stats[f'Oppo%_v{hand}'] = (1 - col('Pull%', hand) - col('Cent%', hand)).round(5)

    # Divisions of 0 by 0 leave NaN behind; report those as 0.
    batting_stats = batting_stats.fillna(0)
    print(f'Calculated fields: {(datetime.datetime.now() - start).total_seconds():.2f}s')

    return batting_stats
def calc_batting_cards(bs: pd.DataFrame, season_pct: float) -> pd.DataFrame:
    """Compute the batting-card fields for every batter in ``bs``.

    Each row yields steal ratings (from ``cba.stealing``), hit-and-run, bunt and
    running ratings, plus the batter's hand. ``season_pct`` is passed to the
    bunt calculation only; the steal calculation always uses ``season_pct=1.0``.

    Returns a DataFrame indexed by ``key_bbref``.
    """
    def build_card(row):
        steals = cba.stealing(
            chances=int(row['SBO']),
            sb2s=int(row['SB2']),
            cs2s=int(row['CS2']),
            sb3s=int(row['SB3']),
            cs3s=int(row['CS3']),
            season_pct=1.0
        )
        hit_and_run = cba.hit_and_run(
            row['AB_vL'], row['AB_vR'], row['H_vL'], row['H_vR'],
            row['HR_vL'], row['HR_vR'], row['SO_vL'], row['SO_vR']
        )
        bunt = cba.bunting(row['Bunts'], season_pct)
        running = cba.running(row['XBT%'])

        # A one-row frame is built and its only row returned as a Series.
        card = pd.DataFrame({
            'key_bbref': [row['key_bbref']],
            'steal_low': [steals[0]],
            'steal_high': [steals[1]],
            'steal_auto': [steals[2]],
            'steal_jump': [steals[3]],
            'hit_and_run': [hit_and_run],
            'bunt': [bunt],
            'running': [running],
            'hand': [row['bat_hand']],
        })
        return card.loc[0]

    cards = bs.apply(build_card, axis=1)
    return cards.set_index('key_bbref')
def calc_batter_ratings(bs: pd.DataFrame) -> pd.DataFrame:
    """Compute split ratings, rarity and cost for every batter in ``bs``.

    ``cba.get_batter_ratings`` supplies the vs-left (index 0) and vs-right
    (index 1) ratings. Total OPS is the mean of the two split OPS values and the
    lower of them (so the weaker side counts twice). The total picks a rarity
    tier, and the tier's parameters drive the cost.

    Returns a DataFrame indexed by ``key_bbref``.
    """
    # (minimum total OPS, rarity_id, base_cost, base_ops, max_delta), best tier first
    tiers = (
        (1.2, 99, 2400, 1.215, 810),
        (1, 1, 810, 1.05, 270),
        (0.9, 2, 270, 0.95, 90),
        (0.8, 3, 90, 0.85, 30),
        (0.7, 4, 30, 0.75, 10),
    )
    # Used when no tier threshold is met: (rarity_id, base_cost, base_ops, max_delta)
    lowest_tier = (5, 10, 0.61, 8)

    def price_card(total_ops, base_cost, base_ops, max_delta) -> int:
        steps = ((total_ops - base_ops) / 0.1) * 2
        if steps < 1:
            steps = (max_delta * (1 - (total_ops / base_ops))) * -0.1
        return round(base_cost + (max_delta * steps))

    def rate_batter(row):
        ratings = cba.get_batter_ratings(row)
        ops_vl = ratings[0]['obp'] + ratings[0]['slg']
        ops_vr = ratings[1]['obp'] + ratings[1]['slg']
        total_ops = (ops_vl + ops_vr + min(ops_vr, ops_vl)) / 3

        rarity_id, base_cost, base_ops, max_delta = lowest_tier
        for min_ops, tier_rarity, tier_cost, tier_ops, tier_delta in tiers:
            if total_ops >= min_ops:
                rarity_id, base_cost, base_ops, max_delta = tier_rarity, tier_cost, tier_ops, tier_delta
                break
        cost = price_card(total_ops, base_cost=base_cost, base_ops=base_ops, max_delta=max_delta)

        rating = pd.DataFrame({
            'key_bbref': [row['key_bbref']],
            'ratings_vL': [ratings[0]],
            'ratings_vR': [ratings[1]],
            'ops_vL': ops_vl,
            'ops_vR': ops_vr,
            'total_ops': total_ops,
            'rarity_id': rarity_id,
            'cost': cost
        })
        return rating.loc[0]

    rated = bs.apply(rate_batter, axis=1)
    return rated.set_index('key_bbref')
def calc_positions(bs: pd.DataFrame) -> pd.DataFrame:
    """Build the defensive position rows for every batter in ``bs``.

    Reads the per-position ``defense_<pos>.csv`` files from
    ``DATA_INPUT_FILE_PATH`` (each indexed by ``key_bbref``) and, for every
    batter, appends one dict per position to ``all_pos``:

    * infield (1B, 2B, 3B, SS): range and error;
    * outfield (LF, CF, RF): range per position, plus one shared error and arm
      rating taken from the combined outfield table;
    * catcher: range, error, arm, passed balls and overthrow.

    A position is only recorded when ``Inn_def`` is at least 10. Batters with no
    recorded position get a single 'DH' row whose innings are ``PA_vL + PA_vR``.
    Any exception raised while rating a position is logged at INFO level and
    that position is skipped.

    Returns:
        DataFrame of position rows indexed by ``key_bbref`` (a batter can have
        several rows).
    """
    df_c = pd.read_csv(f'{DATA_INPUT_FILE_PATH}defense_c.csv').set_index('key_bbref')
    df_1b = pd.read_csv(f'{DATA_INPUT_FILE_PATH}defense_1b.csv').set_index('key_bbref')
    df_2b = pd.read_csv(f'{DATA_INPUT_FILE_PATH}defense_2b.csv').set_index('key_bbref')
    df_3b = pd.read_csv(f'{DATA_INPUT_FILE_PATH}defense_3b.csv').set_index('key_bbref')
    df_ss = pd.read_csv(f'{DATA_INPUT_FILE_PATH}defense_ss.csv').set_index('key_bbref')
    df_lf = pd.read_csv(f'{DATA_INPUT_FILE_PATH}defense_lf.csv').set_index('key_bbref')
    df_cf = pd.read_csv(f'{DATA_INPUT_FILE_PATH}defense_cf.csv').set_index('key_bbref')
    df_rf = pd.read_csv(f'{DATA_INPUT_FILE_PATH}defense_rf.csv').set_index('key_bbref')
    df_of = pd.read_csv(f'{DATA_INPUT_FILE_PATH}defense_of.csv').set_index('key_bbref')
    # Defensive ratings are always computed as a full season here.
    season_pct = 1.0
    # Filled by process_pos as a side effect of the apply() below.
    all_pos = []
    def process_pos(row):
        # Stays True until at least one position is recorded for this batter.
        no_data = True
        # TODO: Add pos_1 through pos_8 to def df to be pulled in at post time
        for pos_df, position in [(df_1b, '1b'), (df_2b, '2b'), (df_3b, '3b'), (df_ss, 'ss')]:
            if row['key_bbref'] in pos_df.index:
                logging.info(f'Running {position} stats for {row["use_name"]} {row["last_name"]}')
                try:
                    # NOTE(review): this membership test is against the batter row, not
                    # pos_df, so the blended branch only runs if the batting frame itself
                    # has a 'tz_runs_total' entry — confirm that is the intent.
                    if 'tz_runs_total' in row:
                        # Mean of both run values and the lower of the two.
                        average_range = (int(pos_df.at[row["key_bbref"], 'tz_runs_total']) +
                                         int(pos_df.at[row["key_bbref"], 'bis_runs_total']) +
                                         min(
                                             int(pos_df.at[row["key_bbref"], 'tz_runs_total']),
                                             int(pos_df.at[row["key_bbref"], 'bis_runs_total'])
                                         )) / 3
                    else:
                        average_range = pos_df.at[row["key_bbref"], 'tz_runs_total']
                    if float(pos_df.at[row["key_bbref"], 'Inn_def']) >= 10.0:
                        all_pos.append({
                            "key_bbref": row['key_bbref'],
                            "position": position.upper(),
                            "innings": float(pos_df.at[row["key_bbref"], 'Inn_def']),
                            "range": cde.get_if_range(
                                pos_code=position,
                                tz_runs=round(average_range),
                                r_dp=0,
                                season_pct=season_pct
                            ),
                            "error": cde.get_any_error(
                                pos_code=position,
                                errors=int(pos_df.at[row["key_bbref"], 'E_def']),
                                chances=int(pos_df.at[row["key_bbref"], 'chances']),
                                season_pct=season_pct
                            )
                        })
                        no_data = False
                except Exception as e:
                    logging.info(f'Infield position failed: {e}')
        # Outfield rows are held back until the shared error/arm ratings are known.
        of_arms = []
        of_payloads = []
        for pos_df, position in [(df_lf, 'lf'), (df_cf, 'cf'), (df_rf, 'rf')]:
            if row["key_bbref"] in pos_df.index:
                try:
                    # NOTE(review): same batter-row membership test as in the infield loop.
                    if 'tz_runs_total' in row:
                        average_range = (int(pos_df.at[row["key_bbref"], 'tz_runs_total']) +
                                         int(pos_df.at[row["key_bbref"], 'bis_runs_total']) +
                                         min(
                                             int(pos_df.at[row["key_bbref"], 'tz_runs_total']),
                                             int(pos_df.at[row["key_bbref"], 'bis_runs_total'])
                                         )) / 3
                    else:
                        average_range = pos_df.at[row["key_bbref"], 'tz_runs_total']
                    if float(pos_df.at[row["key_bbref"], 'Inn_def']) >= 10.0:
                        of_payloads.append({
                            "key_bbref": row['key_bbref'],
                            "position": position.upper(),
                            "innings": float(pos_df.at[row["key_bbref"], 'Inn_def']),
                            "range": cde.get_of_range(
                                pos_code=position,
                                tz_runs=round(average_range),
                                season_pct=season_pct
                            )
                        })
                        # Prefer the bis_* outfield column when the file has one.
                        of_run_rating = 'bis_runs_outfield' if 'bis_runs_outfield' in pos_df else 'tz_runs_outfield'
                        of_arms.append(int(pos_df.at[row["key_bbref"], of_run_rating]))
                        no_data = False
                except Exception as e:
                    logging.info(f'Outfield position failed: {e}')
        if row["key_bbref"] in df_of.index and len(of_arms) > 0 and len(of_payloads) > 0:
            try:
                # NOTE(review): `position` still holds the last value from the outfield
                # loop above ('rf'), so the combined outfield error is always rated with
                # pos_code='rf' — confirm this is intended.
                error_rating = cde.get_any_error(
                    pos_code=position,
                    errors=int(df_of.at[row["key_bbref"], 'E_def']),
                    chances=int(df_of.at[row["key_bbref"], 'chances']),
                    season_pct=season_pct
                )
                arm_rating = cde.arm_outfield(of_arms)
                # Every outfield position shares the same error and arm rating.
                for f in of_payloads:
                    f['error'] = error_rating
                    f['arm'] = arm_rating
                    all_pos.append(f)
                no_data = False
            except Exception as e:
                logging.info(f'Outfield position failed: {e}')
        if row["key_bbref"] in df_c.index:
            try:
                run_rating = 'bis_runs_catcher_sb' if 'bis_runs_catcher_sb' in df_c else 'tz_runs_catcher'
                # No steal attempts against this catcher: use a fixed arm rating of 3.
                if df_c.at[row["key_bbref"], 'SB'] + df_c.at[row["key_bbref"], 'CS'] == 0:
                    arm_rating = 3
                else:
                    arm_rating = cde.arm_catcher(
                        cs_pct=df_c.at[row["key_bbref"], 'caught_stealing_perc'],
                        raa=int(df_c.at[row["key_bbref"], run_rating]),
                        season_pct=season_pct
                    )
                if float(df_c.at[row["key_bbref"], 'Inn_def']) >= 10.0:
                    all_pos.append({
                        "key_bbref": row['key_bbref'],
                        "position": 'C',
                        "innings": float(df_c.at[row["key_bbref"], 'Inn_def']),
                        "range": cde.range_catcher(
                            rs_value=int(df_c.at[row["key_bbref"], 'tz_runs_catcher']),
                            season_pct=season_pct
                        ),
                        "error": cde.get_any_error(
                            pos_code='c',
                            errors=int(df_c.at[row["key_bbref"], 'E_def']),
                            chances=int(df_c.at[row["key_bbref"], 'chances']),
                            season_pct=season_pct
                        ),
                        "arm": arm_rating,
                        "pb": cde.pb_catcher(
                            pb=int(df_c.at[row["key_bbref"], 'PB']),
                            innings=int(float(df_c.at[row["key_bbref"], 'Inn_def'])),
                            season_pct=season_pct
                        ),
                        "overthrow": cde.ot_catcher(
                            errors=int(df_c.at[row["key_bbref"], 'E_def']),
                            chances=int(df_c.at[row["key_bbref"], 'chances']),
                            season_pct=season_pct
                        )
                    })
                    no_data = False
            except Exception as e:
                logging.info(f'Catcher position failed: {e}')
        # Nothing recorded anywhere: list the batter as a DH, with plate
        # appearances standing in for innings.
        if no_data:
            all_pos.append({
                "key_bbref": row['key_bbref'],
                "position": 'DH',
                "innings": row['PA_vL'] + row['PA_vR']
            })
    # apply() is used only for process_pos's side effect on all_pos.
    bs.apply(process_pos, axis=1)
    pos_df = pd.DataFrame(all_pos)
    pos_df = pos_df.set_index('key_bbref')
    return pos_df
async def get_or_post_players(stat_df: pd.DataFrame, bat_card_df: pd.DataFrame, bat_rat_df: pd.DataFrame, def_rat_df: pd.DataFrame) -> pd.DataFrame:
    """Fetch each batter's player record for this cardset, creating it when missing.

    For every row of ``stat_df`` the ``players`` endpoint is queried by
    ``bbref_id`` and ``CARDSET_ID``. An existing record is reused. Otherwise the
    matching ``mlbplayers`` record is fetched (or created), a new player is
    posted with cost and rarity from ``bat_rat_df`` and positions from
    ``def_rat_df`` (most innings first), and its image URL is patched to point
    at the batting card.

    Args:
        stat_df: batting stats with the id, name and ``Tm`` columns.
        bat_card_df: accepted for interface compatibility; not used here.
        bat_rat_df: batter ratings indexed by ``key_bbref`` (``cost``, ``rarity_id``).
        def_rat_df: position rows indexed by ``key_bbref`` (``position``, ``innings``).

    Returns:
        DataFrame of player records indexed by ``bbref_id``. Newly created
        players appear as returned by the POST, i.e. before the image patch.
    """
    # TODO: Refactor this to support batters or pitchers, make stat dfs optional
    # and send to getorpostbatters/getorpostpitchers
    # Imported here because a bare ``import urllib`` does not guarantee that the
    # ``urllib.parse`` submodule has been loaded.
    import urllib.parse

    all_players = []
    for _, row in stat_df.iterrows():
        bbref_id = row['key_bbref']
        full_name = f'{row["use_name"]} {row["last_name"]}'

        p_query = await db_get('players', params=[('bbref_id', bbref_id), ('cardset_id', CARDSET_ID)])
        if p_query['count'] > 0:
            all_players.append(p_query['players'][0])
            continue

        mlb_query = await db_get('mlbplayers', params=[('key_retro', row['key_retro'])])
        if mlb_query['count'] > 0:
            mlb_player = mlb_query['players'][0]
        else:
            mlb_player = await db_post(
                'mlbplayers/one',
                payload={
                    'first_name': row['use_name'],
                    'last_name': row['last_name'],
                    'key_mlbam': row['key_mlbam'],
                    'key_fangraphs': row['key_fangraphs'],
                    'key_bbref': bbref_id,
                    'key_retro': row['key_retro']
                }
            )

        rating = bat_rat_df.loc[bbref_id]
        player_payload = {
            'p_name': full_name,
            'cost': f'{rating["cost"]}',
            # Placeholder; replaced by the patch below once the player id is known.
            'image': 'change-me',
            'mlbclub': CLUB_LIST[row['Tm']],
            'franchise': FRANCHISE_LIST[row['Tm']],
            'cardset_id': CARDSET_ID,
            'set_num': int(float(row['key_fangraphs'])),
            'rarity_id': int(rating['rarity_id']),
            'description': PLAYER_DESCRIPTION,
            'bbref_id': bbref_id,
            'fangr_id': row['key_fangraphs'],
            'mlbplayer_id': mlb_player['id']
        }

        try:
            ranked_positions = def_rat_df.loc[bbref_id].sort_values(by='innings', ascending=False)
            for slot, (_, pos_row) in enumerate(ranked_positions.iterrows(), start=1):
                player_payload[f'pos_{slot}'] = pos_row.position
        except KeyError:
            logging.info(f'No positions found for {full_name}')
            player_payload['pos_1'] = 'DH'
        except TypeError:
            # A single matching row comes back from .loc as a Series, whose
            # sort_values() does not accept ``by``.
            logging.info(f'Only one position found for {full_name}')
            player_payload['pos_1'] = def_rat_df.loc[bbref_id].position

        new_player = await db_post('players', payload=player_payload)
        player_id = new_player['id'] if 'id' in new_player else new_player['player_id']

        image_url = f'{CARD_BASE_URL}{player_id}/battingcard{urllib.parse.quote("?d=")}{RELEASE_DIRECTORY}'
        await db_patch('players', object_id=player_id, params=[('image', image_url)])

        all_players.append(new_player)

    players_df = pd.DataFrame(all_players).set_index('bbref_id')
    return players_df
async def post_batting_cards(cards_df: pd.DataFrame):
    """Upload the batting cards, then read back the cardset's cards.

    Args:
        cards_df: one row per card with ``player_id``, the steal fields,
            ``bunt``, ``hit_and_run``, ``running`` and ``hand``.

    Returns:
        DataFrame of the cards stored for ``CARDSET_ID``, indexed by
        ``key_bbref``, with ``player_id`` and ``battingcard_id`` columns added.

    Raises:
        ValueError: if the upload returns nothing, or if the read-back query
            reports no cards for the cardset.
    """
    all_cards = [
        {
            'player_id': int(card["player_id"]),
            'steal_low': card['steal_low'],
            'steal_high': card['steal_high'],
            'steal_auto': card['steal_auto'],
            'steal_jump': card['steal_jump'],
            'bunting': card['bunt'],
            'hit_and_run': card['hit_and_run'],
            'running': card['running'],
            'hand': card['hand']
        }
        for _, card in cards_df.iterrows()
    ]

    resp = await db_put('battingcards', payload={'cards': all_cards}, timeout=6)
    if resp is None:
        e_msg = 'Unable to post batting cards'
        logging.debug(e_msg, stack_info=True)
        raise ValueError(e_msg)

    bc_query = await db_get('battingcards', params=[('cardset_id', CARDSET_ID)])
    if not bc_query['count'] > 0:
        # Fail with a clear message; there is nothing to build the result from.
        e_msg = f'No batting cards returned for cardset {CARDSET_ID}'
        logging.error(e_msg)
        raise ValueError(e_msg)

    bc_data = bc_query['cards']
    for line in bc_data:
        # Flatten the ids that later merges key on.
        line['player_id'] = line['player']['player_id']
        line['key_bbref'] = line['player']['bbref_id']
        line['battingcard_id'] = line['id']

    return pd.DataFrame(bc_data).set_index('key_bbref')
async def post_batting_ratings(ratings_df: pd.DataFrame):
    """Upload the vs-left and vs-right ratings of every row in ``ratings_df``.

    Each row's ``ratings_vL`` and ``ratings_vR`` dicts are stamped in place
    with the row's ``player_id`` and ``battingcard_id`` and sent in one
    ``battingcardratings`` request.

    Returns True on success; raises ValueError when the request returns nothing.
    """
    collected = []

    def collect(row):
        for side in ('ratings_vL', 'ratings_vR'):
            rating = row[side]
            rating['player_id'] = row['player_id']
            rating['battingcard_id'] = row['battingcard_id']
            collected.append(rating)

    ratings_df.apply(collect, axis=1)

    resp = await db_put('battingcardratings', payload={'ratings': collected}, timeout=6)
    if resp is None:
        e_msg = 'Unable to post batting ratings'
        logging.debug(e_msg, stack_info=True)
        raise ValueError(e_msg)
    return True
async def post_positions(pos_df: pd.DataFrame):
    """Upload every position row in ``pos_df`` in one ``cardpositions`` request.

    Missing values are dropped from each row before it is converted to a dict,
    and ``player_id`` is sent as an int.

    Returns True on success; raises ValueError when the request returns nothing.
    """
    collected = []

    def collect(row):
        entry = row.dropna().to_dict()
        entry['player_id'] = int(row['player_id'])
        collected.append(entry)

    pos_df.apply(collect, axis=1)

    resp = await db_put('cardpositions', payload={'positions': collected}, timeout=6)
    if resp is None:
        e_msg = 'Unable to post positions'
        logging.debug(e_msg, stack_info=True)
        raise ValueError(e_msg)
    return True
async def post_batter_data(bs: pd.DataFrame, bc: pd.DataFrame, br: pd.DataFrame, dr: pd.DataFrame) -> int:
    """Post players, batting cards, batting ratings and positions, in that order.

    Args:
        bs: batting stats, one row per batter.
        bc: batting cards indexed by ``key_bbref``.
        br: batting ratings indexed by ``key_bbref``.
        dr: defensive position rows indexed by ``key_bbref``.

    Returns:
        Number of player records fetched or created.
    """
    all_players = await get_or_post_players(bs, bc, br, dr)

    # Post Batting Cards
    bc = pd.merge(
        left=bc,
        right=all_players,
        how='right',  # 'left', TODO: switch back to left when all players are pulled
        left_on='key_bbref',
        right_on='bbref_id'
    )
    # Rebound to the cards as stored, which carry the battingcard_id used next.
    bc = await post_batting_cards(bc)

    # Post Batting Ratings
    br = pd.merge(
        left=br,
        right=bc,
        how='right',  # 'left', TODO: switch back to left when all players are pulled
        left_on='key_bbref',
        right_on='key_bbref'
    )
    await post_batting_ratings(br)

    # Post Positions
    dr = pd.merge(
        left=dr,
        right=all_players,
        how='right',  # 'left',
        left_on='key_bbref',
        right_on='bbref_id'
    )
    await post_positions(dr)

    return len(all_players)
async def run_batters(data_input_path: str, start_date: int, end_date: int, post_data: bool = False, season_pct: float = 1.0):
    """Run the whole batter pipeline and return the combined batting stats.

    Batting stats for the date range are merged with the running stats found
    under ``data_input_path``, then batting cards, batter ratings and defensive
    positions are calculated. The results are posted to the database only when
    ``post_data`` is True. Each stage prints how long it took.

    Raises:
        DataMismatchError: if merging in the running stats changes the row count.
    """
    def seconds_since(moment):
        return (datetime.datetime.now() - moment).total_seconds()

    print(f'Running the batter calcs...')
    batter_start = datetime.datetime.now()

    # Get batting stats
    batting_stats = get_batting_stats_by_date(
        f'{RETRO_FILE_PATH}{EVENTS_FILENAME}', start_date=start_date, end_date=end_date
    )
    bs_len = len(batting_stats)
    print(f'Combined batting stats: {seconds_since(batter_start):.2f}s\n')

    # Get running stats
    running_start = datetime.datetime.now()
    running_stats = get_run_stat_df(data_input_path)
    batting_stats = pd.merge(
        left=batting_stats,
        right=running_stats,
        how='left',
        left_on='key_bbref',
        right_on='key_bbref'
    )
    print(f'Running stats: {seconds_since(running_start):.2f}s')

    # A left merge should never add or remove batters.
    merged_len = len(batting_stats)
    if merged_len != bs_len:
        raise DataMismatchError(f'retrosheet_data - run_batters - We started with {bs_len} batting lines and have {merged_len} after merging with running_stats')

    # Calculate batting cards
    card_start = datetime.datetime.now()
    all_batting_cards = calc_batting_cards(batting_stats, season_pct)
    print(f'Create batting cards: {seconds_since(card_start):.2f}s')

    # Calculate batting ratings
    rating_start = datetime.datetime.now()
    batting_stats['battingcard_id'] = batting_stats['key_fangraphs']
    all_batting_ratings = calc_batter_ratings(batting_stats)
    print(f'Create batting ratings: {seconds_since(rating_start):.2f}s')

    # Calculate defense ratings
    defense_start = datetime.datetime.now()
    all_defense_ratings = calc_positions(batting_stats)
    print(f'Create defense ratings: {seconds_since(defense_start):.2f}s')

    # Post all data
    if not post_data:
        post_msg = f'Players are NOT being posted to the database'
        logging.warning(post_msg)
        print(post_msg)
        return batting_stats

    print(f'Posting player data...')
    post_start = datetime.datetime.now()
    num_players = await post_batter_data(batting_stats, all_batting_cards, all_batting_ratings, all_defense_ratings)
    print(f'Post player data: {seconds_since(post_start)}s')

    post_msg = f'Posted {num_players} players to the database'
    logging.info(post_msg)
    print(post_msg)
    return batting_stats
async def main(args):
    """Run the batter pipeline for 1998-01-01 through 1998-04-30 and post the results.

    The batting stats are also written to ``batting_stats.csv``. ``args`` is
    accepted but not used. No pitcher work is done yet, so the pitcher timing
    only measures two back-to-back clock reads.
    """
    batter_start = datetime.datetime.now()
    batting_stats = await run_batters(
        DATA_INPUT_FILE_PATH,
        start_date=19980101,
        end_date=19980430,
        post_data=True,
        season_pct=0.16666667,
    )
    batting_stats.to_csv('batting_stats.csv')
    batter_end = datetime.datetime.now()

    pitcher_start = datetime.datetime.now()
    pitcher_end = datetime.datetime.now()

    batter_seconds = (batter_end - batter_start).total_seconds()
    pitcher_seconds = (pitcher_end - pitcher_start).total_seconds()
    total_seconds = (pitcher_end - batter_start).total_seconds()
    print(f'\n\nBatter time: {batter_seconds:.2f}s \nPitcher time: {pitcher_seconds:.2f}s\nTotal: {total_seconds:.2f}s\n\nDone!')

    # Uncomment to refresh the defense_<position>.csv input files:
    # await store_defense_to_csv(1998)
# Script entry point: run the async main() with the command-line arguments.
if __name__ == '__main__':
    asyncio.run(main(sys.argv[1:]))