"""Build batter card data for a cardset from Retrosheet play-by-play events.

The pipeline reads a Retrosheet events CSV, derives per-batter platoon splits
(vs. left- and right-handed pitchers), merges baserunning and fielding data
read from CSV files, computes card/rating/defense values through the project
helpers in ``batters.calcs_batter`` and ``defenders.calcs_defense``, and
optionally posts the resulting players to the database API.
"""
import asyncio
import datetime
import logging
import sys
import urllib.parse
from typing import Literal

import pandas as pd
import pybaseball as pb
from pybaseball import cache

from creation_helpers import get_args, CLUB_LIST, FRANCHISE_LIST
from batters.stat_prep import DataMismatchError
from db_calls import DB_URL, db_get, db_patch, db_post
import batters.calcs_batter as cba
import defenders.calcs_defense as cde

cache.enable()

# Take a single timestamp so the log file name cannot straddle midnight.
_today = datetime.datetime.now()
date = f'{_today.year}-{_today.month}-{_today.day}'
log_level = logging.INFO
logging.basicConfig(
    filename=f'logs/{date}.log',
    format='%(asctime)s - retrosheet_data - %(levelname)s - %(message)s',
    level=log_level
)

RETRO_FILE_PATH = 'data-input/retrosheet/'
# Removed last few columns which were throwing dtype errors
EVENTS_FILENAME = 'retrosheets_events_1998_short.csv'
PERSONNEL_FILENAME = 'retrosheets_personnel.csv'
DATA_INPUT_FILE_PATH = 'data-input/1998 Season Cardset/'
CARD_BASE_URL = f'{DB_URL}/v2/players/'

start_time = datetime.datetime.now()
RELEASE_DIRECTORY = f'{start_time.year}-{start_time.month}-{start_time.day}'

# Minimum plate appearances (batters) / batters faced (pitchers) per split.
MIN_PA_VL = 20
MIN_PA_VR = 40
MIN_TBF_VL = MIN_PA_VL
MIN_TBF_VR = MIN_PA_VR

CARDSET_ID = 20
PLAYER_DESCRIPTION = 'Live'


def _seconds_since(since: datetime.datetime) -> float:
    """Return the number of seconds elapsed since *since*."""
    return (datetime.datetime.now() - since).total_seconds()


async def store_defense_to_csv(season: int):
    """Fetch fielding data for each position and write one CSV per position.

    Files are written to ``DATA_INPUT_FILE_PATH`` as ``defense_<pos>.csv``.
    The coroutine sleeps 8 seconds after each position.
    """
    for position in ['c', '1b', '2b', '3b', 'ss', 'lf', 'cf', 'rf', 'of', 'p']:
        pos_df = cde.get_bbref_fielding_df(position, season)
        pos_df.to_csv(f'{DATA_INPUT_FILE_PATH}defense_{position}.csv')
        await asyncio.sleep(8)


def get_events_by_date(file_path: str, start_date: int, end_date: int) -> pd.DataFrame:
    """Load the events CSV and keep plays dated within [start_date, end_date].

    The date is parsed from ``game_id`` characters 3 through the
    second-to-last and compared as an integer (callers pass values such as
    ``19980101``). A ``date`` column is added to the returned frame.
    """
    all_plays = pd.read_csv(file_path, dtype={'game_id': 'str'})
    all_plays['date'] = all_plays['game_id'].str[3:-1].astype(int)
    in_range = (all_plays['date'] >= start_date) & (all_plays['date'] <= end_date)
    return all_plays[in_range]


def get_result_series(plays: pd.DataFrame, event_type: str, pitcher_hand: Literal['r', 'l'], col_name: str) -> pd.Series:
    """Count plays of *event_type* against *pitcher_hand* for each batter.

    Returns an integer Series indexed by ``batter_id`` and named *col_name*.
    Batters with no matching plays are absent from the result.
    """
    matching = plays[(plays.event_type == event_type) & (plays.pitcher_hand == pitcher_hand)]
    return matching.groupby('batter_id').count()['event_type'].astype(int).rename(col_name)


def get_run_stat_df(input_path: str):
    """Read ``running.csv`` from *input_path* as a frame indexed by key_bbref.

    Either ``Player-additional`` or ``Name-additional`` is accepted as the
    source of the ``key_bbref`` column. Missing values are replaced with 0.
    """
    run_data = pd.read_csv(f'{input_path}running.csv')

    for source_col in ('Player-additional', 'Name-additional'):
        if source_col in run_data:
            run_data = run_data.rename(columns={source_col: 'key_bbref'})

    run_data = run_data[[
        'key_bbref', 'Tm', 'ROE', 'XI', 'RS%', 'SBO', 'SB', 'CS', 'SB%', 'SB2', 'CS2', 'SB3', 'CS3', 'SBH', 'CSH',
        'PO', 'PCS', 'OOB', 'OOB1', 'OOB2', 'OOB3', 'OOBHm', 'BT', 'XBT%', '1stS', '1stS2', '1stS3', '1stD', '1stD3',
        '1stDH', '2ndS', '2ndS3', '2ndSH'
    ]]
    run_data = run_data.fillna(0)
    return run_data.set_index('key_bbref')


def get_player_ids(plays: pd.DataFrame, which: Literal['batters', 'pitchers']) -> pd.DataFrame:
    """Build a player frame with names and cross-reference ids for *plays*.

    The unique ``batter_id`` / ``pitcher_id`` values in *plays* are joined to
    the Retrosheet personnel file for names and looked up one at a time with
    ``pybaseball.playerid_reverse_lookup`` for the mlbam/bbref/fangraphs ids.
    The result is indexed by the Retrosheet id. For ``which='batters'`` a
    ``bat_hand`` column ('L', 'R' or 'S') is derived from the plays.

    Players that the pybaseball lookup cannot resolve are reported and left
    out of the result.
    """
    retro_players = pd.read_csv(f'{RETRO_FILE_PATH}{PERSONNEL_FILENAME}')
    id_key = 'batter_id' if which == 'batters' else 'pitcher_id'
    id_cols = ['key_mlbam', 'key_retro', 'key_bbref', 'key_fangraphs']

    unique_players = pd.Series(plays[id_key].unique()).to_frame('id')
    players = pd.merge(
        left=retro_players,
        right=unique_players,
        how='right',
        on='id'
    ).rename(columns={'id': id_key})

    def get_pids(row):
        pull = pb.playerid_reverse_lookup([row[id_key]], key_type='retro')
        if len(pull.values) == 0:
            print(f'Could not find id {row[id_key]} in pybaseball lookup')
            logging.warning(f'Could not find id {row[id_key]} in pybaseball lookup')
            # Previously this fell through to pull.loc[0] and raised KeyError.
            # An all-empty row never matches in the inner merge below, so the
            # unresolved player is simply dropped.
            return pd.Series([None] * len(id_cols), index=id_cols)
        return pull.loc[0][id_cols]

    players = players[[id_key, 'last_name', 'use_name']]
    lookup_start = datetime.datetime.now()
    other_ids = players.apply(get_pids, axis=1)
    print(f'ID lookup: {_seconds_since(lookup_start):.2f}s')

    players = pd.merge(
        left=players,
        right=other_ids,
        left_on=id_key,
        right_on='key_retro'
    )
    players = players.set_index(id_key)

    def get_bat_hand(row):
        """Classify a batter as 'L', 'R' or 'S' from the side he batted on."""
        batter_plays = plays[plays.batter_id == row['key_retro']]
        pa_vl = batter_plays[batter_plays.pitcher_hand == 'l'] \
            .groupby('result_batter_hand').count()['game_id'].astype(int)
        pa_vr = batter_plays[batter_plays.pitcher_hand == 'r'] \
            .groupby('result_batter_hand').count()['game_id'].astype(int)

        as_lefty = pa_vl.get('l', 0) + pa_vr.get('l', 0)
        as_righty = pa_vl.get('r', 0) + pa_vr.get('r', 0)

        if as_lefty == 0 and as_righty > 0:
            return 'R'
        if as_lefty > 0 and as_righty == 0:
            return 'L'

        # Batted from both sides (or not at all): with fewer than 10 plays go
        # with the majority side, ties going to 'R'; otherwise a switch hitter.
        if as_lefty + as_righty < 10:
            return 'L' if as_lefty > as_righty else 'R'
        return 'S'

    if which == 'batters':
        players['bat_hand'] = players.apply(get_bat_hand, axis=1)

    return players


def get_base_batting_df(all_plays: pd.DataFrame) -> pd.DataFrame:
    """Return qualifying batters with PA and AB counts against each hand.

    Starts from ``get_player_ids`` and appends ``PA_vL``, ``PA_vR``,
    ``AB_vL`` and ``AB_vR``. Rows with any missing value are dropped, then
    batters below ``MIN_PA_VL`` / ``MIN_PA_VR`` are filtered out.
    """
    bs = get_player_ids(all_plays, 'batters')

    for flag_col, hand, col_name in [
        ('batter_event', 'l', 'PA_vL'),
        ('batter_event', 'r', 'PA_vR'),
        ('ab', 'l', 'AB_vL'),
        ('ab', 'r', 'AB_vR'),
    ]:
        flagged = all_plays[(all_plays[flag_col] == 't') & (all_plays.pitcher_hand == hand)]
        counts = flagged.groupby('batter_id').count()['event_type'].astype(int).rename(col_name)
        bs = pd.concat([bs, counts], axis=1)

    return bs.dropna().query(f'PA_vL >= {MIN_PA_VL} & PA_vR >= {MIN_PA_VR}')


def get_batting_stats_by_date(retro_file_path, start_date: int, end_date: int) -> pd.DataFrame:
    """Compute platoon-split batting stats for plays in the given date range.

    Returns one row per qualifying batter with raw counts (hits, walks,
    strikeouts, batted-ball types, double plays, bunts, hit locations) and
    derived rates (AVG/OBP/SLG, batted-ball and contact-quality percentages,
    spray percentages), each split ``_vL`` / ``_vR``. Missing values are
    filled with 0.
    """
    start = datetime.datetime.now()
    all_plays = get_events_by_date(retro_file_path, start_date, end_date)
    print(f'Pull events: {_seconds_since(start):.2f}s')

    start = datetime.datetime.now()
    batting_stats = get_base_batting_df(all_plays)
    print(f'Get base dataframe: {_seconds_since(start):.2f}s')

    start = datetime.datetime.now()
    all_player_ids = batting_stats['key_retro']
    all_plays = all_plays[all_plays['batter_id'].isin(all_player_ids)]
    print(f'Shrink all_plays: {_seconds_since(start):.2f}s')

    # Basic counting stats
    start = datetime.datetime.now()
    for event_type, vs_hand, col_name in [
        ('home run', 'r', 'HR_vR'),
        ('home run', 'l', 'HR_vL'),
        ('single', 'r', '1B_vR'),
        ('single', 'l', '1B_vL'),
        ('double', 'r', '2B_vR'),
        ('double', 'l', '2B_vL'),
        ('triple', 'r', '3B_vR'),
        ('triple', 'l', '3B_vL'),
        ('walk', 'r', 'BB_vR'),
        ('walk', 'l', 'BB_vL'),
        ('strikeout', 'r', 'SO_vR'),
        ('strikeout', 'l', 'SO_vL'),
        ('hit by pitch', 'r', 'HBP_vR'),
        ('hit by pitch', 'l', 'HBP_vL')
    ]:
        batting_stats[col_name] = get_result_series(all_plays, event_type, vs_hand, col_name)
    print(f'Count basic stats: {_seconds_since(start):.2f}s')

    # Bespoke counting stats
    start = datetime.datetime.now()

    def count_by_batter(mask: pd.Series) -> pd.Series:
        """Count plays selected by *mask* per batter; 0 when a batter has none.

        One grouped pass replaces a full scan of all_plays for every batter.
        """
        counts = all_plays[mask].groupby('batter_id')['event_type'].count()
        return batting_stats['key_retro'].map(counts).fillna(0).astype(int)

    vs_hand = {'L': all_plays['pitcher_hand'] == 'l', 'R': all_plays['pitcher_hand'] == 'r'}
    is_batter_event = all_plays['batter_event'] == 't'

    # NOTE(review): ground balls are matched on upper-case 'G' while fly balls
    # and line drives use lower-case 'f' / 'l' - confirm against the events file.
    for ball_code, stat in [('f', 'FB'), ('G', 'GB'), ('l', 'LD')]:
        is_ball_type = all_plays['batted_ball_type'] == ball_code
        for hand in ('L', 'R'):
            batting_stats[f'{stat}_v{hand}'] = count_by_batter(is_ball_type & vs_hand[hand])

    # GDP is the sum of plays flagged as double plays and as triple plays.
    for hand in ('L', 'R'):
        in_split = is_batter_event & vs_hand[hand]
        batting_stats[f'GDP_v{hand}'] = (
            count_by_batter(in_split & (all_plays['dp'] == 't'))
            + count_by_batter(in_split & (all_plays['tp'] == 't'))
        )

    batting_stats['Bunts'] = count_by_batter(all_plays['bunt'] == 't')
    print(f'Custom counting stats: {_seconds_since(start):.2f}s')

    # Hit-location masks. na=False keeps each mask strictly boolean when a
    # play has no recorded location.
    loc_infield = all_plays['hit_location'].str.contains('1|2|3|4|5|6', na=False)
    loc_deep = all_plays['hit_location'].str.contains('D', na=False)
    loc_57 = all_plays['hit_location'].str.contains('5|7', na=False)
    loc_39 = all_plays['hit_location'].str.contains('3|9', na=False)
    loc_center = all_plays['hit_location'].str.contains('1|4|6|8', na=False)
    is_base_hit = all_plays['hit_val'].str.contains('1|2|3', na=False)

    def grouped_count(mask: pd.Series, name: str) -> pd.Series:
        return all_plays[mask].groupby('batter_id').count()['event_type'].astype(int).rename(name)

    # Infield Hit %
    batting_stats['ifh_vL'] = grouped_count(is_base_hit & vs_hand['L'] & loc_infield & ~loc_deep, 'ifh_vL')
    batting_stats['ifh_vR'] = grouped_count(is_base_hit & vs_hand['R'] & loc_infield & ~loc_deep, 'ifh_vR')

    # Pull side is '5|7' when batting right-handed and '3|9' when batting
    # left-handed. Switch hitters ('S') are treated as batting right against
    # left-handed pitchers and left against right-handed pitchers.
    bats_left = batting_stats['bat_hand'] == 'L'
    bats_right = batting_stats['bat_hand'] == 'R'
    batting_stats['pull_vL'] = count_by_batter(vs_hand['L'] & loc_57).where(
        ~bats_left, count_by_batter(vs_hand['L'] & loc_39)
    )
    batting_stats['pull_vR'] = count_by_batter(vs_hand['R'] & loc_57).where(
        bats_right, count_by_batter(vs_hand['R'] & loc_39)
    )

    batting_stats['center_vL'] = grouped_count(vs_hand['L'] & loc_center, 'center_vl')
    batting_stats['center_vR'] = grouped_count(vs_hand['R'] & loc_center, 'center_vr')

    # NOTE(review): opposite field is counted from '5|7' for every batter,
    # which equals the pull side whenever the batter is hitting right-handed.
    # Left unchanged because it feeds the Pull%/Cent% denominators - confirm
    # whether '3|9' was intended for right-handed plate appearances.
    batting_stats['oppo_vL'] = grouped_count(vs_hand['L'] & loc_57, 'oppo_vL')
    batting_stats['oppo_vR'] = grouped_count(vs_hand['R'] & loc_57, 'oppo_vR')

    # fill na to 0 following counting stats
    batting_stats = batting_stats.fillna(0)

    # Calculated Fields
    start = datetime.datetime.now()

    def add_split(name: str, formula) -> None:
        """Add '<name>_vL' then '<name>_vR' from a single formula.

        *formula* receives an accessor mapping a stat prefix to that hand's
        column, so both sides of a split are always computed from the same
        expression.
        """
        for hand in ('L', 'R'):
            def col(stat, _hand=hand):
                return batting_stats[f'{stat}_v{_hand}']
            batting_stats[f'{name}_v{hand}'] = formula(col)

    def medium_contact(row, hand):
        high = 0.9 - row[f'Hard%_v{hand}']
        low = (row[f'SLG_v{hand}'] - row[f'AVG_v{hand}']) * 1.5
        return round(max(min(high, low), 0.1), 5)

    add_split('H', lambda c: c('1B') + c('2B') + c('3B') + c('HR'))
    add_split('AVG', lambda c: round(c('H') / c('AB'), 5))
    add_split('OBP', lambda c: round((c('H') + c('BB') + c('HBP')) / c('PA'), 5))
    add_split('SLG', lambda c: round((c('1B') + c('2B') * 2 + c('3B') * 3 + c('HR') * 4) / c('AB'), 5))

    add_split('HR/FB', lambda c: round(c('HR') / c('FB'), 5))

    add_split('FB%', lambda c: round(c('FB') / (c('FB') + c('GB') + c('LD')), 5))
    add_split('GB%', lambda c: round(c('GB') / (c('FB') + c('GB') + c('LD')), 5))
    add_split('LD%', lambda c: round(c('LD') / (c('FB') + c('GB') + c('LD')), 5))

    add_split('Hard%', lambda c: round(0.2 + c('SLG') - c('AVG'), 5))
    for hand in ('L', 'R'):
        batting_stats[f'Med%_v{hand}'] = batting_stats.apply(medium_contact, axis=1, args=(hand,))
    add_split('Soft%', lambda c: round(1 - c('Hard%') - c('Med%'), 5))

    add_split('IFH%', lambda c: round(c('ifh') / c('H'), 5))

    add_split('Pull%', lambda c: round(c('pull') / (c('pull') + c('center') + c('oppo')), 5))
    # Cent%_vR previously divided center_vL by the vR denominator.
    add_split('Cent%', lambda c: round(c('center') / (c('pull') + c('center') + c('oppo')), 5))
    add_split('Oppo%', lambda c: round(1 - c('Pull%') - c('Cent%'), 5))

    batting_stats = batting_stats.fillna(0)

    print(f'Calculated fields: {_seconds_since(start):.2f}s')

    return batting_stats


def calc_batting_cards(bs: pd.DataFrame) -> pd.DataFrame:
    """Build one batting-card row per batter, indexed by ``key_bbref``.

    Steal, hit-and-run and running values come from ``batters.calcs_batter``;
    ``bunt`` is always 0 and ``hand`` is the batter's ``bat_hand``.
    """
    def create_batting_card(row):
        steal_low, steal_high, steal_auto, steal_jump = cba.stealing(
            chances=int(row['SBO']),
            sb2s=int(row['SB2']),
            cs2s=int(row['CS2']),
            sb3s=int(row['SB3']),
            cs3s=int(row['CS3']),
            season_pct=1.0
        )[:4]
        card = pd.DataFrame({
            'key_bbref': [row['key_bbref']],
            'steal_low': [steal_low],
            'steal_high': [steal_high],
            'steal_auto': [steal_auto],
            'steal_jump': [steal_jump],
            'hit_and_run': [cba.hit_and_run(
                row['AB_vL'], row['AB_vR'], row['H_vL'], row['H_vR'],
                row['HR_vL'], row['HR_vR'], row['SO_vL'], row['SO_vR']
            )],
            'bunt': [0],
            'running': [cba.running(row['XBT%'])],
            'hand': [row['bat_hand']],
        })
        return card.loc[0]

    all_cards = bs.apply(create_batting_card, axis=1)
    return all_cards.set_index('key_bbref')


def calc_batter_ratings(bs: pd.DataFrame) -> pd.DataFrame:
    """Compute ratings, OPS values, rarity and cost for each batter.

    ``total_ops`` averages the two split OPS values with the lower one counted
    twice. Rarity and cost are chosen from fixed ``total_ops`` tiers. The
    result is indexed by ``key_bbref``.
    """
    # (minimum total_ops, rarity_id, base_cost, base_ops, max_delta), best first.
    rarity_tiers = [
        (1.2, 99, 2400, 1.215, 810),
        (1.0, 1, 810, 1.05, 270),
        (0.9, 2, 270, 0.95, 90),
        (0.8, 3, 90, 0.85, 30),
        (0.7, 4, 30, 0.75, 10),
    ]
    lowest_tier = (5, 10, 0.61, 8)

    def calc_cost(total_ops, base_cost, base_ops, max_delta) -> int:
        delta = ((total_ops - base_ops) / 0.1) * 2
        if delta < 1:
            delta = (max_delta * (1 - (total_ops / base_ops))) * -0.1

        final_cost = base_cost + (max_delta * delta)

        return round(final_cost)

    def create_batting_rating(row):
        ratings = cba.get_batter_ratings(row)
        ops_vl = ratings[0]['obp'] + ratings[0]['slg']
        ops_vr = ratings[1]['obp'] + ratings[1]['slg']
        total_ops = (ops_vl + ops_vr + min(ops_vr, ops_vl)) / 3

        rarity_id, base_cost, base_ops, max_delta = lowest_tier
        for floor, tier_rarity, tier_cost, tier_ops, tier_delta in rarity_tiers:
            if total_ops >= floor:
                rarity_id, base_cost, base_ops, max_delta = tier_rarity, tier_cost, tier_ops, tier_delta
                break
        cost = calc_cost(total_ops, base_cost=base_cost, base_ops=base_ops, max_delta=max_delta)

        rating = pd.DataFrame({
            'key_bbref': [row['key_bbref']],
            'ratings_vL': [ratings[0]],
            'ratings_vR': [ratings[1]],
            'ops_vL': ops_vl,
            'ops_vR': ops_vr,
            'total_ops': total_ops,
            'rarity_id': rarity_id,
            'cost': cost
        })
        return rating.loc[0]

    all_ratings = bs.apply(create_batting_rating, axis=1)
    return all_ratings.set_index('key_bbref')


def calc_positions(bs: pd.DataFrame) -> pd.DataFrame:
    """Build defensive ratings for each batter from the per-position CSVs.

    Returns one row per (player, position), indexed by ``key_bbref``. A
    position is rated only with at least 10.0 innings (``Inn_def``). Players
    for whom nothing was rated get a single 'DH' row. Failures while rating a
    position are logged and that position is skipped.
    """
    fielding = {
        pos: pd.read_csv(f'{DATA_INPUT_FILE_PATH}defense_{pos}.csv').set_index('key_bbref')
        for pos in ['c', '1b', '2b', '3b', 'ss', 'lf', 'cf', 'rf', 'of']
    }
    df_c = fielding['c']
    df_of = fielding['of']
    season_pct = 1.0
    all_pos = []

    def average_range_for(row, pos_df):
        key = row['key_bbref']
        # NOTE(review): this tests the batting row, not pos_df, for
        # 'tz_runs_total', so with the frames built in this file the blended
        # branch never runs - confirm whether `'bis_runs_total' in pos_df`
        # was intended.
        if 'tz_runs_total' in row:
            tz_runs = int(pos_df.at[key, 'tz_runs_total'])
            bis_runs = int(pos_df.at[key, 'bis_runs_total'])
            return (tz_runs + bis_runs + min(tz_runs, bis_runs)) / 3
        return pos_df.at[key, 'tz_runs_total']

    def process_pos(row):
        key = row['key_bbref']
        no_data = True
        # TODO: Add pos_1 through pos_8 to def df to be pulled in at post time
        for position in ['1b', '2b', '3b', 'ss']:
            pos_df = fielding[position]
            if key not in pos_df.index:
                continue
            logging.info(f'Running {position} stats for {row["use_name"]} {row["last_name"]}')
            try:
                average_range = average_range_for(row, pos_df)

                if float(pos_df.at[key, 'Inn_def']) >= 10.0:
                    all_pos.append({
                        "key_bbref": key,
                        "position": position.upper(),
                        "innings": float(pos_df.at[key, 'Inn_def']),
                        "range": cde.get_if_range(
                            pos_code=position,
                            tz_runs=round(average_range),
                            r_dp=0,
                            season_pct=season_pct
                        ),
                        "error": cde.get_any_error(
                            pos_code=position,
                            errors=int(pos_df.at[key, 'E_def']),
                            chances=int(pos_df.at[key, 'chances']),
                            season_pct=season_pct
                        )
                    })
                    no_data = False
            except Exception as e:
                logging.info(f'Infield position failed: {e}')

        of_arms = []
        of_payloads = []
        for position in ['lf', 'cf', 'rf']:
            pos_df = fielding[position]
            if key not in pos_df.index:
                continue
            try:
                average_range = average_range_for(row, pos_df)

                if float(pos_df.at[key, 'Inn_def']) >= 10.0:
                    of_payloads.append({
                        "key_bbref": key,
                        "position": position.upper(),
                        "innings": float(pos_df.at[key, 'Inn_def']),
                        "range": cde.get_of_range(
                            pos_code=position,
                            tz_runs=round(average_range),
                            season_pct=season_pct
                        )
                    })
                    of_run_rating = 'bis_runs_outfield' if 'bis_runs_outfield' in pos_df else 'tz_runs_outfield'
                    of_arms.append(int(pos_df.at[key, of_run_rating]))
                    no_data = False
            except Exception as e:
                logging.info(f'Outfield position failed: {e}')

        # Outfield error and arm are rated once from the combined 'of' file
        # and applied to every outfield position collected above.
        if key in df_of.index and len(of_arms) > 0 and len(of_payloads) > 0:
            try:
                error_rating = cde.get_any_error(
                    # NOTE(review): `position` is left over from the loop above
                    # and is always 'rf' here - confirm the intended pos_code.
                    pos_code=position,
                    errors=int(df_of.at[key, 'E_def']),
                    chances=int(df_of.at[key, 'chances']),
                    season_pct=season_pct
                )
                arm_rating = cde.arm_outfield(of_arms)
                for f in of_payloads:
                    f['error'] = error_rating
                    f['arm'] = arm_rating
                    all_pos.append(f)
                no_data = False
            except Exception as e:
                logging.info(f'Outfield position failed: {e}')

        if key in df_c.index:
            try:
                run_rating = 'bis_runs_catcher_sb' if 'bis_runs_catcher_sb' in df_c else 'tz_runs_catcher'

                # No stolen-base attempts against: use a fixed arm rating of 3.
                if df_c.at[key, 'SB'] + df_c.at[key, 'CS'] == 0:
                    arm_rating = 3
                else:
                    arm_rating = cde.arm_catcher(
                        cs_pct=df_c.at[key, 'caught_stealing_perc'],
                        raa=int(df_c.at[key, run_rating]),
                        season_pct=season_pct
                    )

                if float(df_c.at[key, 'Inn_def']) >= 10.0:
                    all_pos.append({
                        "key_bbref": key,
                        "position": 'C',
                        "innings": float(df_c.at[key, 'Inn_def']),
                        "range": cde.range_catcher(
                            rs_value=int(df_c.at[key, 'tz_runs_catcher']),
                            season_pct=season_pct
                        ),
                        "error": cde.get_any_error(
                            pos_code='c',
                            errors=int(df_c.at[key, 'E_def']),
                            chances=int(df_c.at[key, 'chances']),
                            season_pct=season_pct
                        ),
                        "arm": arm_rating,
                        "pb": cde.pb_catcher(
                            pb=int(df_c.at[key, 'PB']),
                            innings=int(float(df_c.at[key, 'Inn_def'])),
                            season_pct=season_pct
                        ),
                        "overthrow": cde.ot_catcher(
                            errors=int(df_c.at[key, 'E_def']),
                            chances=int(df_c.at[key, 'chances']),
                            season_pct=season_pct
                        )
                    })
                    no_data = False
            except Exception as e:
                logging.info(f'Catcher position failed: {e}')

        if no_data:
            all_pos.append({
                "key_bbref": key,
                "position": 'DH',
                "innings": row['PA_vL'] + row['PA_vR']
            })

    bs.apply(process_pos, axis=1)
    positions = pd.DataFrame(all_pos)
    return positions.set_index('key_bbref')


async def get_or_post_players(stat_df: pd.DataFrame, bat_card_df: pd.DataFrame, bat_rat_df: pd.DataFrame, def_rat_df: pd.DataFrame) -> pd.DataFrame:
    """Look up each batter in the database, creating the player when missing.

    Returns a frame with columns ``key_bbref`` and ``player_id`` for every
    batter processed. ``bat_card_df`` and ``def_rat_df`` are currently unused.

    NOTE(review): a development throttle stops the loop after the first
    batter - remove ``dev_count`` before a full run.
    """
    all_bbref_ids = []
    all_player_ids = []

    dev_count = 0
    for index, row in stat_df.iterrows():
        if dev_count > 0:
            break

        p_query = await db_get('players', params=[('key_bbref', row["key_bbref"]), ('cardset_id', CARDSET_ID)])
        if p_query['count'] > 0:
            this_record = p_query['players'][0]
            player_id = this_record['id'] if 'id' in this_record else this_record['player_id']
        else:
            mlb_query = await db_get('mlbplayers', params=[('key_retro', row['key_retro'])])
            if mlb_query['count'] > 0:
                mlb_player = mlb_query['players'][0]
            else:
                mlb_player = await db_post(
                    'mlbplayers/one',
                    payload={
                        'first_name': row['use_name'],
                        'last_name': row['last_name'],
                        'key_mlbam': row['key_mlbam'],
                        'key_fangraphs': row['key_fangraphs'],
                        'key_bbref': row['key_bbref'],
                        'key_retro': row['key_retro']
                    }
                )

            # This player's own rating row. The previous code interpolated the
            # entire cost column into a string and passed the payload as
            # db_post's first argument with no endpoint.
            rating = bat_rat_df.loc[row['key_bbref']]
            # NOTE(review): the 'players' endpoint and payload= keyword mirror
            # the 'mlbplayers/one' call above - confirm against db_calls.db_post.
            new_player = await db_post(
                'players',
                payload={
                    'p_name': f'{row["use_name"]} {row["last_name"]}',
                    'cost': int(rating['cost']),
                    'image': 'change-me',
                    'mlbclub': CLUB_LIST[row['Tm']],
                    'franchise': FRANCHISE_LIST[row['Tm']],
                    'cardset_id': CARDSET_ID,
                    'set_num': int(float(row['key_fangraphs'])),
                    'rarity_id': int(rating['rarity_id']),
                    # NOTE(review): nothing in this file adds 'pos_1' to the
                    # stat frame (see the TODO in calc_positions).
                    'pos_1': row['pos_1'],
                    'description': PLAYER_DESCRIPTION,
                    'bbref_id': row['key_bbref'],
                    'fangr_id': row['key_fangraphs'],
                    'mlbplayer_id': mlb_player['id']
                }
            )
            player_id = new_player['id'] if 'id' in new_player else new_player['player_id']

            # The card image URL needs the player id, so it is patched in
            # after the player has been created.
            await db_patch('players', object_id=player_id, params=[
                ('image', f'{CARD_BASE_URL}{player_id}/battingcard{urllib.parse.quote("?d=")}{RELEASE_DIRECTORY}')
            ])

        all_bbref_ids.append(row['key_bbref'])
        all_player_ids.append(player_id)
        dev_count += 1

    return pd.DataFrame({'key_bbref': all_bbref_ids, 'player_id': all_player_ids})


async def post_batter_data(bs: pd.DataFrame, bc: pd.DataFrame, br: pd.DataFrame, dr: pd.DataFrame) -> int:
    """Post batter data and return the number of players found or created.

    Only the player records are handled so far; cards, ratings and defense
    are passed through to ``get_or_post_players`` but not yet posted.
    """
    pd_ids = await get_or_post_players(bs, bc, br, dr)
    # The caller reports this count; previously the function returned None.
    return len(pd_ids)


async def run_batters(data_input_path: str, start_date: int, end_date: int, post_data: bool = False):
    """Run the full batter pipeline and return the merged batting stats.

    Raises:
        DataMismatchError: if merging the running stats changes the number of
            batting rows.
    """
    print('Running the batter calcs...')
    batter_start = datetime.datetime.now()

    # Get batting stats
    batting_stats = get_batting_stats_by_date(
        f'{RETRO_FILE_PATH}{EVENTS_FILENAME}', start_date=start_date, end_date=end_date
    )
    bs_len = len(batting_stats)
    print(f'Combined batting stats: {_seconds_since(batter_start):.2f}s\n')

    # Get running stats
    running_start = datetime.datetime.now()
    running_stats = get_run_stat_df(data_input_path)

    batting_stats = pd.merge(
        left=batting_stats,
        right=running_stats,
        how='left',
        left_on='key_bbref',
        right_on='key_bbref'
    )
    print(f'Running stats: {_seconds_since(running_start):.2f}s')

    if len(batting_stats) != bs_len:
        raise DataMismatchError(f'retrosheet_data - run_batters - We started with {bs_len} batting lines and have {len(batting_stats)} after merging with running_stats')

    # Calculate batting cards
    card_start = datetime.datetime.now()
    all_batting_cards = calc_batting_cards(batting_stats)
    print(f'Create batting cards: {_seconds_since(card_start):.2f}s')

    # Calculate batting ratings
    rating_start = datetime.datetime.now()
    batting_stats['battingcard_id'] = batting_stats['key_fangraphs']
    all_batting_ratings = calc_batter_ratings(batting_stats)
    print(f'Create batting ratings: {_seconds_since(rating_start):.2f}s')

    # Calculate defense ratings
    defense_start = datetime.datetime.now()
    all_defense_ratings = calc_positions(batting_stats)
    print(f'Create defense ratings: {_seconds_since(defense_start):.2f}s')

    # Post all data
    if post_data:
        print('Posting player data...')
        post_start = datetime.datetime.now()
        num_players = await post_batter_data(
            batting_stats, all_batting_cards, all_batting_ratings, all_defense_ratings
        )
        post_end = datetime.datetime.now()
        print(f'Post player data: {(post_end - post_start).total_seconds()}s')

        post_msg = f'Posted {num_players} players to the database'
        logging.info(post_msg)
        print(post_msg)
    else:
        post_msg = 'Players are NOT being posted to the database'
        logging.warning(post_msg)
        print(post_msg)

    return batting_stats


async def main(args):
    """Run the batter pipeline for the configured date range and save a CSV.

    *args* is accepted for the command line but not used yet. The pitcher
    timing is a placeholder; no pitcher work is performed.
    """
    batter_start = datetime.datetime.now()
    batting_stats = await run_batters(
        DATA_INPUT_FILE_PATH, start_date=19980101, end_date=19980430, post_data=True
    )
    batting_stats.to_csv('batting_stats.csv')
    batter_end = datetime.datetime.now()

    pitcher_start = datetime.datetime.now()
    pitcher_end = datetime.datetime.now()

    print(f'\n\nBatter time: {(batter_end - batter_start).total_seconds():.2f}s \nPitcher time: {(pitcher_end - pitcher_start).total_seconds():.2f}s\nTotal: {(pitcher_end - batter_start).total_seconds():.2f}s\n\nDone!')

    # await store_defense_to_csv(1998)


if __name__ == '__main__':
    asyncio.run(main(sys.argv[1:]))