Add defense calcs

Begin work on posting data
This commit is contained in:
Cal Corum 2024-10-20 22:57:45 -05:00
parent eb79430de7
commit 44e8e22bc0

View File

@ -8,9 +8,11 @@ from typing import Literal
import pandas as pd
import pybaseball as pb
from pybaseball import cache
import urllib
from creation_helpers import get_args
from creation_helpers import get_args, CLUB_LIST, FRANCHISE_LIST
from batters.stat_prep import DataMismatchError
from db_calls import DB_URL, db_get, db_patch, db_post
import batters.calcs_batter as cba
import defenders.calcs_defense as cde
@ -27,11 +29,17 @@ RETRO_FILE_PATH = 'data-input/retrosheet/'
# Retrosheet input file names.
# NOTE(review): presumably resolved relative to RETRO_FILE_PATH; the code that joins them is outside this view.
EVENTS_FILENAME = 'retrosheets_events_1998_short.csv' # Removed last few columns which were throwing dtype errors
PERSONNEL_FILENAME = 'retrosheets_personnel.csv'
# Directory prefix for the per-position defense CSVs read by calc_positions and passed to run_batters by main.
DATA_INPUT_FILE_PATH = 'data-input/1998 Season Cardset/'
# Prefix of the card image URL that get_or_post_players patches onto newly created players.
CARD_BASE_URL = f'{DB_URL}/v2/players/'
# Captured once at import time; reused for RELEASE_DIRECTORY below.
start_time = datetime.datetime.now()
# year-month-day with no zero padding (e.g. 2024-1-5); appended to the card image URL as the `d` value.
RELEASE_DIRECTORY = f'{start_time.year}-{start_time.month}-{start_time.day}'
# NOTE(review): presumably minimum plate appearances vs. left/right-handed pitching; the consumers are not visible here.
MIN_PA_VL = 20
MIN_PA_VR = 40
# The TBF minimums simply mirror the PA minimums.
MIN_TBF_VL = MIN_PA_VL
MIN_TBF_VR = MIN_PA_VR
# NOTE(review): the assignment below repeats the previous line; it is redundant but harmless.
MIN_TBF_VR = MIN_PA_VR
# Cardset used both to look up existing players and to stamp newly created ones.
CARDSET_ID = 20
# Sent as the 'description' field when a new player is created.
PLAYER_DESCRIPTION = 'Live'
async def store_defense_to_csv(season: int):
@ -64,7 +72,7 @@ def get_run_stat_df(input_path: str):
if 'Name-additional' in run_data:
run_data = run_data.rename(columns={'Name-additional': 'key_bbref'})
run_data = run_data[['key_bbref', 'ROE', 'XI', 'RS%', 'SBO', 'SB', 'CS', 'SB%', 'SB2', 'CS2', 'SB3', 'CS3', 'SBH', 'CSH', 'PO', 'PCS', 'OOB', 'OOB1', 'OOB2', 'OOB3', 'OOBHm', 'BT', 'XBT%', '1stS', '1stS2', '1stS3', '1stD', '1stD3', '1stDH', '2ndS', '2ndS3', '2ndSH']]
run_data = run_data[['key_bbref', 'Tm', 'ROE', 'XI', 'RS%', 'SBO', 'SB', 'CS', 'SB%', 'SB2', 'CS2', 'SB3', 'CS3', 'SBH', 'CSH', 'PO', 'PCS', 'OOB', 'OOB1', 'OOB2', 'OOB3', 'OOBHm', 'BT', 'XBT%', '1stS', '1stS2', '1stS3', '1stD', '1stD3', '1stDH', '2ndS', '2ndS3', '2ndSH']]
run_data = run_data.fillna(0)
return run_data.set_index('key_bbref')
@ -136,8 +144,6 @@ def get_player_ids(plays: pd.DataFrame, which: Literal['batters', 'pitchers']) -
def get_base_batting_df(all_plays: pd.DataFrame) -> pd.DataFrame:
bs = get_player_ids(all_plays, 'batters')
# bs['key_mlbam'] = bs.apply()
pal_series = all_plays[(all_plays.batter_event == 't') & (all_plays.pitcher_hand == 'l')].groupby('batter_id').count()['event_type'].astype(int).rename('PA_vL')
bs = pd.concat([bs, pal_series], axis=1)
par_series = all_plays[(all_plays.batter_event == 't') & (all_plays.pitcher_hand == 'r')].groupby('batter_id').count()['event_type'].astype(int).rename('PA_vR')
@ -160,6 +166,11 @@ def get_batting_stats_by_date(retro_file_path, start_date: int, end_date: int) -
batting_stats = get_base_batting_df(all_plays)
print(f'Get base dataframe: {(datetime.datetime.now() - start).total_seconds():.2f}s')
start = datetime.datetime.now()
all_player_ids = batting_stats['key_retro']
all_plays = all_plays[all_plays['batter_id'].isin(all_player_ids)]
print(f'Shrink all_plays: {(datetime.datetime.now() - start).total_seconds():.2f}s')
# Basic counting stats
start = datetime.datetime.now()
for event_type, vs_hand, col_name in [
@ -317,6 +328,8 @@ def get_batting_stats_by_date(retro_file_path, start_date: int, end_date: int) -
batting_stats['Oppo%_vL'] = round(1 - batting_stats['Pull%_vL'] - batting_stats['Cent%_vL'], 5)
batting_stats['Oppo%_vR'] = round(1 - batting_stats['Pull%_vR'] - batting_stats['Cent%_vR'], 5)
batting_stats = batting_stats.fillna(0)
print(f'Calculated fields: {(datetime.datetime.now() - start).total_seconds():.2f}s')
return batting_stats
@ -349,6 +362,7 @@ def calc_batting_cards(bs: pd.DataFrame) -> pd.DataFrame:
return y.loc[0]
all_cards = bs.apply(create_batting_card, axis=1)
all_cards = all_cards.set_index('key_bbref')
return all_cards
@ -356,150 +370,290 @@ def calc_batting_cards(bs: pd.DataFrame) -> pd.DataFrame:
def calc_batter_ratings(bs: pd.DataFrame) -> pd.DataFrame:
    """Build split ratings plus an OPS-derived rarity and cost for every batter.

    Each row of ``bs`` is handed to ``cba.get_batter_ratings``. Element ``[0]``
    of its result is used as the vs-left split and element ``[1]`` as the
    vs-right split; both must expose ``'obp'`` and ``'slg'``.

    Args:
        bs: Batting stats with one row per batter, including ``key_bbref``.

    Returns:
        A DataFrame indexed by ``key_bbref`` with the columns ``ratings_vL``,
        ``ratings_vR``, ``ops_vL``, ``ops_vR``, ``total_ops``, ``rarity_id``
        and ``cost``.
    """
    # (minimum total_ops, rarity_id, base_cost, base_ops, max_delta), checked
    # from the highest floor down; the first tier whose floor is met wins.
    rarity_tiers = (
        (1.2, 99, 2400, 1.215, 810),
        (1.0, 1, 810, 1.05, 270),
        (0.9, 2, 270, 0.95, 90),
        (0.8, 3, 90, 0.85, 30),
        (0.7, 4, 30, 0.75, 10),
    )
    # (rarity_id, base_cost, base_ops, max_delta) when no floor is met.
    fallback_tier = (5, 10, 0.61, 8)

    def calc_cost(total_ops, base_cost, base_ops, max_delta) -> int:
        """Scale ``base_cost`` by how far ``total_ops`` sits from ``base_ops``."""
        # Two delta units per 0.1 of OPS above the tier's reference OPS.
        delta = ((total_ops - base_ops) / 0.1) * 2
        if delta < 1:
            # Within 0.05 OPS above the reference, or anywhere below it, the
            # adjustment is instead proportional to the relative OPS gap.
            delta = (max_delta * (1 - (total_ops / base_ops))) * -0.1
        final_cost = base_cost + (max_delta * delta)
        return round(final_cost)

    def create_batting_rating(row):
        ratings = cba.get_batter_ratings(row)
        ops_vl = ratings[0]['obp'] + ratings[0]['slg']
        ops_vr = ratings[1]['obp'] + ratings[1]['slg']
        # The weaker split is counted twice in the three-way average.
        total_ops = (ops_vl + ops_vr + min(ops_vr, ops_vl)) / 3

        rarity_id, base_cost, base_ops, max_delta = fallback_tier
        for floor, tier_rarity, tier_cost, tier_ops, tier_delta in rarity_tiers:
            if total_ops >= floor:
                rarity_id, base_cost, base_ops, max_delta = (
                    tier_rarity, tier_cost, tier_ops, tier_delta
                )
                break
        cost = calc_cost(total_ops, base_cost=base_cost, base_ops=base_ops, max_delta=max_delta)

        # A one-row frame is built and its first row returned so that `apply`
        # assembles these keys into columns.
        x = pd.DataFrame({
            'key_bbref': [row['key_bbref']],
            'ratings_vL': [ratings[0]],
            'ratings_vR': [ratings[1]],
            'ops_vL': ops_vl,
            'ops_vR': ops_vr,
            'total_ops': total_ops,
            'rarity_id': rarity_id,
            'cost': cost
        })
        return x.loc[0]

    all_ratings = bs.apply(create_batting_rating, axis=1)
    all_ratings = all_ratings.set_index('key_bbref')

    return all_ratings
def _average_range(pos_df: pd.DataFrame, key_bbref):
    """Return the range-runs figure for one player in one position table.

    When the table carries a ``bis_runs_total`` column the result is
    ``(tz + bis + min(tz, bis)) / 3``, which counts the lower of the two
    figures twice. Otherwise ``tz_runs_total`` is returned unchanged.
    """
    tz_runs = pos_df.at[key_bbref, 'tz_runs_total']
    if 'bis_runs_total' not in pos_df.columns:
        return tz_runs

    tz_runs = int(tz_runs)
    bis_runs = int(pos_df.at[key_bbref, 'bis_runs_total'])
    return (tz_runs + bis_runs + min(tz_runs, bis_runs)) / 3


def calc_positions(bs: pd.DataFrame) -> pd.DataFrame:
    """Compute defensive ratings for every batter at every position played.

    Reads one ``defense_<pos>.csv`` per position (c, 1b, 2b, 3b, ss, lf, cf,
    rf and the combined of) from ``DATA_INPUT_FILE_PATH``, each indexed by
    ``key_bbref``. A position is rated only when the player has at least 10.0
    ``Inn_def`` there. Outfield spots share one error rating (taken from the
    combined outfield table) and one arm rating. Batters for whom no position
    data was found get a single ``DH`` row.

    Args:
        bs: Batting stats; rows must provide ``key_bbref``, ``use_name``,
            ``last_name``, ``PA_vL`` and ``PA_vR``.

    Returns:
        A DataFrame indexed by ``key_bbref`` with one row per player/position.
        ``position``, ``innings``, ``range`` and ``error`` are always set for
        fielding rows; ``arm`` is added for outfielders and catchers, and
        ``pb`` and ``overthrow`` for catchers. ``DH`` rows carry only
        ``position`` and ``innings``.
    """
    def load_defense(pos_code):
        return pd.read_csv(f'{DATA_INPUT_FILE_PATH}defense_{pos_code}.csv').set_index('key_bbref')

    df_c = load_defense('c')
    df_1b = load_defense('1b')
    df_2b = load_defense('2b')
    df_3b = load_defense('3b')
    df_ss = load_defense('ss')
    df_lf = load_defense('lf')
    df_cf = load_defense('cf')
    df_rf = load_defense('rf')
    df_of = load_defense('of')

    infield = [(df_1b, '1b'), (df_2b, '2b'), (df_3b, '3b'), (df_ss, 'ss')]
    outfield = [(df_lf, 'lf'), (df_cf, 'cf'), (df_rf, 'rf')]

    season_pct = 1.0
    all_pos = []

    def process_pos(row):
        key = row['key_bbref']
        no_data = True

        # TODO: Add pos_1 through pos_8 to def df to be pulled in at post time
        for pos_df, position in infield:
            if key not in pos_df.index:
                continue
            logging.info(f'Running {position} stats for {row["use_name"]} {row["last_name"]}')
            try:
                average_range = _average_range(pos_df, key)

                if float(pos_df.at[key, 'Inn_def']) >= 10.0:
                    all_pos.append({
                        "key_bbref": key,
                        "position": position.upper(),
                        "innings": float(pos_df.at[key, 'Inn_def']),
                        "range": cde.get_if_range(
                            pos_code=position,
                            tz_runs=round(average_range),
                            r_dp=0,
                            season_pct=season_pct
                        ),
                        "error": cde.get_any_error(
                            pos_code=position,
                            errors=int(pos_df.at[key, 'E_def']),
                            chances=int(pos_df.at[key, 'chances']),
                            season_pct=season_pct
                        )
                    })
                    no_data = False
            except Exception as e:
                # Best effort: a bad row for one position must not stop the run.
                logging.info(f'Infield position failed: {e}')

        # Outfield rows are held back until the shared error and arm ratings
        # can be attached below.
        of_arms = []
        of_payloads = []
        for pos_df, position in outfield:
            if key not in pos_df.index:
                continue
            try:
                average_range = _average_range(pos_df, key)

                if float(pos_df.at[key, 'Inn_def']) >= 10.0:
                    of_payloads.append({
                        "key_bbref": key,
                        "position": position.upper(),
                        "innings": float(pos_df.at[key, 'Inn_def']),
                        "range": cde.get_of_range(
                            pos_code=position,
                            tz_runs=round(average_range),
                            season_pct=season_pct
                        )
                    })
                    of_run_rating = 'bis_runs_outfield' if 'bis_runs_outfield' in pos_df else 'tz_runs_outfield'
                    of_arms.append(int(pos_df.at[key, of_run_rating]))
                    no_data = False
            except Exception as e:
                logging.info(f'Outfield position failed: {e}')

        if key in df_of.index and len(of_arms) > 0 and len(of_payloads) > 0:
            try:
                error_rating = cde.get_any_error(
                    # The earlier code passed the leaked loop variable here,
                    # which is always 'rf' once the outfield loop has finished.
                    # NOTE(review): confirm 'rf' is the intended code for the
                    # combined outfield error rating.
                    pos_code='rf',
                    errors=int(df_of.at[key, 'E_def']),
                    chances=int(df_of.at[key, 'chances']),
                    season_pct=season_pct
                )
                arm_rating = cde.arm_outfield(of_arms)
                for f in of_payloads:
                    f['error'] = error_rating
                    f['arm'] = arm_rating
                    all_pos.append(f)
                no_data = False
            except Exception as e:
                logging.info(f'Outfield position failed: {e}')

        if key in df_c.index:
            try:
                run_rating = 'bis_runs_catcher_sb' if 'bis_runs_catcher_sb' in df_c else 'tz_runs_catcher'

                # With no steal attempts there is nothing to rate; use a fixed arm of 3.
                if df_c.at[key, 'SB'] + df_c.at[key, 'CS'] == 0:
                    arm_rating = 3
                else:
                    arm_rating = cde.arm_catcher(
                        cs_pct=df_c.at[key, 'caught_stealing_perc'],
                        raa=int(df_c.at[key, run_rating]),
                        season_pct=season_pct
                    )

                if float(df_c.at[key, 'Inn_def']) >= 10.0:
                    all_pos.append({
                        "key_bbref": key,
                        "position": 'C',
                        "innings": float(df_c.at[key, 'Inn_def']),
                        "range": cde.range_catcher(
                            rs_value=int(df_c.at[key, 'tz_runs_catcher']),
                            season_pct=season_pct
                        ),
                        "error": cde.get_any_error(
                            pos_code='c',
                            errors=int(df_c.at[key, 'E_def']),
                            chances=int(df_c.at[key, 'chances']),
                            season_pct=season_pct
                        ),
                        "arm": arm_rating,
                        "pb": cde.pb_catcher(
                            pb=int(df_c.at[key, 'PB']),
                            innings=int(float(df_c.at[key, 'Inn_def'])),
                            season_pct=season_pct
                        ),
                        "overthrow": cde.ot_catcher(
                            errors=int(df_c.at[key, 'E_def']),
                            chances=int(df_c.at[key, 'chances']),
                            season_pct=season_pct
                        )
                    })
                    no_data = False
            except Exception as e:
                logging.info(f'Catcher position failed: {e}')

        if no_data:
            # For DH rows "innings" holds total plate appearances instead.
            all_pos.append({
                "key_bbref": key,
                "position": 'DH',
                "innings": row['PA_vL'] + row['PA_vR']
            })

    # process_pos works purely by appending to all_pos, so use a plain loop
    # rather than DataFrame.apply.
    for _, batter_row in bs.iterrows():
        process_pos(batter_row)

    pos_df = pd.DataFrame(all_pos)
    pos_df = pos_df.set_index('key_bbref')

    return pos_df
def run_batters(data_input_path: str, start_date: int, end_date: int):
async def get_or_post_players(stat_df: pd.DataFrame, bat_card_df: pd.DataFrame, bat_rat_df: pd.DataFrame, def_rat_df: pd.DataFrame) -> pd.DataFrame:
    """Find, or create, the player record in CARDSET_ID for each batter.

    For every processed row of ``stat_df`` the ``players`` endpoint is queried
    by ``key_bbref`` and ``cardset_id``. When nothing is found, the matching
    ``mlbplayers`` record is looked up by ``key_retro`` (and created if
    missing), a new player is posted, and its ``image`` is then patched to the
    batting-card URL for this release.

    Args:
        stat_df: Batting stats; rows must provide ``key_bbref``, ``key_retro``,
            ``key_mlbam``, ``key_fangraphs``, ``use_name``, ``last_name``,
            ``Tm`` and ``pos_1``.
        bat_card_df: Batting cards (currently unused).
        bat_rat_df: Batting ratings indexed by ``key_bbref``; supplies ``cost``
            and ``rarity_id`` for new players.
        def_rat_df: Defense ratings (currently unused).

    Returns:
        A DataFrame with the columns ``key_bbref`` and ``player_id``, one row
        per processed batter.
    """
    # `import urllib` on its own does not guarantee the `parse` submodule is loaded.
    import urllib.parse

    def record_id(record):
        # Records expose their primary key as either 'id' or 'player_id'.
        return record['id'] if 'id' in record else record['player_id']

    all_bbref_ids = []
    all_player_ids = []

    # Development throttle: stop after the first processed row.
    # NOTE(review): the source's indentation is lost, so it is unclear whether
    # the counter was meant to advance for every row or only for newly created
    # players; confirm, and remove the throttle before a full run.
    dev_count = 0
    for _, row in stat_df.iterrows():
        if dev_count > 0:
            break

        p_query = await db_get('players', params=[('key_bbref', row["key_bbref"]), ('cardset_id', CARDSET_ID)])
        if p_query['count'] > 0:
            player_id = record_id(p_query['players'][0])
        else:
            mlb_query = await db_get('mlbplayers', params=[('key_retro', row['key_retro'])])
            if mlb_query['count'] > 0:
                mlb_player = mlb_query['players'][0]
            else:
                mlb_player = await db_post(
                    'mlbplayers/one',
                    payload={
                        'first_name': row['use_name'],
                        'last_name': row['last_name'],
                        'key_mlbam': row['key_mlbam'],
                        'key_fangraphs': row['key_fangraphs'],
                        'key_bbref': row['key_bbref'],
                        'key_retro': row['key_retro']
                    }
                )

            # This player's rating row; formatting bat_rat_df["cost"] directly
            # would stringify the whole column.
            rating_row = bat_rat_df.loc[row['key_bbref']]

            # NOTE(review): endpoint name taken from the db_get/db_patch calls
            # in this function; confirm the create route is 'players'.
            new_player = await db_post(
                'players',
                payload={
                    'p_name': f'{row["use_name"]} {row["last_name"]}',
                    'cost': f'{rating_row["cost"]}',
                    'image': 'change-me',
                    'mlbclub': CLUB_LIST[row['Tm']],
                    'franchise': FRANCHISE_LIST[row['Tm']],
                    'cardset_id': CARDSET_ID,
                    'set_num': int(float(row['key_fangraphs'])),
                    # Plain int so the payload does not carry a numpy scalar.
                    'rarity_id': int(rating_row['rarity_id']),
                    'pos_1': row['pos_1'],
                    'description': PLAYER_DESCRIPTION,
                    'bbref_id': row['key_bbref'],
                    'fangr_id': row['key_fangraphs'],
                    'mlbplayer_id': mlb_player['id']
                }
            )
            player_id = record_id(new_player)

            # The image URL embeds the new id, so it can only be set after the insert.
            await db_patch('players', object_id=player_id, params=[('image', f'{CARD_BASE_URL}{player_id}/battingcard{urllib.parse.quote("?d=")}{RELEASE_DIRECTORY}')])

        all_bbref_ids.append(row['key_bbref'])
        all_player_ids.append(player_id)
        dev_count += 1

    new_data = {'key_bbref': all_bbref_ids, 'player_id': all_player_ids}
    players_df = pd.DataFrame(new_data)

    return players_df
async def post_batter_data(bs: pd.DataFrame, bc: pd.DataFrame, br: pd.DataFrame, dr: pd.DataFrame) -> int:
    """Post batter data and report how many players were handled.

    Args:
        bs: Batting stats, one row per batter.
        bc: Batting cards.
        br: Batting ratings.
        dr: Defense ratings.

    Returns:
        The number of rows in the ``key_bbref``/``player_id`` frame returned by
        ``get_or_post_players``, i.e. players that were found or created.
    """
    pd_ids = await get_or_post_players(bs, bc, br, dr)

    # The caller reports this value as the number of players posted.
    return len(pd_ids)
async def run_batters(data_input_path: str, start_date: int, end_date: int, post_data: bool = False):
print(f'Running the batter calcs...')
batter_start = datetime.datetime.now()
@ -508,12 +662,11 @@ def run_batters(data_input_path: str, start_date: int, end_date: int):
bs_len = len(batting_stats)
end_calc = datetime.datetime.now()
print(f'Batting stats: {(end_calc - batter_start).total_seconds():.2f}s')
print(f'Combined batting stats: {(end_calc - batter_start).total_seconds():.2f}s\n')
running_start = datetime.datetime.now()
# Get running stats
running_stats = get_run_stat_df(data_input_path)
run_len = len(running_stats)
batting_stats = pd.merge(
left=batting_stats,
@ -533,7 +686,7 @@ def run_batters(data_input_path: str, start_date: int, end_date: int):
all_batting_cards = calc_batting_cards(batting_stats)
card_end = datetime.datetime.now()
print(f'Create batting cards: {(card_end - card_start).total_seconds()}s')
print(f'Create batting cards: {(card_end - card_start).total_seconds():.2f}s')
# Calculate batting ratings
rating_start = datetime.datetime.now()
@ -541,29 +694,47 @@ def run_batters(data_input_path: str, start_date: int, end_date: int):
all_batting_ratings = calc_batter_ratings(batting_stats)
rating_end = datetime.datetime.now()
print(f'Create batting ratings: {(rating_end - rating_start).total_seconds()}s')
print(f'Create batting ratings: {(rating_end - rating_start).total_seconds():.2f}s')
# Calculate defense ratings
defense_start = datetime.datetime.now()
all_defense_ratings = calc_positions(batting_stats)
defense_end = datetime.datetime.now()
print(f'Create defense ratings: {(defense_end - defense_start).total_seconds()}s')
print(f'Create defense ratings: {(defense_end - defense_start).total_seconds():.2f}s')
# Post all data
if post_data:
print(f'Posting player data...')
post_start = datetime.datetime.now()
num_players = await post_batter_data(batting_stats, all_batting_cards, all_batting_ratings, all_defense_ratings)
post_end = datetime.datetime.now()
print(f'Post player data: {(post_end - post_start).total_seconds()}s')
post_msg = f'Posted {num_players} players to the database'
logging.info(post_msg)
print(post_msg)
else:
post_msg = f'Players are NOT being posted to the database'
logging.warning(post_msg)
print(post_msg)
return batting_stats
async def main(args):
    """Run the batter pipeline for the configured date range and print timings.

    The batting stats returned by ``run_batters`` are written to
    ``batting_stats.csv`` in the working directory. ``args`` is accepted but
    not used here.
    """
    batter_start = datetime.datetime.now()
    batting_stats = await run_batters(DATA_INPUT_FILE_PATH, start_date=19980101, end_date=19980430, post_data=True)
    batting_stats.to_csv('batting_stats.csv')
    batter_end = datetime.datetime.now()

    # No pitcher work happens between these two timestamps yet, so the
    # reported pitcher time is effectively zero.
    pitcher_start = datetime.datetime.now()
    pitcher_end = datetime.datetime.now()

    print(f'\n\nBatter time: {(batter_end - batter_start).total_seconds():.2f}s \nPitcher time: {(pitcher_end - pitcher_start).total_seconds():.2f}s\nTotal: {(pitcher_end - batter_start).total_seconds():.2f}s\n\nDone!')

    # Disabled for now; re-enable to store the 1998 defense data to CSV.
    # await store_defense_to_csv(1998)
if __name__ == '__main__':