Updated mround to return float

Counting stats nearly complete for batters
This commit is contained in:
Cal Corum 2024-10-18 12:12:40 -05:00
parent 1109a12434
commit 0de2239100
7 changed files with 223966 additions and 681 deletions

File diff suppressed because it is too large Load Diff

View File

@ -570,7 +570,8 @@ def get_pitching_peripherals(season: int):
def mround(x, prec=2, base=.05):
    """Round ``x`` to the nearest multiple of ``base`` and return a float.

    The arithmetic is done on ``Decimal`` values built from ``str(x)`` and
    ``str(base)`` so that binary floating-point error in the division cannot
    push a value into the wrong bucket. Exact ties follow ``round()`` on a
    ``Decimal``, i.e. they go to the even multiple.

    Args:
        x: Value to round; anything whose ``str()`` is a valid decimal literal.
        prec: Not used by the Decimal-based implementation; kept in the
            signature so existing callers that pass it keep working.
        base: Step size to round to.

    Returns:
        The multiple of ``base`` nearest to ``x``, as a ``float``.
    """
    # Imported here so the function does not rely on a module-level import.
    from decimal import Decimal

    step = Decimal(str(base))
    # round() on a Decimal with no digits argument yields an int.
    return float(round(Decimal(str(x)) / step) * step)
def chances_from_row(row_num):
@ -913,7 +914,6 @@ def legacy_sanitize_chance_output(total_chances: float, min_chances: float = 1.0
return x
def mlbteam_and_franchise(mlbam_playerid):
api_url = f'https://statsapi.mlb.com/api/v1/people/{mlbam_playerid}?hydrate=currentTeam'
logging.info(f'Calling {api_url}')

View File

@ -8,6 +8,8 @@ from typing import Literal
import pandas as pd
import pybaseball as pb
from creation_helpers import get_all_pybaseball_ids
date = f'{datetime.datetime.now().year}-{datetime.datetime.now().month}-{datetime.datetime.now().day}'
log_level = logging.INFO
logging.basicConfig(
@ -20,8 +22,8 @@ EVENTS_FILENAME = 'retrosheets_events_1998_short.csv' # Removed last few column
PERSONNEL_FILENAME = 'retrosheets_personnel.csv'
def get_events_by_date(file_path: str, start_date: int, end_date: int) -> pd.DataFrame:
    """Load play-by-play events from a CSV and keep those inside a date window.

    Args:
        file_path: Path of the events CSV to read.
        start_date: First date to keep, inclusive, as an integer comparable
            with the value parsed out of ``game_id``.
            NOTE(review): looks like month * 100 + day - confirm.
        end_date: Last date to keep, inclusive, in the same form.

    Returns:
        The rows whose derived ``date`` lies in ``[start_date, end_date]``;
        the derived ``date`` column is included in the result.
    """
    all_plays = pd.read_csv(file_path, dtype={'game_id': 'str'})
    # The date is read from characters 7 up to (not including) the last
    # character of game_id.
    all_plays['date'] = all_plays['game_id'].str[7:-1].astype(int)
    in_window = (all_plays.date >= start_date) & (all_plays.date <= end_date)
    return all_plays[in_window]
@ -32,70 +34,111 @@ def get_result_series(plays: pd.DataFrame, event_type: str, pitcher_hand: Litera
return this_series
def get_batting_stats_by_date(start_date: int, end_date: int) -> pd.DataFrame:
all_plays = get_events_by_date(start_date, end_date)
# def get_batting_handedness(plays: pd.DataFrame) -> pd.DataFrame:
bs = pd.DataFrame()
bs['batter_id'] = all_plays['batter_id'].unique()
bs = bs.set_index('batter_id')
pal_series = all_plays[(all_plays.batter_event == 't') & (all_plays.pitcher_hand == 'l')].groupby('batter_id').count()['event_type'].astype(int).rename('PAvL')
def get_player_ids(plays: pd.DataFrame, which: Literal['batters', 'pitchers']) -> pd.DataFrame:
    """Build a per-player table of names and cross-reference ids.

    Every distinct batter or pitcher id found in ``plays`` is matched against
    the personnel CSV for names and against pybaseball's reverse lookup for
    the ``key_mlbam`` / ``key_retro`` / ``key_bbref`` / ``key_fangraphs`` ids.

    Args:
        plays: Play-by-play rows containing a ``batter_id`` or ``pitcher_id``
            column, depending on ``which``.
        which: ``'batters'`` to use ``batter_id``; anything else uses
            ``pitcher_id``.

    Returns:
        A frame indexed by ``batter_id`` / ``pitcher_id`` holding
        ``last_name``, ``use_name`` and the four ``key_*`` columns. A player
        missing from the pybaseball lookup is kept, with empty values for the
        ids other than ``key_retro``.
    """
    id_key = 'batter_id' if which == 'batters' else 'pitcher_id'
    xref_cols = ['key_mlbam', 'key_retro', 'key_bbref', 'key_fangraphs']

    retro_players = pd.read_csv(f'{FILE_PATH}{PERSONNEL_FILENAME}')
    unique_players = pd.Series(plays[id_key].unique()).to_frame('id')
    # Right join: keep every id seen in the plays, named or not.
    players = pd.merge(
        left=retro_players,
        right=unique_players,
        how='right',
        left_on='id',
        right_on='id'
    ).rename(columns={'id': id_key})
    players = players[[id_key, 'last_name', 'use_name']]

    def get_pids(row):
        pull = pb.playerid_reverse_lookup([row[id_key]], key_type='retro')
        if len(pull) == 0:
            print(f'Could not find id {row[id_key]} in pybaseball lookup')
            # Return a placeholder instead of indexing an empty frame; keeping
            # key_retro lets the merge below retain this player.
            placeholder = dict.fromkeys(xref_cols)
            placeholder['key_retro'] = row[id_key]
            return pd.Series(placeholder)
        return pull.iloc[0][xref_cols]

    start_time = datetime.datetime.now()
    other_ids = players.apply(get_pids, axis=1)
    end_time = datetime.datetime.now()
    print(f'ID lookup time: {(end_time - start_time).total_seconds():.2f}s')

    players = pd.merge(
        left=players,
        right=other_ids,
        left_on=id_key,
        right_on='key_retro'
    )
    return players.set_index(id_key)
def get_base_batting_df(all_plays: pd.DataFrame) -> pd.DataFrame:
    """Start the batting table: player ids plus PA/AB counts by pitcher hand.

    Args:
        all_plays: Play-by-play rows with ``batter_id``, ``batter_event``,
            ``ab``, ``pitcher_hand`` and ``event_type`` columns.

    Returns:
        The frame from ``get_player_ids(all_plays, 'batters')`` extended with
        ``PA_vL``, ``PA_vR``, ``AB_vL`` and ``AB_vR``; missing values are 0.
        Per-event counting stats are not added here -
        ``get_batting_stats_by_date`` appends those under ``*_vR`` / ``*_vL``
        names.
    """
    bs = get_player_ids(all_plays, 'batters')

    # (flag column, pitcher hand, output column). A row counts when the flag
    # column holds 't'.
    count_specs = (
        ('batter_event', 'l', 'PA_vL'),
        ('batter_event', 'r', 'PA_vR'),
        ('ab', 'l', 'AB_vL'),
        ('ab', 'r', 'AB_vR'),
    )
    for flag_col, hand, col_name in count_specs:
        selected = all_plays[(all_plays[flag_col] == 't') & (all_plays.pitcher_hand == hand)]
        counts = selected.groupby('batter_id').count()['event_type'].astype(int).rename(col_name)
        bs = pd.concat([bs, counts], axis=1)

    # Gaps left by the column-wise concat become 0.
    bs = bs.fillna(0)
    return bs
# def get_batting_stat_range(start_month: int, start_day: int, end_month: int, end_day: int):
# return get_batting_stats_by_date(
# start_date=start_month * 100 + start_day,
# end_date=end_month * 100 + end_day
# )
def get_batting_stats_by_date(file_path, start_date: int, end_date: int) -> pd.DataFrame:
    """Assemble per-batter counting stats for the plays in a date window.

    Args:
        file_path: Events CSV handed to ``get_events_by_date``.
        start_date: Lower bound passed through to ``get_events_by_date``.
        end_date: Upper bound passed through to ``get_events_by_date``.

    Returns:
        The base batting frame plus one column per (event, pitcher hand)
        pair, with missing values replaced by 0.
    """
    plays_in_range = get_events_by_date(file_path, start_date, end_date)
    stats = get_base_batting_df(plays_in_range)

    # (event type, column prefix); each event is added versus right-handed
    # pitchers first, then left-handed ones.
    counted_events = (
        ('home run', 'HR'),
        ('single', '1B'),
        ('double', '2B'),
        ('triple', '3B'),
        ('walk', 'BB'),
        ('strikeout', 'SO'),
        ('hit by pitch', 'HBP'),
    )
    for event_type, prefix in counted_events:
        for vs_hand in ('r', 'l'):
            col_name = f'{prefix}_v{vs_hand.upper()}'
            column = get_result_series(plays_in_range, event_type, vs_hand, col_name)
            stats = pd.concat([stats, column], axis=1)

    # Gaps left by the column-wise concat become 0.
    return stats.fillna(0)
async def main(args):
    """Compute batting stats for a fixed date window and write them to CSV.

    Reads the events file at ``FILE_PATH + EVENTS_FILENAME``, builds the
    stats for dates 101 through 430, saves them to ``batting_stats.csv`` and
    prints how long the calculation and the save took.

    Args:
        args: Not used by this function.
    """
    print('Running the calcs...')
    start = datetime.datetime.now()
    # The events path is required by get_batting_stats_by_date; calling it
    # with only the date bounds raises TypeError.
    data = get_batting_stats_by_date(f'{FILE_PATH}{EVENTS_FILENAME}', start_date=101, end_date=430)
    end_calc = datetime.datetime.now()

    print('Saving to csv...')
    data.to_csv('batting_stats.csv')
    end = datetime.datetime.now()

    print(f'Stat calc time: {(end_calc - start).total_seconds():.2f}s\nSave time: {(end - end_calc).total_seconds():.2f}s\nTotal: {(end - start).total_seconds():.2f}s\n\nDone!')
if __name__ == '__main__':

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,56 +1,9 @@
from decimal import ROUND_HALF_EVEN, Decimal
import math
from batters.calcs_batter import bp_singles, wh_singles, sanitize_chance_output
from creation_helpers import sanitize_chance_output, mround
from batters.calcs_batter import bp_singles, wh_singles
def test_sanitize():
    """Check sanitize_chance_output against known input/output pairs."""
    cases = (
        (6, 6.0),
        (1.21, 1.2),
        (4.77, 4.75),
        (4.78, 4.8),
    )
    for raw, expected in cases:
        assert sanitize_chance_output(raw) == expected
def test_mround():
    """Check mround against known input/output pairs."""
    cases = (
        (6, 6.0),
        (1.21, 1.2),
        (4.77, 4.75),
        (4.78, 4.8),
    )
    for raw, expected in cases:
        assert mround(raw) == expected
def test_decimals():
    """A Decimal built from an int compares equal to that int."""
    eight = Decimal(8)
    assert eight == 8

View File

@ -1,7 +1,52 @@
from creation_helpers import pd_positions_df
from creation_helpers import pd_positions_df, mround, sanitize_chance_output
def test_positions_df():
    """Smoke test: calling pd_positions_df(19) must not raise."""
    pd_positions_df(19)  # the result itself is not inspected
    assert True
def test_mround():
    """Check mround on a value already on the grid and one that is not."""
    cases = (
        (6.4, 6.4),
        (6.66, 6.65),
    )
    for raw, expected in cases:
        assert mround(raw) == expected
def test_sanitize():
    """Check sanitize_chance_output against known input/output pairs."""
    cases = (
        (6, 6.0),
        (1.21, 1.2),
        (4.77, 4.75),
        (4.78, 4.8),
    )
    for raw, expected in cases:
        assert sanitize_chance_output(raw) == expected