diff --git a/batters/calcs_batter.py b/batters/calcs_batter.py index 136552b..803e249 100644 --- a/batters/calcs_batter.py +++ b/batters/calcs_batter.py @@ -12,49 +12,49 @@ class BattingCardRatingsModel(pydantic.BaseModel): battingcard_id: int bat_hand: Literal['R', 'L', 'S'] vs_hand: Literal['R', 'L'] - all_hits: Decimal = Decimal(0.0) - all_other_ob: Decimal = Decimal(0.0) - all_outs: Decimal = Decimal(0.0) - rem_singles: Decimal = Decimal(0.0) - rem_xbh: Decimal = Decimal(0.0) - rem_hr: Decimal = Decimal(0.0) - rem_doubles: Decimal = Decimal(0.0) - hard_rate: Decimal - med_rate: Decimal - soft_rate: Decimal - pull_rate: Decimal - center_rate: Decimal - slap_rate: Decimal - homerun: Decimal = Decimal(0.0) - bp_homerun: Decimal = Decimal(0.0) - triple: Decimal = Decimal(0.0) - double_three: Decimal = Decimal(0.0) - double_two: Decimal = Decimal(0.0) - double_pull: Decimal = Decimal(0.0) - single_two: Decimal = Decimal(0.0) - single_one: Decimal = Decimal(0.0) - single_center: Decimal = Decimal(0.0) - bp_single: Decimal = Decimal(0.0) - hbp: Decimal = Decimal(0.0) - walk: Decimal = Decimal(0.0) - strikeout: Decimal = Decimal(0.0) - lineout: Decimal = Decimal(0.0) - popout: Decimal = Decimal(0.0) - rem_flyballs: Decimal = Decimal(0.0) - flyout_a: Decimal = Decimal(0.0) - flyout_bq: Decimal = Decimal(0.0) - flyout_lf_b: Decimal = Decimal(0.0) - flyout_rf_b: Decimal = Decimal(0.0) - rem_groundballs: Decimal = Decimal(0.0) - groundout_a: Decimal = Decimal(0.0) - groundout_b: Decimal = Decimal(0.0) - groundout_c: Decimal = Decimal(0.0) - avg: Decimal = 0.0 - obp: Decimal = 0.0 - slg: Decimal = 0.0 + all_hits: float = 0.0 + all_other_ob: float = 0.0 + all_outs: float = 0.0 + rem_singles: float = 0.0 + rem_xbh: float = 0.0 + rem_hr: float = 0.0 + rem_doubles: float = 0.0 + hard_rate: float + med_rate: float + soft_rate: float + pull_rate: float + center_rate: float + slap_rate: float + homerun: float = 0.0 + bp_homerun: float = 0.0 + triple: float = 0.0 + double_three: float = 0.0 + double_two: float = 0.0 + double_pull: float = 0.0 + single_two: float = 0.0 + single_one: float = 0.0 + single_center: float = 0.0 + bp_single: float = 0.0 + hbp: float = 0.0 + walk: float = 0.0 + strikeout: float = 0.0 + lineout: float = 0.0 + popout: float = 0.0 + rem_flyballs: float = 0.0 + flyout_a: float = 0.0 + flyout_bq: float = 0.0 + flyout_lf_b: float = 0.0 + flyout_rf_b: float = 0.0 + rem_groundballs: float = 0.0 + groundout_a: float = 0.0 + groundout_b: float = 0.0 + groundout_c: float = 0.0 + avg: float = 0.0 + obp: float = 0.0 + slg: float = 0.0 def total_chances(self): - return Decimal(sum([ + return mround(sum([ self.homerun, self.bp_homerun, self.triple, self.double_three, self.double_two, self.double_pull, self.single_two, self.single_one, self.single_center, self.bp_single, self.hbp, self.walk, self.strikeout, self.lineout, self.popout, self.flyout_a, self.flyout_bq, self.flyout_lf_b, self.flyout_rf_b, @@ -62,7 +62,7 @@ class BattingCardRatingsModel(pydantic.BaseModel): ])) def total_hits(self): - return Decimal(sum([ + return mround(sum([ self.homerun, self.bp_homerun, self.triple, self.double_three, self.double_two, self.double_pull, self.single_two, self.single_one, self.single_center, self.bp_single ])) @@ -75,7 +75,7 @@ class BattingCardRatingsModel(pydantic.BaseModel): ])) def rem_outs(self): - return Decimal(self.all_outs - + return mround(self.all_outs - sum([ self.strikeout, self.lineout, self.popout, self.flyout_a, self.flyout_bq, self.flyout_lf_b, self.flyout_rf_b, self.groundout_a, self.groundout_b, self.groundout_c @@ -85,7 +85,7 @@ class BattingCardRatingsModel(pydantic.BaseModel): return self.all_other_ob - self.hbp - self.walk def calculate_singles(self, szn_singles, szn_hits, ifh_rate: Decimal): - tot = sanitize_chance_output(self.all_hits * Decimal((szn_singles * .8) / max(szn_hits, 1))) + tot = sanitize_chance_output(self.all_hits * mround((szn_singles * .8) / max(szn_hits, 1))) logging.debug(f'tot: {tot}') self.rem_singles = tot @@ -107,15 +107,15 @@ class BattingCardRatingsModel(pydantic.BaseModel): self.triple = triples(self.rem_xbh, szn_triples, szn_doubles + szn_hr) self.rem_xbh -= self.triple - tot_doubles = sanitize_chance_output(self.rem_xbh * Decimal(szn_doubles / max(szn_hr + szn_doubles, 1))) + tot_doubles = sanitize_chance_output(self.rem_xbh * mround(szn_doubles / max(szn_hr + szn_doubles, 1))) self.double_two = two_doubles(tot_doubles, self.soft_rate) self.double_pull = sanitize_chance_output(tot_doubles - self.double_two) - self.rem_xbh -= Decimal(self.double_two + self.double_pull) + self.rem_xbh -= mround(self.double_two + self.double_pull) - if (self.rem_xbh > Decimal(0)) and szn_hr > 0: + if (self.rem_xbh > mround(0)) and szn_hr > 0: self.bp_homerun = bp_homeruns(self.rem_xbh, hr_per_fb) self.homerun = sanitize_chance_output(self.rem_xbh - self.bp_homerun, min_chances=0.5) - self.rem_xbh -= Decimal(self.bp_homerun + self.homerun) + self.rem_xbh -= mround(self.bp_homerun + self.homerun) if szn_triples > 0 and self.rem_xbh > 0: logging.error(f'Adding {self.rem_xbh} results to triples') @@ -133,13 +133,13 @@ class BattingCardRatingsModel(pydantic.BaseModel): rem = self.all_other_ob - self.walk - self.hbp logging.error(f'Adding {rem} chances to all_outs') # print(self) - self.all_outs += Decimal(rem) + self.all_outs += mround(rem) def calculate_strikeouts(self, szn_so, szn_ab, szn_hits): self.strikeout = strikeouts(self.all_outs, (szn_so / max(szn_ab - szn_hits, 1))) def calculate_other_outs(self, fb_rate, ld_rate, gb_rate, szn_gidp, szn_ab): - self.rem_flyballs = sanitize_chance_output(self.rem_outs() * Decimal(fb_rate)) + self.rem_flyballs = sanitize_chance_output(self.rem_outs() * mround(fb_rate)) self.flyout_a = flyout_a(self.rem_flyballs, self.hard_rate) self.rem_flyballs -= self.flyout_a @@ -158,8 +158,8 @@ class BattingCardRatingsModel(pydantic.BaseModel): if self.rem_flyballs > 0: logging.debug(f'Adding {self.rem_flyballs} chances to lineouts') - tot_oneouts = sanitize_chance_output(self.rem_outs() * Decimal(ld_rate / max(ld_rate + gb_rate, .01))) - self.lineout = sanitize_chance_output(Decimal(random.random()) * tot_oneouts) + tot_oneouts = sanitize_chance_output(self.rem_outs() * mround(ld_rate / max(ld_rate + gb_rate, .01))) + self.lineout = sanitize_chance_output(mround(random.random()) * tot_oneouts) self.popout = sanitize_chance_output(tot_oneouts - self.lineout) self.groundout_a = groundball_a(self.rem_outs(), szn_gidp, szn_ab) @@ -167,9 +167,9 @@ class BattingCardRatingsModel(pydantic.BaseModel): self.groundout_b = self.rem_outs() def calculate_rate_stats(self): - self.avg = Decimal(round(self.total_hits() / 108, 3)) - self.obp = Decimal(round((self.total_hits() + self.hbp + self.walk) / 108, 3)) - self.slg = Decimal(round( + self.avg = mround(round(self.total_hits() / 108, 3)) + self.obp = mround(round((self.total_hits() + self.hbp + self.walk) / 108, 3)) + self.slg = mround(round( self.homerun * 4 + self.triple * 3 + self.single_center + self.single_two + self.single_two + (self.double_two + self.double_three + self.double_two + self.bp_homerun) * 2 + self.bp_single / 2 )) @@ -178,31 +178,31 @@ class BattingCardRatingsModel(pydantic.BaseModel): return { 'battingcard_id': self.battingcard_id, 'vs_hand': self.vs_hand, - 'homerun': float(self.homerun), - 'bp_homerun': float(self.bp_homerun), - 'triple': float(self.triple), - 'double_three': float(self.double_three), - 'double_two': float(self.double_two), - 'double_pull': float(self.double_pull), - 'single_two': float(self.single_two), - 'single_one': float(self.single_one), - 'single_center': float(self.single_center), - 'bp_single': float(self.bp_single), - 'hbp': float(self.hbp), - 'walk': float(self.walk), - 'strikeout': float(self.strikeout), - 'lineout': float(self.lineout), - 'popout': float(self.popout), - 'flyout_a': float(self.flyout_a), - 'flyout_bq': float(self.flyout_bq), - 'flyout_lf_b': float(self.flyout_lf_b), - 'flyout_rf_b': float(self.flyout_rf_b), - 'groundout_a': float(self.groundout_a), - 'groundout_b': float(self.groundout_b), - 'groundout_c': float(self.groundout_c), - 'pull_rate': float(self.pull_rate), - 'center_rate': float(self.center_rate), - 'slap_rate': float(self.slap_rate) + 'homerun': self.homerun, + 'bp_homerun': self.bp_homerun, + 'triple': self.triple, + 'double_three': self.double_three, + 'double_two': self.double_two, + 'double_pull': self.double_pull, + 'single_two': self.single_two, + 'single_one': self.single_one, + 'single_center': self.single_center, + 'bp_single': self.bp_single, + 'hbp': self.hbp, + 'walk': self.walk, + 'strikeout': mround(self.strikeout), + 'lineout': self.lineout, + 'popout': self.popout, + 'flyout_a': self.flyout_a, + 'flyout_bq': self.flyout_bq, + 'flyout_lf_b': self.flyout_lf_b, + 'flyout_rf_b': self.flyout_rf_b, + 'groundout_a': self.groundout_a, + 'groundout_b': self.groundout_b, + 'groundout_c': self.groundout_c, + 'pull_rate': self.pull_rate, + 'center_rate': self.center_rate, + 'slap_rate': self.slap_rate } # def total_chances(chance_data): @@ -220,27 +220,27 @@ def total_singles(all_hits, szn_singles, szn_hits): def bp_singles(all_singles): if all_singles < 6: - return Decimal(0) + return mround(0) else: - return Decimal(5) + return mround(5) def wh_singles(rem_singles, hard_rate): if rem_singles == 0 or hard_rate < .2: return 0 elif hard_rate > .4: - return sanitize_chance_output(rem_singles * Decimal(.666), min_chances=2) + return sanitize_chance_output(rem_singles * 2 / 3, min_chances=2) else: - return sanitize_chance_output(rem_singles * Decimal(.333), min_chances=2) + return sanitize_chance_output(rem_singles / 3, min_chances=2) def one_singles(rem_singles, ifh_rate, force_rem=False): if force_rem: return mround(rem_singles) elif rem_singles == 0 or ifh_rate < .05: - return Decimal(0) + return mround(0) else: - return sanitize_chance_output(rem_singles * ifh_rate * Decimal(3), min_chances=2) + return sanitize_chance_output(rem_singles * ifh_rate * mround(3), min_chances=2) def all_homeruns(rem_hits, all_hits, hrs, hits, singles): @@ -252,7 +252,7 @@ def all_homeruns(rem_hits, all_hits, hrs, hits, singles): def nd_homeruns(all_hr, hr_rate): if all_hr == 0 or hr_rate == 0: - return Decimal(0) + return mround(0) elif hr_rate > .2: return sanitize_chance_output(all_hr * .6) else: @@ -261,23 +261,23 @@ def nd_homeruns(all_hr, hr_rate): def bp_homeruns(all_hr, hr_rate): if all_hr == 0 or hr_rate == 0: - return Decimal(0) + return mround(0) elif hr_rate > .2: - return sanitize_chance_output(all_hr * Decimal(.4), rounding=1.0) + return sanitize_chance_output(all_hr * mround(.4), rounding=1.0) else: - return sanitize_chance_output(all_hr * Decimal(.8), rounding=1.0) + return sanitize_chance_output(all_hr * mround(.8), rounding=1.0) def triples(all_xbh, tr_count, do_count): - if all_xbh == Decimal(0) or tr_count == Decimal(0): - return Decimal(0) + if all_xbh == mround(0) or tr_count == mround(0): + return mround(0) else: - return sanitize_chance_output(all_xbh * Decimal(tr_count / max(tr_count + do_count, 1)), min_chances=1) + return sanitize_chance_output(all_xbh * mround(tr_count / max(tr_count + do_count, 1)), min_chances=1) def two_doubles(all_doubles, soft_rate): if all_doubles == 0 or soft_rate == 0: - return Decimal(0) + return mround(0) elif soft_rate > .2: return sanitize_chance_output(all_doubles / 2) else: @@ -285,36 +285,36 @@ def two_doubles(all_doubles, soft_rate): def hit_by_pitch(other_ob, hbps, walks): - if hbps == 0 or other_ob * Decimal(hbps / max(hbps + walks, 1)) < 1: + if hbps == 0 or other_ob * mround(hbps / max(hbps + walks, 1)) < 1: return 0 else: - return sanitize_chance_output(other_ob * Decimal(hbps / max(hbps + walks, 1)), rounding=1.0) + return sanitize_chance_output(other_ob * mround(hbps / max(hbps + walks, 1)), rounding=1.0) def strikeouts(all_outs, k_rate): if all_outs == 0 or k_rate == 0: - return Decimal(0) + return mround(0) else: - return sanitize_chance_output(all_outs * Decimal(k_rate)) + return sanitize_chance_output(all_outs * k_rate) def flyout_a(all_flyouts, hard_rate): if all_flyouts == 0 or hard_rate < .4: - return Decimal(0) + return mround(0) else: - return Decimal(1.0) + return mround(1.0) def flyout_bq(rem_flyouts, soft_rate): if rem_flyouts == 0 or soft_rate < .1: - return Decimal(0) + return mround(0) else: - return sanitize_chance_output(rem_flyouts * min(soft_rate * 3, Decimal(.75))) + return sanitize_chance_output(rem_flyouts * min(soft_rate * 3, mround(.75))) def flyout_b(rem_flyouts, pull_rate, cent_rate): if rem_flyouts == 0 or pull_rate == 0: - return Decimal(0) + return mround(0) else: return sanitize_chance_output(rem_flyouts * (pull_rate + cent_rate / 2)) @@ -328,14 +328,14 @@ def popouts(rem_outs, iffb_rate): def groundball_a(all_groundouts, gidps, abs): if all_groundouts == 0 or gidps == 0: - return Decimal(0) + return mround(0) else: - return sanitize_chance_output(Decimal(min(gidps ** 2.5, abs) / max(abs, 1)) * all_groundouts) + return sanitize_chance_output(mround(min(gidps ** 2.5, abs) / max(abs, 1)) * all_groundouts) def groundball_c(rem_groundouts, med_rate): if rem_groundouts == 0 or med_rate < .4: - return Decimal(0) + return mround(0) elif med_rate > .6: return sanitize_chance_output(rem_groundouts) else: @@ -529,11 +529,11 @@ def get_batter_ratings(df_data) -> List[dict]: center_rate=df_data['Cent%_vR'], slap_rate=df_data['Oppo%_vR'] ) - vl.all_outs = Decimal(108 - vl.all_hits - vl.all_other_ob).quantize(Decimal("0.05")) - vr.all_outs = Decimal(108 - vr.all_hits - vr.all_other_ob).quantize(Decimal("0.05")) + vl.all_outs = mround(108 - vl.all_hits - vl.all_other_ob) #.quantize(Decimal("0.05")) + vr.all_outs = mround(108 - vr.all_hits - vr.all_other_ob) #.quantize(Decimal("0.05")) - vl.calculate_singles(df_data['1B_vL'], df_data['H_vL'], Decimal(df_data['IFH%_vL'])) - vr.calculate_singles(df_data['1B_vR'], df_data['H_vR'], Decimal(df_data['IFH%_vR'])) + vl.calculate_singles(df_data['1B_vL'], df_data['H_vL'], mround(df_data['IFH%_vL'])) + vr.calculate_singles(df_data['1B_vR'], df_data['H_vR'], mround(df_data['IFH%_vR'])) logging.debug( f'vL - All Hits: {vl.all_hits} / Other OB: {vl.all_other_ob} / All Outs: {vl.all_outs} ' @@ -574,11 +574,11 @@ def get_batter_ratings(df_data) -> List[dict]: # Correct total chance errors for x in [vl, vr]: if x.total_chances() < 108: - diff = Decimal(108) - x.total_chances() + diff = mround(108) - x.total_chances() logging.error(f'Adding {diff} strikeouts to close gap') x.strikeout += diff elif x.total_chances() > 108: - diff = x.total_chances() - Decimal(108) + diff = x.total_chances() - mround(108) logging.error(f'Have surplus of {diff} chances') if x.strikeout + 1 > diff: logging.error(f'Subtracting {diff} strikeouts to close gap') diff --git a/batters/stat_prep.py b/batters/stat_prep.py new file mode 100644 index 0000000..b4cea26 --- /dev/null +++ b/batters/stat_prep.py @@ -0,0 +1,106 @@ +import pandas as pd +import pydantic +from pydantic import root_validator, validator +from typing import Literal, Optional + + +class DataMismatchError(Exception): + pass + + +class BattingCardModel(pydantic.BaseModel): + player_id: Optional[int] = None + variant: int = 0 + steal_low: int = 3 + steal_high: int = 20 + steal_auto: bool = False + steal_jump: float = 0 + bunting: str = 'C' + hit_and_run: str = 'C' + running: int = 10 + offense_col: int = None + hand: Literal['R', 'L', 'S'] = 'R' + + +class CardPositionModel(pydantic.BaseModel): + player_id: int + variant: int = 0 + position: Literal['P', 'C', '1B', '2B', '3B', 'SS', 'LF', 'CF', 'RF', 'DH'] + innings: int = 1 + range: int = 5 + error: int = 0 + arm: Optional[int] = None + pb: Optional[int] = None + overthrow: Optional[int] = None + + @root_validator + def position_validator(cls, values): + if values['position'] in ['C', 'LF', 'CF', 'RF'] and values['arm'] is None: + raise ValueError(f'{values["position"]} must have an arm rating') + if values['position'] == 'C' and (values['pb'] is None or values['overthrow'] is None): + raise ValueError('Catchers must have a pb and overthrow rating') + return values + + +class BattingCardRatingsModel(pydantic.BaseModel): + battingcard_id: int + vs_hand: Literal['R', 'L', 'vR', 'vL'] + homerun: float = 0.0 + bp_homerun: float = 0.0 + triple: float = 0.0 + double_three: float = 0.0 + double_two: float = 0.0 + double_pull: float = 0.0 + single_two: float = 0.0 + single_one: float = 0.0 + single_center: float = 0.0 + bp_single: float = 0.0 + hbp: float = 0.0 + walk: float = 0.0 + strikeout: float = 0.0 + lineout: float = 0.0 + popout: float = 0.0 + flyout_a: float = 0.0 + flyout_bq: float = 0.0 + flyout_lf_b: float = 0.0 + flyout_rf_b: float = 0.0 + groundout_a: float = 0.0 + groundout_b: float = 0.0 + groundout_c: float = 0.0 + avg: float = 0.0 + obp: float = 0.0 + slg: float = 0.0 + pull_rate: float = 0.0 + center_rate: float = 0.0 + slap_rate: float = 0.0 + + @validator("avg", always=True) + def avg_validator(cls, v, values, **kwargs): + return (values['homerun'] + values['bp_homerun'] / 2 + values['triple'] + values['double_three'] + + values['double_two'] + values['double_pull'] + values['single_two'] + values['single_one'] + + values['single_center'] + values['bp_single'] / 2) / 108 + + @validator("obp", always=True) + def obp_validator(cls, v, values, **kwargs): + return ((values['hbp'] + values['walk']) / 108) + values['avg'] + + @validator("slg", always=True) + def slg_validator(cls, v, values, **kwargs): + return (values['homerun'] * 4 + values['bp_homerun'] * 2 + values['triple'] * 3 + values['double_three'] * 2 + + values['double_two'] * 2 + values['double_pull'] * 2 + values['single_two'] + values['single_one'] + + values['single_center'] + values['bp_single'] / 2) / 108 + + @root_validator + def validate_chance_total(cls, values): + total_chances = ( + values['homerun'] + values['bp_homerun'] + values['triple'] + values['double_three'] + + values['double_two'] + values['double_pull'] + values['single_two'] + values['single_one'] + + values['single_center'] + values['bp_single'] + values['hbp'] + values['walk'] + + values['strikeout'] + values['lineout'] + values['popout'] + values['flyout_a'] + + values['flyout_bq'] + values['flyout_lf_b'] + values['flyout_rf_b'] + values['groundout_a'] + + values['groundout_b'] + values['groundout_c']) + + if round(total_chances) != 108: + raise ValueError(f'BC {values["battingcard_id"]} must have exactly 108 chances on the card ' + f'{values["vs_hand"]}; {round(total_chances)} listed') + return values diff --git a/creation_helpers.py b/creation_helpers.py index c342d19..d1102ed 100644 --- a/creation_helpers.py +++ b/creation_helpers.py @@ -865,7 +865,7 @@ def sanitize_chance_output(total_chances, min_chances=1.0, rounding=0.05): logging.debug(f'sanitize: {total_chances} is less than min_chances ({min_chances}); returning 0') return 0 - rounded_decimal = round(Decimal(str(total_chances)) / Decimal(str(rounding))) * Decimal(str(rounding)) + rounded_decimal = mround(Decimal(str(total_chances)) / Decimal(str(rounding)) * Decimal(str(rounding))) exact_chances = [ Decimal('1.05'), Decimal('1.1'), Decimal('1.2'), Decimal('1.25'), Decimal('1.3'), Decimal('1.35'), @@ -879,7 +879,7 @@ def sanitize_chance_output(total_chances, min_chances=1.0, rounding=0.05): ] if rounded_decimal > exact_chances[-1]: - return float(rounded_decimal) + return rounded_decimal for x in exact_chances: if rounded_decimal <= x: diff --git a/defenders/calcs_defense.py b/defenders/calcs_defense.py index 1270ac1..2dea882 100644 --- a/defenders/calcs_defense.py +++ b/defenders/calcs_defense.py @@ -530,13 +530,18 @@ def get_bbref_fielding_df( row_data = [] col_names = [] for cell in row.find_all('td'): - try: + if cell.has_attr('data-append-csv'): player_id = cell['data-append-csv'] row_data.append(player_id) if len(headers) == 0: col_names.append('key_bbref') - except Exception as e: - pass + # try: + # player_id = cell['data-append-csv'] + # row_data.append(player_id) + # if len(headers) == 0: + # col_names.append('key_bbref') + # except Exception as e: + # pass row_data.append(cell.text) if len(headers) == 0: col_names.append(cell['data-stat']) diff --git a/retrosheet_data.py b/retrosheet_data.py index 1ec19f3..6e15ebd 100644 --- a/retrosheet_data.py +++ b/retrosheet_data.py @@ -9,6 +9,11 @@ import pandas as pd import pybaseball as pb from pybaseball import cache +from creation_helpers import get_args +from batters.stat_prep import DataMismatchError +import batters.calcs_batter as cba +import defenders.calcs_defense as cde + cache.enable() date = f'{datetime.datetime.now().year}-{datetime.datetime.now().month}-{datetime.datetime.now().day}' log_level = logging.INFO @@ -17,9 +22,23 @@ logging.basicConfig( format='%(asctime)s - retrosheet_data - %(levelname)s - %(message)s', level=log_level ) -FILE_PATH = 'data-input/retrosheet/' + +RETRO_FILE_PATH = 'data-input/retrosheet/' EVENTS_FILENAME = 'retrosheets_events_1998_short.csv' # Removed last few columns which were throwing dtype errors PERSONNEL_FILENAME = 'retrosheets_personnel.csv' +DATA_INPUT_FILE_PATH = 'data-input/1998 Season Cardset/' + +MIN_PA_VL = 20 +MIN_PA_VR = 40 +MIN_TBF_VL = MIN_PA_VL +MIN_TBF_VR = MIN_PA_VR + + +async def store_defense_to_csv(season: int): + for position in ['c', '1b', '2b', '3b', 'ss', 'lf', 'cf', 'rf', 'of', 'p']: + pos_df = cde.get_bbref_fielding_df(position, season) + pos_df.to_csv(f'{DATA_INPUT_FILE_PATH}defense_{position}.csv') + await asyncio.sleep(8) def get_events_by_date(file_path: str, start_date: int, end_date: int) -> pd.DataFrame: @@ -34,12 +53,25 @@ def get_result_series(plays: pd.DataFrame, event_type: str, pitcher_hand: Litera return this_series -# def get_batting_handedness(plays: pd.DataFrame) -> pd.DataFrame: +def get_run_stat_df(input_path: str): + run_data = pd.read_csv(f'{input_path}running.csv') #.set_index('Name-additional')) + # if 'Player' in run_data: + # run_data = run_data.rename(columns={'Player': 'Full Name'}) + # if 'Name' in run_data: + # run_data = run_data.rename(columns={'Name': 'Full Name'}) + if 'Player-additional' in run_data: + run_data = run_data.rename(columns={'Player-additional': 'key_bbref'}) + if 'Name-additional' in run_data: + run_data = run_data.rename(columns={'Name-additional': 'key_bbref'}) + run_data = run_data[['key_bbref', 'ROE', 'XI', 'RS%', 'SBO', 'SB', 'CS', 'SB%', 'SB2', 'CS2', 'SB3', 'CS3', 'SBH', 'CSH', 'PO', 'PCS', 'OOB', 'OOB1', 'OOB2', 'OOB3', 'OOBHm', 'BT', 'XBT%', '1stS', '1stS2', '1stS3', '1stD', '1stD3', '1stDH', '2ndS', '2ndS3', '2ndSH']] + + run_data = run_data.fillna(0) + return run_data.set_index('key_bbref') def get_player_ids(plays: pd.DataFrame, which: Literal['batters', 'pitchers']) -> pd.DataFrame: - RETRO_PLAYERS = pd.read_csv(f'{FILE_PATH}{PERSONNEL_FILENAME}') + RETRO_PLAYERS = pd.read_csv(f'{RETRO_FILE_PATH}{PERSONNEL_FILENAME}') id_key = 'batter_id' if which == 'batters' else 'pitcher_id' players = pd.DataFrame() @@ -110,19 +142,18 @@ def get_base_batting_df(all_plays: pd.DataFrame) -> pd.DataFrame: bs = pd.concat([bs, pal_series], axis=1) par_series = all_plays[(all_plays.batter_event == 't') & (all_plays.pitcher_hand == 'r')].groupby('batter_id').count()['event_type'].astype(int).rename('PA_vR') bs = pd.concat([bs, par_series], axis=1) - # bs = bs.dropna().query('PA_vL >= 20 & PA_vR >= 40') abl_series = all_plays[(all_plays.ab == 't') & (all_plays.pitcher_hand == 'l')].groupby('batter_id').count()['event_type'].astype(int).rename('AB_vL') bs = pd.concat([bs, abl_series], axis=1) abr_series = all_plays[(all_plays.ab == 't') & (all_plays.pitcher_hand == 'r')].groupby('batter_id').count()['event_type'].astype(int).rename('AB_vR') bs = pd.concat([bs, abr_series], axis=1) - return bs.dropna().query('PA_vL >= 20 & PA_vR >= 40') + return bs.dropna().query(f'PA_vL >= {MIN_PA_VL} & PA_vR >= {MIN_PA_VR}') -def get_batting_stats_by_date(file_path, start_date: int, end_date: int) -> pd.DataFrame: +def get_batting_stats_by_date(retro_file_path, start_date: int, end_date: int) -> pd.DataFrame: start = datetime.datetime.now() - all_plays = get_events_by_date(file_path, start_date, end_date) + all_plays = get_events_by_date(retro_file_path, start_date, end_date) print(f'Pull events: {(datetime.datetime.now() - start).total_seconds():.2f}s') start = datetime.datetime.now() @@ -291,20 +322,248 @@ def get_batting_stats_by_date(file_path, start_date: int, end_date: int) -> pd.D return batting_stats -async def main(args): +def calc_batting_cards(bs: pd.DataFrame) -> pd.DataFrame: + def create_batting_card(row): + steal_data = cba.stealing( + chances=int(row['SBO']), + sb2s=int(row['SB2']), + cs2s=int(row['CS2']), + sb3s=int(row['SB3']), + cs3s=int(row['CS3']), + season_pct=1.0 + ) + y = pd.DataFrame({ + 'key_bbref': [row['key_bbref']], + 'steal_low': [steal_data[0]], + 'steal_high': [steal_data[1]], + 'steal_auto': [steal_data[2]], + 'steal_jump': [steal_data[3]], + 'hit_and_run': [cba.hit_and_run( + row['AB_vL'], row['AB_vR'], row['H_vL'], row['H_vR'], + row['HR_vL'], row['HR_vR'], row['SO_vL'], row['SO_vR'] + )], + 'bunt': [0], + 'running': [cba.running(row['XBT%'])], + 'hand': [row['bat_hand']], + }) + return y.loc[0] + + all_cards = bs.apply(create_batting_card, axis=1) + + return all_cards + + +def calc_batter_ratings(bs: pd.DataFrame) -> pd.DataFrame: + def create_batting_rating(row): + ratings = cba.get_batter_ratings(row) + # list_of_ratings = ratings[0] + x = pd.DataFrame({ + 'key_bbref': [row['key_bbref']], + 'ratings_vL': [ratings[0]], + 'ratings_vR': [ratings[1]] + }) + return x.loc[0] + + all_ratings = bs.apply(create_batting_rating, axis=1) + + return all_ratings + + +def calc_positions(bs: pd.DataFrame) -> pd.DataFrame: + def process_pos(row): + no_data = True + for pos_data in [(df_1b, '1b'), (df_2b, '2b'), (df_3b, '3b'), (df_ss, 'ss')]: + if row['key_bbref'] in pos_data[0].index: + logging.info(f'Running {pos_data[1]} stats for {row["p_name"]}') + try: + average_range = (int(pos_data[0].at[row["key_bbref"], 'tz_runs_total']) + + int(pos_data[0].at[row["key_bbref"], 'bis_runs_total']) + + min( + int(pos_data[0].at[row["key_bbref"], 'tz_runs_total']), + int(pos_data[0].at[row["key_bbref"], 'bis_runs_total']) + )) / 3 + + position_payload.append({ # TODO: convert position_payload to a list? + "player_id": int(row['player_id']), + "position": pos_data[1].upper(), + "innings": float(pos_data[0].at[row["key_bbref"], 'Inn_def']), + "range": get_if_range( + pos_code=pos_data[1], + tz_runs=round(average_range), + r_dp=0, + season_pct=season_pct + ), + "error": get_any_error( + pos_code=pos_data[1], + errors=int(pos_data[0].at[row["key_bbref"], 'E_def']), + chances=int(pos_data[0].at[row["key_bbref"], 'chances']), + season_pct=season_pct + ) + }) + no_data = False + except Exception as e: + logging.info(f'Infield position failed: {e}') + + of_arms = [] + of_payloads = [] + for pos_data in [(df_lf, 'lf'), (df_cf, 'cf'), (df_rf, 'rf')]: + if row["key_bbref"] in pos_data[0].index: + try: + average_range = (int(pos_data[0].at[row["key_bbref"], 'tz_runs_total']) + + int(pos_data[0].at[row["key_bbref"], 'bis_runs_total']) + + min( + int(pos_data[0].at[row["key_bbref"], 'tz_runs_total']), + int(pos_data[0].at[row["key_bbref"], 'bis_runs_total']) + )) / 3 + of_payloads.append({ + "player_id": int(row['player_id']), + "position": pos_data[1].upper(), + "innings": float(pos_data[0].at[row["key_bbref"], 'Inn_def']), + "range": get_of_range( + pos_code=pos_data[1], + tz_runs=round(average_range), + season_pct=season_pct + ) + }) + of_arms.append(int(pos_data[0].at[row["key_bbref"], 'bis_runs_outfield'])) + no_data = False + except Exception as e: + logging.info(f'Outfield position failed: {e}') + + if row["key_bbref"] in df_of.index and len(of_arms) > 0 and len(of_payloads) > 0: + try: + error_rating = get_any_error( + pos_code=pos_data[1], + errors=int(df_of.at[row["key_bbref"], 'E_def']), + chances=int(df_of.at[row["key_bbref"], 'chances']), + season_pct=season_pct + ) + arm_rating = arm_outfield(of_arms) + for f in of_payloads: + f['error'] = error_rating + f['arm'] = arm_rating + position_payload.append(f) + no_data = False + except Exception as e: + logging.info(f'Outfield position failed: {e}') + + if row["key_bbref"] in df_c.index: + try: + if df_c.at[row["key_bbref"], 'SB'] + df_c.at[row["key_bbref"], 'CS'] == 0: + arm_rating = 3 + else: + arm_rating = arm_catcher( + cs_pct=df_c.at[row["key_bbref"], 'caught_stealing_perc'], + raa=int(df_c.at[row["key_bbref"], 'bis_runs_catcher_sb']), + season_pct=season_pct + ) + position_payload.append({ + "player_id": int(row['player_id']), + "position": 'C', + "innings": float(df_c.at[row["key_bbref"], 'Inn_def']), + "range": range_catcher( + rs_value=int(df_c.at[row["key_bbref"], 'tz_runs_catcher']), + season_pct=season_pct + ), + "error": get_any_error( + pos_code='c', + errors=int(df_c.at[row["key_bbref"], 'E_def']), + chances=int(df_c.at[row["key_bbref"], 'chances']), + season_pct=season_pct + ), + "arm": arm_rating, + "pb": pb_catcher( + pb=int(df_c.at[row["key_bbref"], 'PB']), + innings=int(float(df_c.at[row["key_bbref"], 'Inn_def'])), + season_pct=season_pct + ), + "overthrow": ot_catcher( + errors=int(df_c.at[row["key_bbref"], 'E_def']), + chances=int(df_c.at[row["key_bbref"], 'chances']), + season_pct=season_pct + ) + }) + no_data = False + except Exception as e: + logging.info(f'Catcher position failed: {e}') + + if no_data: + position_payload.append({ + "player_id": int(row['player_id']), + "position": 'DH', + "innings": row['PA_vL'] + row['PA_vR'] + }) + + all_pos = bs.apply(process_pos, axis=1) + + return all_pos + + +def run_batters(data_input_path: str, start_date: int, end_date: int): print(f'Running the batter calcs...') batter_start = datetime.datetime.now() - data = get_batting_stats_by_date(f'{FILE_PATH}{EVENTS_FILENAME}', start_date=19980101, end_date=19980430) + + # Get batting stats + batting_stats = get_batting_stats_by_date(f'{RETRO_FILE_PATH}{EVENTS_FILENAME}', start_date=start_date, end_date=end_date) + bs_len = len(batting_stats) + + end_calc = datetime.datetime.now() + print(f'Batting stats: {(end_calc - batter_start).total_seconds():.2f}s') + running_start = datetime.datetime.now() + + # Get running stats + running_stats = get_run_stat_df(data_input_path) + run_len = len(running_stats) + + batting_stats = pd.merge( + left=batting_stats, + right=running_stats, + how='left', + left_on='key_bbref', + right_on='key_bbref' + ) end_calc = datetime.datetime.now() + print(f'Running stats: {(end_calc - running_start).total_seconds():.2f}s') - data.to_csv(f'batting_stats.csv') - end_save = datetime.datetime.now() - print(f'\nBatter time: {(end_calc - batter_start).total_seconds():.2f}s\nSave time: {(end_save - end_calc).total_seconds():.2f}s') + if len(batting_stats) != bs_len: + raise DataMismatchError(f'retrosheet_data - run_batters - We started with {bs_len} batting lines and have {len(batting_stats)} after merging with running_stats') + + # Calculate batting cards + card_start = datetime.datetime.now() + all_batting_cards = calc_batting_cards(batting_stats) + card_end = datetime.datetime.now() - pitcher_start = datetime.datetime.now() - end_pitcher = datetime.datetime.now() + print(f'Create batting cards: {(card_end - card_start).total_seconds()}s') - print(f'\nPitcher time: {(end_pitcher - pitcher_start).total_seconds():.2f}s\n\nTotal: {(end_pitcher - batter_start).total_seconds():.2f}s\n\nDone!') + # Calculate batting ratings + rating_start = datetime.datetime.now() + batting_stats['battingcard_id'] = batting_stats['key_fangraphs'] + all_batting_ratings = calc_batter_ratings(batting_stats) + rating_end = datetime.datetime.now() + + print(f'Create batting ratings: {(rating_end - rating_start).total_seconds()}s') + + # Calculate defense ratings + defense_start = datetime.datetime.now() + all_defense_ratings = calc_positions(batting_stats) + defense_end = datetime.datetime.now() + + print(f'Create defense ratings: {(defense_end - defense_start).total_seconds()}s') + + return batting_stats + + +async def main(args): + # batter_start = datetime.datetime.now() + # batting_stats = run_batters(f'{DATA_INPUT_FILE_PATH}', start_date=19980101, end_date=19980430) + # batting_stats.to_csv(f'batting_stats.csv') + # batter_end = datetime.datetime.now() + + # pitcher_start = datetime.datetime.now() + # pitcher_end = datetime.datetime.now() + + # print(f'\n\nBatter time: {(batter_end - batter_start).total_seconds():.2f}s \nPitcher time: {(pitcher_end - pitcher_start).total_seconds():.2f}s\nTotal: {(pitcher_end - batter_start).total_seconds():.2f}s\n\nDone!') + await store_defense_to_csv(1998) if __name__ == '__main__':