diff --git a/batters/calcs_batter.py b/batters/calcs_batter.py index 4f287da..e261db7 100644 --- a/batters/calcs_batter.py +++ b/batters/calcs_batter.py @@ -8,14 +8,17 @@ from exceptions import logger from batters.models import BattingCardRatingsModel from batters.card_builder import build_batter_full_cards -def stealing(chances: int, sb2s: int, cs2s: int, sb3s: int, cs3s: int, season_pct: float): + +def stealing( + chances: int, sb2s: int, cs2s: int, sb3s: int, cs3s: int, season_pct: float +): if chances == 0 or sb2s + cs2s == 0: return 0, 0, False, 0 total_attempts = sb2s + cs2s + sb3s + cs3s attempt_pct = total_attempts / chances - if attempt_pct >= .08: + if attempt_pct >= 0.08: st_auto = True else: st_auto = False @@ -62,7 +65,7 @@ def stealing_line(steal_data: dict): jump_chances = round(sd[3] * 36) if jump_chances == 0: - good_jump = '-' + good_jump = "-" elif jump_chances <= 6: if jump_chances == 6: good_jump = 7 @@ -77,76 +80,76 @@ def stealing_line(steal_data: dict): elif jump_chances == 1: good_jump = 2 elif jump_chances == 7: - good_jump = '4,5' + good_jump = "4,5" elif jump_chances == 8: - good_jump = '4,6' + good_jump = "4,6" elif jump_chances == 9: - good_jump = '3-5' + good_jump = "3-5" elif jump_chances == 10: - good_jump = '2-5' + good_jump = "2-5" elif jump_chances == 11: - good_jump = '6,7' + good_jump = "6,7" elif jump_chances == 12: - good_jump = '4-6' + good_jump = "4-6" elif jump_chances == 13: - good_jump = '2,4-6' + good_jump = "2,4-6" elif jump_chances == 14: - good_jump = '3-6' + good_jump = "3-6" elif jump_chances == 15: - good_jump = '2-6' + good_jump = "2-6" elif jump_chances == 16: - good_jump = '2,5-6' + good_jump = "2,5-6" elif jump_chances == 17: - good_jump = '3,5-6' + good_jump = "3,5-6" elif jump_chances == 18: - good_jump = '4-6' + good_jump = "4-6" elif jump_chances == 19: - good_jump = '2,4-7' + good_jump = "2,4-7" elif jump_chances == 20: - good_jump = '3-7' + good_jump = "3-7" elif jump_chances == 21: - good_jump = '2-7' + 
good_jump = "2-7" elif jump_chances == 22: - good_jump = '2-7,12' + good_jump = "2-7,12" elif jump_chances == 23: - good_jump = '2-7,11' + good_jump = "2-7,11" elif jump_chances == 24: - good_jump = '2,4-8' + good_jump = "2,4-8" elif jump_chances == 25: - good_jump = '3-8' + good_jump = "3-8" elif jump_chances == 26: - good_jump = '2-8' + good_jump = "2-8" elif jump_chances == 27: - good_jump = '2-8,12' + good_jump = "2-8,12" elif jump_chances == 28: - good_jump = '2-8,11' + good_jump = "2-8,11" elif jump_chances == 29: - good_jump = '3-9' + good_jump = "3-9" elif jump_chances == 30: - good_jump = '2-9' + good_jump = "2-9" elif jump_chances == 31: - good_jump = '2-9,12' + good_jump = "2-9,12" elif jump_chances == 32: - good_jump = '2-9,11' + good_jump = "2-9,11" elif jump_chances == 33: - good_jump = '2-10' + good_jump = "2-10" elif jump_chances == 34: - good_jump = '3-11' + good_jump = "3-11" elif jump_chances == 35: - good_jump = '2-11' + good_jump = "2-11" else: - good_jump = '2-12' + good_jump = "2-12" return f'{"*" if sd[2] else ""}{good_jump}/- ({sd[1] if sd[1] else "-"}-{sd[0] if sd[0] else "-"})' def running(extra_base_pct: str): - if extra_base_pct == '': + if extra_base_pct == "": return 8 try: xb_pct = float(extra_base_pct.strip("%")) / 80 except Exception as e: - logger.error(f'calcs_batter running - {e}') + logger.error(f"calcs_batter running - {e}") xb_pct = 20 return max(min(round(6 + (10 * xb_pct)), 17), 8) @@ -154,26 +157,36 @@ def running(extra_base_pct: str): def bunting(num_bunts: int, season_pct: float): if num_bunts > max(round(10 * season_pct), 4): - return 'A' + return "A" elif num_bunts > max(round(5 * season_pct), 2): - return 'B' + return "B" elif num_bunts > 1: - return 'C' + return "C" else: - return 'D' + return "D" - -def hit_and_run(ab_vl: int, ab_vr: int, hits_vl: int, hits_vr: int, hr_vl: int, hr_vr: int, so_vl: int, so_vr: int): - babip = (hits_vr + hits_vl - hr_vl - hr_vr) / max(ab_vl + ab_vr - so_vl - so_vr - hr_vl - hr_vl, 1) - 
if babip >= .35: - return 'A' - elif babip >= .3: - return 'B' - elif babip >= .25: - return 'C' +def hit_and_run( + ab_vl: int, + ab_vr: int, + hits_vl: int, + hits_vr: int, + hr_vl: int, + hr_vr: int, + so_vl: int, + so_vr: int, +): + babip = (hits_vr + hits_vl - hr_vl - hr_vr) / max( + ab_vl + ab_vr - so_vl - so_vr - hr_vl - hr_vr, 1 + ) + if babip >= 0.35: + return "A" + elif babip >= 0.3: + return "B" + elif babip >= 0.25: + return "C" else: - return 'D' + return "D" def get_batter_ratings(df_data) -> List[dict]: @@ -181,120 +194,154 @@ def get_batter_ratings(df_data) -> List[dict]: offense_mod = 1.2 vl = BattingCardRatingsModel( battingcard_id=df_data.battingcard_id, - bat_hand=df_data['bat_hand'], - vs_hand='L', - all_hits=sanitize_chance_output(108 * offense_mod * df_data['AVG_vL']), - all_other_ob=sanitize_chance_output(108 * offense_mod * - ((df_data['BB_vL'] + df_data['HBP_vL']) / df_data['PA_vL'])), - hard_rate=df_data['Hard%_vL'], - med_rate=df_data['Med%_vL'], - soft_rate=df_data['Soft%_vL'], - pull_rate=df_data['Pull%_vL'], - center_rate=df_data['Cent%_vL'], - slap_rate=df_data['Oppo%_vL'] + bat_hand=df_data["bat_hand"], + vs_hand="L", + all_hits=sanitize_chance_output(108 * offense_mod * df_data["AVG_vL"]), + all_other_ob=sanitize_chance_output( + 108 + * offense_mod + * ((df_data["BB_vL"] + df_data["HBP_vL"]) / df_data["PA_vL"]) + ), + hard_rate=df_data["Hard%_vL"], + med_rate=df_data["Med%_vL"], + soft_rate=df_data["Soft%_vL"], + pull_rate=df_data["Pull%_vL"], + center_rate=df_data["Cent%_vL"], + slap_rate=df_data["Oppo%_vL"], ) vr = BattingCardRatingsModel( battingcard_id=df_data.battingcard_id, - bat_hand=df_data['bat_hand'], - vs_hand='R', - all_hits=sanitize_chance_output(108 * offense_mod * df_data['AVG_vR']), - all_other_ob=sanitize_chance_output(108 * offense_mod * - ((df_data['BB_vR'] + df_data['HBP_vR']) / df_data['PA_vR'])), - hard_rate=df_data['Hard%_vR'], - med_rate=df_data['Med%_vR'], - soft_rate=df_data['Soft%_vR'], - 
pull_rate=df_data['Pull%_vR'], - center_rate=df_data['Cent%_vR'], - slap_rate=df_data['Oppo%_vR'] + bat_hand=df_data["bat_hand"], + vs_hand="R", + all_hits=sanitize_chance_output(108 * offense_mod * df_data["AVG_vR"]), + all_other_ob=sanitize_chance_output( + 108 + * offense_mod + * ((df_data["BB_vR"] + df_data["HBP_vR"]) / df_data["PA_vR"]) + ), + hard_rate=df_data["Hard%_vR"], + med_rate=df_data["Med%_vR"], + soft_rate=df_data["Soft%_vR"], + pull_rate=df_data["Pull%_vR"], + center_rate=df_data["Cent%_vR"], + slap_rate=df_data["Oppo%_vR"], ) - vl.all_outs = mround(108 - vl.all_hits - vl.all_other_ob) #.quantize(Decimal("0.05")) - vr.all_outs = mround(108 - vr.all_hits - vr.all_other_ob) #.quantize(Decimal("0.05")) + vl.all_outs = mround( + 108 - vl.all_hits - vl.all_other_ob + ) # .quantize(Decimal("0.05")) + vr.all_outs = mround( + 108 - vr.all_hits - vr.all_other_ob + ) # .quantize(Decimal("0.05")) - vl.calculate_singles(df_data['1B_vL'], df_data['H_vL'], mround(df_data['IFH%_vL'])) - vr.calculate_singles(df_data['1B_vR'], df_data['H_vR'], mround(df_data['IFH%_vR'])) + vl.calculate_singles(df_data["1B_vL"], df_data["H_vL"], mround(df_data["IFH%_vL"])) + vr.calculate_singles(df_data["1B_vR"], df_data["H_vR"], mround(df_data["IFH%_vR"])) logger.debug( - f'vL - All Hits: {vl.all_hits} / Other OB: {vl.all_other_ob} / All Outs: {vl.all_outs} ' - f'/ Total: {vl.all_hits + vl.all_other_ob + vl.all_outs}' + f"vL - All Hits: {vl.all_hits} / Other OB: {vl.all_other_ob} / All Outs: {vl.all_outs} " + f"/ Total: {vl.all_hits + vl.all_other_ob + vl.all_outs}" ) logger.debug( - f'vR - All Hits: {vr.all_hits} / Other OB: {vr.all_other_ob} / All Outs: {vr.all_outs} ' - f'/ Total: {vr.all_hits + vr.all_other_ob + vr.all_outs}' + f"vR - All Hits: {vr.all_hits} / Other OB: {vr.all_other_ob} / All Outs: {vr.all_outs} " + f"/ Total: {vr.all_hits + vr.all_other_ob + vr.all_outs}" ) - vl.calculate_xbh(df_data['3B_vL'], df_data['2B_vL'], df_data['HR_vL'], df_data['HR/FB_vL']) - 
vr.calculate_xbh(df_data['3B_vR'], df_data['2B_vR'], df_data['HR_vR'], df_data['HR/FB_vR']) + vl.calculate_xbh( + df_data["3B_vL"], df_data["2B_vL"], df_data["HR_vL"], df_data["HR/FB_vL"] + ) + vr.calculate_xbh( + df_data["3B_vR"], df_data["2B_vR"], df_data["HR_vR"], df_data["HR/FB_vR"] + ) - logger.debug(f'all_hits: {vl.all_hits} / sum of hits: {vl.total_chances()}') - logger.debug(f'all_hits: {vr.all_hits} / sum of hits: {vr.total_chances()}') + logger.debug(f"all_hits: {vl.all_hits} / sum of hits: {vl.total_chances()}") + logger.debug(f"all_hits: {vr.all_hits} / sum of hits: {vr.total_chances()}") - vl.calculate_other_ob(df_data['BB_vL'], df_data['HBP_vL']) - vr.calculate_other_ob(df_data['BB_vR'], df_data['HBP_vR']) + vl.calculate_other_ob(df_data["BB_vL"], df_data["HBP_vL"]) + vr.calculate_other_ob(df_data["BB_vR"], df_data["HBP_vR"]) - logger.debug(f'all on base: {vl.hbp + vl.walk + vl.total_hits()} / all chances: {vl.total_chances()}' - f'{"*******ERROR ABOVE*******" if vl.hbp + vl.walk + vl.total_hits() != vl.total_chances() else ""}') - logger.debug(f'all on base: {vr.hbp + vr.walk + vr.total_hits()} / all chances: {vr.total_chances()}' - f'{"*******ERROR ABOVE*******" if vr.hbp + vr.walk + vr.total_hits() != vr.total_chances() else ""}') + logger.debug( + f"all on base: {vl.hbp + vl.walk + vl.total_hits()} / all chances: {vl.total_chances()}" + f'{"*******ERROR ABOVE*******" if vl.hbp + vl.walk + vl.total_hits() != vl.total_chances() else ""}' + ) + logger.debug( + f"all on base: {vr.hbp + vr.walk + vr.total_hits()} / all chances: {vr.total_chances()}" + f'{"*******ERROR ABOVE*******" if vr.hbp + vr.walk + vr.total_hits() != vr.total_chances() else ""}' + ) - vl.calculate_strikeouts(df_data['SO_vL'], df_data['AB_vL'], df_data['H_vL']) - vr.calculate_strikeouts(df_data['SO_vR'], df_data['AB_vR'], df_data['H_vR']) + vl.calculate_strikeouts(df_data["SO_vL"], df_data["AB_vL"], df_data["H_vL"]) + vr.calculate_strikeouts(df_data["SO_vR"], df_data["AB_vR"], 
df_data["H_vR"]) - logger.debug(f'K rate vL: {round(vl.strikeout / vl.all_outs, 2)} / ' - f'K rate vR: {round(vr.strikeout / vr.all_outs, 2)}') + logger.debug( + f"K rate vL: {round(vl.strikeout / vl.all_outs, 2)} / " + f"K rate vR: {round(vr.strikeout / vr.all_outs, 2)}" + ) vl.calculate_other_outs( - df_data['FB%_vL'], df_data['LD%_vL'], df_data['GB%_vL'], df_data['GDP_vL'], df_data['AB_vL'] + df_data["FB%_vL"], + df_data["LD%_vL"], + df_data["GB%_vL"], + df_data["GDP_vL"], + df_data["AB_vL"], ) vr.calculate_other_outs( - df_data['FB%_vR'], df_data['LD%_vR'], df_data['GB%_vR'], df_data['GDP_vR'], df_data['AB_vR'] + df_data["FB%_vR"], + df_data["LD%_vR"], + df_data["GB%_vR"], + df_data["GDP_vR"], + df_data["AB_vR"], ) # Correct total chance errors for x in [vl, vr]: if x.total_chances() < 108: diff = mround(108) - x.total_chances() - logger.error(f'Adding {diff} strikeouts to close gap') + logger.error(f"Adding {diff} strikeouts to close gap") x.strikeout += diff elif x.total_chances() > 108: diff = x.total_chances() - mround(108) - logger.error(f'Have surplus of {diff} chances') + logger.error(f"Have surplus of {diff} chances") if x.strikeout + 1 > diff: - logger.error(f'Subtracting {diff} strikeouts to close gap') + logger.error(f"Subtracting {diff} strikeouts to close gap") x.strikeout -= diff elif x.lineout + 1 > diff: - logger.error(f'Subtracting {diff} lineouts to close gap') + logger.error(f"Subtracting {diff} lineouts to close gap") x.lineout -= diff elif x.groundout_a + 1 > diff: - logger.error(f'Subtracting {diff} gbA to close gap') + logger.error(f"Subtracting {diff} gbA to close gap") x.groundout_a -= diff elif x.groundout_b + 1 > diff: - logger.error(f'Subtracting {diff} gbB to close gap') + logger.error(f"Subtracting {diff} gbB to close gap") x.groundout_b -= diff elif x.groundout_c + 1 > diff: - logger.error(f'Subtracting {diff} gbC to close gap') + logger.error(f"Subtracting {diff} gbC to close gap") x.groundout_c -= diff vl_total_chances = 
vl.total_chances() vr_total_chances = vr.total_chances() if vl_total_chances != 108: - logger.error(f'total chances for {df_data.name} come to {vl_total_chances}') + logger.error(f"total chances for {df_data.name} come to {vl_total_chances}") else: - logger.debug(f'total chances: {vl_total_chances}') + logger.debug(f"total chances: {vl_total_chances}") if vr_total_chances != 108: - logger.error(f'total chances for {df_data.name} come to {vr_total_chances}') + logger.error(f"total chances for {df_data.name} come to {vr_total_chances}") else: - logger.debug(f'total chances: {vr_total_chances}') + logger.debug(f"total chances: {vr_total_chances}") vl_dict = vl.custom_to_dict() vr_dict = vr.custom_to_dict() try: + offense_col = int(df_data["offense_col"]) if "offense_col" in df_data else 1 + player_id = ( + int(df_data["player_id"]) + if "player_id" in df_data + else int.from_bytes(df_data["key_bbref"].encode(), "big") % 10000 + ) vl_card, vr_card = build_batter_full_cards( - vl, vr, int(df_data['offense_col']), int(df_data['player_id']), df_data['bat_hand'] + vl, vr, offense_col, player_id, df_data["bat_hand"] ) vl_dict.update(vl_card.card_output()) vr_dict.update(vr_card.card_output()) except Exception as e: - logger.warning(f'Card layout builder failed for {df_data.name}: {e}') + logger.warning(f"Card layout builder failed for {df_data.name}: {e}") return [vl_dict, vr_dict] diff --git a/offense_col_resolver.py b/offense_col_resolver.py new file mode 100644 index 0000000..18b86b1 --- /dev/null +++ b/offense_col_resolver.py @@ -0,0 +1,102 @@ +"""Resolve offense_col for players in the retrosheet pipeline. + +Three-tier resolution: +1. Cache hit → stored value from data-input/offense_col_cache.csv +2. API pre-fetch → bulk-fetch all MlbPlayers, merge new entries into cache +3. 
Hash fallback → deterministic hash(player_name) % 3 + 1 +""" + +import hashlib +import os + +import pandas as pd + +from db_calls import db_get +from exceptions import logger + +CACHE_PATH = "data-input/offense_col_cache.csv" + + +def hash_offense_col(player_name: str) -> int: + """Deterministic offense_col from player name. Returns 1, 2, or 3.""" + normalized = player_name.strip().lower() + digest = hashlib.md5(normalized.encode()).hexdigest() + return int(digest, 16) % 3 + 1 + + +def load_cache(path: str = CACHE_PATH) -> dict[str, int]: + """Load {key_bbref: offense_col} from CSV cache.""" + if not os.path.exists(path): + return {} + df = pd.read_csv(path, dtype={"key_bbref": str, "offense_col": int}) + return dict(zip(df["key_bbref"], df["offense_col"])) + + +def save_cache(cache: dict[str, tuple[str, int]], path: str = CACHE_PATH): + """Write cache to CSV. cache values are (player_name, offense_col).""" + rows = sorted( + [ + {"key_bbref": k, "player_name": v[0], "offense_col": v[1]} + for k, v in cache.items() + ], + key=lambda r: r["key_bbref"], + ) + pd.DataFrame(rows).to_csv(path, index=False) + + +async def resolve_offense_cols( + df: pd.DataFrame, api_available: bool = True +) -> pd.DataFrame: + """Add offense_col column to a stats DataFrame. + + Args: + df: DataFrame with key_bbref, use_name, last_name columns. + api_available: If True, fetch from API to refresh cache. + + Returns: + df with offense_col column added. 
+ """ + cache = load_cache() + full_cache: dict[str, tuple[str, int]] = {} + + # Seed full_cache from existing file cache + for bbref, oc in cache.items(): + full_cache[bbref] = ("", oc) + + # Refresh from API if available + if api_available: + try: + result = await db_get("mlbplayers") + if result and "players" in result: + api_count = 0 + for p in result["players"]: + bbref = p.get("key_bbref") + oc = p.get("offense_col") + name = f'{p.get("first_name", "")} {p.get("last_name", "")}'.strip() + if bbref and oc: + full_cache[bbref] = (name, int(oc)) + api_count += 1 + logger.info( + f"offense_col_resolver: loaded {api_count} entries from API" + ) + save_cache(full_cache) + except Exception as e: + logger.warning( + f"offense_col_resolver: API fetch failed, using cache only: {e}" + ) + + # Build lookup from full_cache + lookup = {k: v[1] for k, v in full_cache.items()} + + # Resolve for each row + def resolve_row(row): + bbref = row.get("key_bbref", "") + if bbref in lookup: + return lookup[bbref] + name = f'{row.get("use_name", "")} {row.get("last_name", "")}'.strip() + oc = hash_offense_col(name) + logger.debug(f"offense_col_resolver: hash fallback for {name} ({bbref}) → {oc}") + return oc + + df["offense_col"] = df.apply(resolve_row, axis=1) + return df diff --git a/pitchers/calcs_pitcher.py b/pitchers/calcs_pitcher.py index a30b32d..8380bdd 100644 --- a/pitchers/calcs_pitcher.py +++ b/pitchers/calcs_pitcher.py @@ -6,94 +6,138 @@ from exceptions import logger from pitchers.models import PitchingCardRatingsModel from pitchers.card_builder import build_pitcher_full_cards + + def get_pitcher_ratings(df_data) -> List[dict]: # Calculate OB values with min cap (ensure scalar values for comparison) - ob_vl = float(108 * (df_data['BB_vL'] + df_data['HBP_vL']) / df_data['TBF_vL']) - ob_vr = float(108 * (df_data['BB_vR'] + df_data['HBP_vR']) / df_data['TBF_vR']) + ob_vl = float(108 * (df_data["BB_vL"] + df_data["HBP_vL"]) / df_data["TBF_vL"]) + ob_vr = float(108 * 
(df_data["BB_vR"] + df_data["HBP_vR"]) / df_data["TBF_vR"]) vl = PitchingCardRatingsModel( pitchingcard_id=df_data.pitchingcard_id, pit_hand=df_data.pitch_hand, - vs_hand='L', - all_hits=sanitize_chance_output((df_data['AVG_vL'] - 0.05) * 108), # Subtracting chances from BP results + vs_hand="L", + all_hits=sanitize_chance_output( + (df_data["AVG_vL"] - 0.05) * 108 + ), # Subtracting chances from BP results all_other_ob=sanitize_chance_output(min(ob_vl, 0.8)), - hard_rate=df_data['Hard%_vL'], - med_rate=df_data['Med%_vL'], - soft_rate=df_data['Soft%_vL'] + hard_rate=df_data["Hard%_vL"], + med_rate=df_data["Med%_vL"], + soft_rate=df_data["Soft%_vL"], ) vr = PitchingCardRatingsModel( pitchingcard_id=df_data.pitchingcard_id, pit_hand=df_data.pitch_hand, - vs_hand='R', - all_hits=sanitize_chance_output((df_data['AVG_vR'] - 0.05) * 108), # Subtracting chances from BP results + vs_hand="R", + all_hits=sanitize_chance_output( + (df_data["AVG_vR"] - 0.05) * 108 + ), # Subtracting chances from BP results all_other_ob=sanitize_chance_output(min(ob_vr, 0.8)), - hard_rate=df_data['Hard%_vR'], - med_rate=df_data['Med%_vR'], - soft_rate=df_data['Soft%_vR'] + hard_rate=df_data["Hard%_vR"], + med_rate=df_data["Med%_vR"], + soft_rate=df_data["Soft%_vR"], ) vl.all_outs = mround(108 - vl.all_hits - vl.all_other_ob, base=0.5) vr.all_outs = mround(108 - vr.all_hits - vr.all_other_ob, base=0.5) logger.info( - f'vL - All Hits: {vl.all_hits} / Other OB: {vl.all_other_ob} / All Outs: {vl.all_outs} ' - f'/ Total: {vl.total_chances()}' + f"vL - All Hits: {vl.all_hits} / Other OB: {vl.all_other_ob} / All Outs: {vl.all_outs} " + f"/ Total: {vl.total_chances()}" ) logger.info( - f'vR - All Hits: {vr.all_hits} / Other OB: {vr.all_other_ob} / All Outs: {vr.all_outs} ' - f'/ Total: {vr.total_chances()}' + f"vR - All Hits: {vr.all_hits} / Other OB: {vr.all_other_ob} / All Outs: {vr.all_outs} " + f"/ Total: {vr.total_chances()}" ) - vl.calculate_singles(df_data['H_vL'], df_data['H_vL'] - 
df_data['2B_vL'] - df_data['3B_vL'] - df_data['HR_vL']) - vr.calculate_singles(df_data['H_vR'], df_data['H_vR'] - df_data['2B_vR'] - df_data['3B_vR'] - df_data['HR_vR']) + vl.calculate_singles( + df_data["H_vL"], + df_data["H_vL"] - df_data["2B_vL"] - df_data["3B_vL"] - df_data["HR_vL"], + ) + vr.calculate_singles( + df_data["H_vR"], + df_data["H_vR"] - df_data["2B_vR"] - df_data["3B_vR"] - df_data["HR_vR"], + ) - logger.info(f'vL: All Hits: {vl.all_hits} / BP Singles: {vl.bp_single} / Single 2: {vl.single_two} / ' - f'Single 1: {vl.single_one} / Single CF: {vl.single_center}') - logger.info(f'vR: All Hits: {vr.all_hits} / BP Singles: {vr.bp_single} / Single 2: {vr.single_two} / ' - f'Single 1: {vr.single_one} / Single CF: {vr.single_center}') + logger.info( + f"vL: All Hits: {vl.all_hits} / BP Singles: {vl.bp_single} / Single 2: {vl.single_two} / " + f"Single 1: {vl.single_one} / Single CF: {vl.single_center}" + ) + logger.info( + f"vR: All Hits: {vr.all_hits} / BP Singles: {vr.bp_single} / Single 2: {vr.single_two} / " + f"Single 1: {vr.single_one} / Single CF: {vr.single_center}" + ) - vl.calculate_xbh(df_data['2B_vL'], df_data['3B_vL'], df_data['HR_vL'], df_data['HR/FB_vL']) - vr.calculate_xbh(df_data['2B_vR'], df_data['3B_vR'], df_data['HR_vR'], df_data['HR/FB_vR']) + vl.calculate_xbh( + df_data["2B_vL"], df_data["3B_vL"], df_data["HR_vL"], df_data["HR/FB_vL"] + ) + vr.calculate_xbh( + df_data["2B_vR"], df_data["3B_vR"], df_data["HR_vR"], df_data["HR/FB_vR"] + ) - logger.debug(f'vL: All XBH: {vl.all_hits - vl.single_one - vl.single_two - vl.single_center - vl.bp_single} / ' - f'Double**: {vl.double_two} / Double(cf): {vl.double_cf} / Triple: {vl.triple} / ' - f'BP HR: {vl.bp_homerun} / ND HR: {vl.homerun}') - logger.debug(f'vR: All XBH: {vr.all_hits - vr.single_one - vr.single_two - vr.single_center - vr.bp_single} / ' - f'Double**: {vr.double_two} / Double(cf): {vr.double_cf} / Triple: {vr.triple} / ' - f'BP HR: {vr.bp_homerun} / ND HR: {vr.homerun}') + 
logger.debug( + f"vL: All XBH: {vl.all_hits - vl.single_one - vl.single_two - vl.single_center - vl.bp_single} / " + f"Double**: {vl.double_two} / Double(cf): {vl.double_cf} / Triple: {vl.triple} / " + f"BP HR: {vl.bp_homerun} / ND HR: {vl.homerun}" + ) + logger.debug( + f"vR: All XBH: {vr.all_hits - vr.single_one - vr.single_two - vr.single_center - vr.bp_single} / " + f"Double**: {vr.double_two} / Double(cf): {vr.double_cf} / Triple: {vr.triple} / " + f"BP HR: {vr.bp_homerun} / ND HR: {vr.homerun}" + ) - vl.calculate_other_ob(df_data['BB_vL'], df_data['HBP_vL']) - vr.calculate_other_ob(df_data['BB_vR'], df_data['HBP_vR']) + vl.calculate_other_ob(df_data["BB_vL"], df_data["HBP_vL"]) + vr.calculate_other_ob(df_data["BB_vR"], df_data["HBP_vR"]) - logger.info(f'vL: All other OB: {vl.all_other_ob} / HBP: {vl.hbp} / BB: {vl.walk} / ' - f'Total Chances: {vl.total_chances()}') - logger.info(f'vR: All other OB: {vr.all_other_ob} / HBP: {vr.hbp} / BB: {vr.walk} / ' - f'Total Chances: {vr.total_chances()}') + logger.info( + f"vL: All other OB: {vl.all_other_ob} / HBP: {vl.hbp} / BB: {vl.walk} / " + f"Total Chances: {vl.total_chances()}" + ) + logger.info( + f"vR: All other OB: {vr.all_other_ob} / HBP: {vr.hbp} / BB: {vr.walk} / " + f"Total Chances: {vr.total_chances()}" + ) vl.calculate_strikouts( - df_data['SO_vL'], df_data['TBF_vL'] - df_data['BB_vL'] - df_data['IBB_vL'] - df_data['HBP_vL'], df_data['H_vL']) + df_data["SO_vL"], + df_data["TBF_vL"] - df_data["BB_vL"] - df_data["IBB_vL"] - df_data["HBP_vL"], + df_data["H_vL"], + ) vr.calculate_strikouts( - df_data['SO_vR'], df_data['TBF_vR'] - df_data['BB_vR'] - df_data['IBB_vR'] - df_data['HBP_vR'], df_data['H_vR']) + df_data["SO_vR"], + df_data["TBF_vR"] - df_data["BB_vR"] - df_data["IBB_vR"] - df_data["HBP_vR"], + df_data["H_vR"], + ) - logger.info(f'vL: All Outs: {vl.all_outs} / Ks: {vl.strikeout} / Current Outs: {vl.total_outs()}') - logger.info(f'vR: All Outs: {vr.all_outs} / Ks: {vr.strikeout} / Current Outs: 
{vr.total_outs()}') + logger.info( + f"vL: All Outs: {vl.all_outs} / Ks: {vl.strikeout} / Current Outs: {vl.total_outs()}" + ) + logger.info( + f"vR: All Outs: {vr.all_outs} / Ks: {vr.strikeout} / Current Outs: {vr.total_outs()}" + ) - vl.calculate_other_outs(df_data['FB%_vL'], df_data['GB%_vL'], df_data['Oppo%_vL']) - vr.calculate_other_outs(df_data['FB%_vR'], df_data['GB%_vR'], df_data['Oppo%_vR']) + vl.calculate_other_outs(df_data["FB%_vL"], df_data["GB%_vL"], df_data["Oppo%_vL"]) + vr.calculate_other_outs(df_data["FB%_vR"], df_data["GB%_vR"], df_data["Oppo%_vR"]) - logger.info(f'vL: Total chances: {vl.total_chances()}') - logger.info(f'vR: Total chances: {vr.total_chances()}') + logger.info(f"vL: Total chances: {vl.total_chances()}") + logger.info(f"vR: Total chances: {vr.total_chances()}") vl_dict = vl.custom_to_dict() vr_dict = vr.custom_to_dict() try: + offense_col = int(df_data["offense_col"]) if "offense_col" in df_data else 1 + player_id = ( + int(df_data["player_id"]) + if "player_id" in df_data + else int.from_bytes(df_data["key_bbref"].encode(), "big") % 10000 + ) vl_card, vr_card = build_pitcher_full_cards( - vl, vr, int(df_data['offense_col']), int(df_data['player_id']), df_data['pitch_hand'] + vl, vr, offense_col, player_id, df_data["pitch_hand"] ) vl_dict.update(vl_card.card_output()) vr_dict.update(vr_card.card_output()) except Exception as e: - logger.warning(f'Card layout builder failed for {df_data.name}: {e}') + logger.warning(f"Card layout builder failed for {df_data.name}: {e}") return [vl_dict, vr_dict] @@ -101,59 +145,61 @@ def get_pitcher_ratings(df_data) -> List[dict]: def total_chances(chance_data): sum_chances = 0 for key in chance_data: - if key not in ['id', 'player_id', 'cardset_id', 'vs_hand', 'is_prep']: + if key not in ["id", "player_id", "cardset_id", "vs_hand", "is_prep"]: sum_chances += chance_data[key] return sum_chances def soft_rate(pct): - if pct > .2: - return 'high' - elif pct < .1: - return 'low' + if pct > 0.2: + return "high" + elif pct < 
0.1: + return "low" else: - return 'avg' + return "avg" def med_rate(pct): - if pct > .65: - return 'high' - elif pct < .4: - return 'low' + if pct > 0.65: + return "high" + elif pct < 0.4: + return "low" else: - return 'avg' + return "avg" def hard_rate(pct): - if pct > .4: - return 'high' - elif pct < .2: - return 'low' + if pct > 0.4: + return "high" + elif pct < 0.2: + return "low" else: - return 'avg' + return "avg" def hr_per_fb_rate(pct): - if pct > .18: - return 'high' - elif pct < .08: - return 'low' + if pct > 0.18: + return "high" + elif pct < 0.08: + return "low" else: - return 'avg' + return "avg" def all_singles(row, hits_vl, hits_vr): if int(row[7]) == 0: tot_singles_vl = 0 else: - tot_singles_vl = hits_vl * ((int(row[7]) - int(row[8]) - int(row[9]) - int(row[12])) - / int(row[7])) + tot_singles_vl = hits_vl * ( + (int(row[7]) - int(row[8]) - int(row[9]) - int(row[12])) / int(row[7]) + ) if int(row[40]) == 0: tot_singles_vr = 0 else: - tot_singles_vr = hits_vr * ((int(row[40]) - int(row[41]) - int(row[42]) - int(row[45])) - / int(row[40])) + tot_singles_vr = hits_vr * ( + (int(row[40]) - int(row[41]) - int(row[42]) - int(row[45])) / int(row[40]) + ) return mround(tot_singles_vl), mround(tot_singles_vr) @@ -166,12 +212,12 @@ def bp_singles(singles_vl, singles_vr): def wh_singles(rem_si_vl, rem_si_vr, hard_rate_vl, hard_rate_vr): - if hard_rate_vl == 'low': + if hard_rate_vl == "low": whs_vl = 0 else: whs_vl = rem_si_vl / 2 - if hard_rate_vr == 'low': + if hard_rate_vr == "low": whs_vr = 0 else: whs_vr = rem_si_vr / 2 @@ -180,12 +226,12 @@ def wh_singles(rem_si_vl, rem_si_vr, hard_rate_vl, hard_rate_vr): def one_singles(rem_si_vl, rem_si_vr, soft_rate_vl, soft_rate_vr): - if soft_rate_vl == 'high': + if soft_rate_vl == "high": oss_vl = rem_si_vl else: oss_vl = 0 - if soft_rate_vr == 'high': + if soft_rate_vr == "high": oss_vr = rem_si_vr else: oss_vr = 0 @@ -194,19 +240,19 @@ def one_singles(rem_si_vl, rem_si_vr, soft_rate_vl, soft_rate_vr): def 
bp_homerun(hr_vl, hr_vr, hr_rate_vl, hr_rate_vr): - if hr_rate_vl == 'low': + if hr_rate_vl == "low": bphr_vl = hr_vl - elif hr_rate_vl == 'avg': - bphr_vl = hr_vl * .75 + elif hr_rate_vl == "avg": + bphr_vl = hr_vl * 0.75 else: - bphr_vl = hr_vl * .4 + bphr_vl = hr_vl * 0.4 - if hr_rate_vr == 'low': + if hr_rate_vr == "low": bphr_vr = hr_vr - elif hr_rate_vr == 'avg': - bphr_vr = hr_vr * .75 + elif hr_rate_vr == "avg": + bphr_vr = hr_vr * 0.75 else: - bphr_vr = hr_vr * .4 + bphr_vr = hr_vr * 0.4 return mround(bphr_vl), mround(bphr_vr) @@ -219,8 +265,8 @@ def triples(all_xbh_vl, all_xbh_vr, triple_rate_vl, triple_rate_vr): def two_doubles(all_doubles_vl, all_doubles_vr, soft_rate_vl, soft_rate_vr): - two_doubles_vl = all_doubles_vl if soft_rate_vl == 'high' else 0 - two_doubles_vr = all_doubles_vr if soft_rate_vr == 'high' else 0 + two_doubles_vl = all_doubles_vl if soft_rate_vl == "high" else 0 + two_doubles_vr = all_doubles_vr if soft_rate_vr == "high" else 0 return mround(two_doubles_vl), mround(two_doubles_vr) @@ -242,21 +288,21 @@ def hbps(all_ob, this_hbp_rate): def xchecks(pos, all_chances=True): - if pos.lower() == 'p': + if pos.lower() == "p": return 1 if all_chances else 0 - elif pos.lower() == 'c': + elif pos.lower() == "c": return 3 if all_chances else 2 - elif pos.lower() == '1b': + elif pos.lower() == "1b": return 2 if all_chances else 1 - elif pos.lower() == '2b': + elif pos.lower() == "2b": return 6 if all_chances else 5 - elif pos.lower() == '3b': + elif pos.lower() == "3b": return 3 if all_chances else 2 - elif pos.lower() == 'ss': + elif pos.lower() == "ss": return 7 if all_chances else 6 - elif pos.lower() == 'lf': + elif pos.lower() == "lf": return 2 if all_chances else 1 - elif pos.lower() == 'cf': + elif pos.lower() == "cf": return 3 if all_chances else 2 else: return 2 if all_chances else 1 @@ -272,7 +318,7 @@ def oppo_fly(all_fly, oppo_rate): def groundball_a(all_gb, dp_rate): if all_gb == 0 or dp_rate == 0: return 0 - elif dp_rate > .6: + 
elif dp_rate > 0.6: return all_gb else: return mround(all_gb * (dp_rate * 1.5)) @@ -282,20 +328,22 @@ def balks(total_balks: int, innings: float, season_pct): try: total_balks = int(total_balks) except ValueError: - logger.error(f'Could not read balks: {total_balks} / setting to 0') + logger.error(f"Could not read balks: {total_balks} / setting to 0") total_balks = 0 - + try: innings = float(innings) except ValueError: - logger.error(f'Could not read innings: {innings} / setting to 0') + logger.error(f"Could not read innings: {innings} / setting to 0") innings = 0 if innings == 0: return 0 - - numerator = (total_balks * 290 * season_pct) - logger.info(f'total_balks: {total_balks} / season_pct {season_pct} / innings: {innings} / numerator: {numerator}') + + numerator = total_balks * 290 * season_pct + logger.info( + f"total_balks: {total_balks} / season_pct {season_pct} / innings: {innings} / numerator: {numerator}" + ) return min(round(numerator / innings), 20) @@ -311,19 +359,19 @@ def closer_rating(gf: int, saves: int, games: int): if gf == 0 or games == 0 or saves == 0: return None - if gf / games >= .875: + if gf / games >= 0.875: return 6 - elif gf / games >= .8: + elif gf / games >= 0.8: return 5 - elif gf / games >= .7: + elif gf / games >= 0.7: return 4 - elif gf / games >= .55: + elif gf / games >= 0.55: return 3 - elif gf / games >= .4: + elif gf / games >= 0.4: return 2 - elif gf / games >= .25: + elif gf / games >= 0.25: return 1 - elif gf / games >= .1: + elif gf / games >= 0.1: return 0 else: return None diff --git a/retrosheet_data.py b/retrosheet_data.py index ccf54f9..c30a35a 100644 --- a/retrosheet_data.py +++ b/retrosheet_data.py @@ -16,6 +16,7 @@ from creation_helpers import get_args, CLUB_LIST, FRANCHISE_LIST, sanitize_name from batters.stat_prep import DataMismatchError from db_calls import DB_URL, db_get, db_patch, db_post, db_put, db_delete from exceptions import log_exception, logger +from offense_col_resolver import resolve_offense_cols, 
hash_offense_col from retrosheet_transformer import load_retrosheet_csv import batters.calcs_batter as cba import defenders.calcs_defense as cde @@ -31,68 +32,89 @@ cache.enable() # ) -RETRO_FILE_PATH = 'data-input/retrosheet/' -EVENTS_FILENAME = 'retrosheets_events_2005.csv' # Now using transformer for new format compatibility -PERSONNEL_FILENAME = 'retrosheets_personnel.csv' -DATA_INPUT_FILE_PATH = 'data-input/2005 Live Cardset/' -CARD_BASE_URL = f'{DB_URL}/v2/players/' +RETRO_FILE_PATH = "data-input/retrosheet/" +EVENTS_FILENAME = ( + "retrosheets_events_2005.csv" # Now using transformer for new format compatibility +) +PERSONNEL_FILENAME = "retrosheets_personnel.csv" +DATA_INPUT_FILE_PATH = "data-input/2005 Live Cardset/" +CARD_BASE_URL = f"{DB_URL}/v2/players/" start_time = datetime.datetime.now() -RELEASE_DIRECTORY = f'{start_time.year}-{start_time.month}-{start_time.day}' -PLAYER_DESCRIPTION = 'Live' # Live for Live Series -# PLAYER_DESCRIPTION = 'May PotM' # PotM for promos +RELEASE_DIRECTORY = f"{start_time.year}-{start_time.month}-{start_time.day}" +PLAYER_DESCRIPTION = "Live" # Live for Live Series +# PLAYER_DESCRIPTION = 'July PotM' # PotM for promos PROMO_INCLUSION_RETRO_IDS = [ # AL - # 'rodra001', # Alex Rodriguez (IF) - # 'menck001', # Kevin Mench (OF) - # 'colob001', # Bartolo Colon (SP) - # 'ryanb001', # BJ Ryan (RP) + # 'giamj001', # Jason Giambi (IF) + # 'cabrm001', # Miguel Cabrera (OF) + # 'lackj001', # John Lackey (SP) + # 'rivem002', # Mariano Rivera (RP) # NL - # 'delgc001', # Carlos Delgado (IF) - # 'abreb001', # Bobby Abreu (OF) - # 'haraa001', # Aaron Harang (SP) - # 'hofft001', # Trevor Hoffman (RP) + # 'furcr001', # Rafael Furcal (IF) + # 'jenkg001', # Geoff Jenkins (OF) + # 'pattj004', # John Patterson (SP) + # 'wagnb001', # Billy Wagner (RP) ] -MIN_PA_VL = 20 if 'live' in PLAYER_DESCRIPTION.lower() else 1 # 1 for PotM -MIN_PA_VR = 40 if 'live' in PLAYER_DESCRIPTION.lower() else 1 # 1 for PotM +MIN_PA_VL = 20 if "live" in 
PLAYER_DESCRIPTION.lower() else 1 # 1 for PotM +MIN_PA_VR = 40 if "live" in PLAYER_DESCRIPTION.lower() else 1 # 1 for PotM MIN_TBF_VL = MIN_PA_VL MIN_TBF_VR = MIN_PA_VR -CARDSET_ID = 27 if 'live' in PLAYER_DESCRIPTION.lower() else 28 # 27: 2005 Live, 28: 2005 Promos +CARDSET_ID = ( + 27 if "live" in PLAYER_DESCRIPTION.lower() else 28 +) # 27: 2005 Live, 28: 2005 Promos # Per-Update Parameters SEASON_PCT = 81 / 162 # Through end of July (~half season) -START_DATE = 20050403 # YYYYMMDD format - 2005 Opening Day -# END_DATE = 20050531 # YYYYMMDD format - May PotM -END_DATE = 20050731 # End of July 2005 +START_DATE = 20050403 # YYYYMMDD format - 2005 Opening Day +END_DATE = 20050731 # End of July 2005 POST_DATA = True -LAST_WEEK_RATIO = 0.0 if PLAYER_DESCRIPTION == 'Live' else 0.0 +LAST_WEEK_RATIO = 0.0 if PLAYER_DESCRIPTION == "Live" else 0.0 LAST_TWOWEEKS_RATIO = 0.0 LAST_MONTH_RATIO = 0.0 + def date_from_int(integer_date: int) -> datetime.datetime: - return datetime.datetime(int(str(integer_date)[:4]), int(str(integer_date)[4:6]), int(str(integer_date)[-2:])) + return datetime.datetime( + int(str(integer_date)[:4]), + int(str(integer_date)[4:6]), + int(str(integer_date)[-2:]), + ) -def date_math(start_date: int, operator: Literal['+', '-'], day_delta: int = 0, month_delta: int = 0, year_delta: int = 0) -> int: +def date_math( + start_date: int, + operator: Literal["+", "-"], + day_delta: int = 0, + month_delta: int = 0, + year_delta: int = 0, +) -> int: if len(str(start_date)) != 8: - log_exception(ValueError, 'Start date must be 8 digits long') + log_exception(ValueError, "Start date must be 8 digits long") if True in [day_delta < 0, month_delta < 0, year_delta < 0]: - log_exception(ValueError, 'Time deltas must greater than or equal to 0; use `-` operator to go back in time') + log_exception( + ValueError, + "Time deltas must greater than or equal to 0; use `-` operator to go back in time", + ) if day_delta > 28: - log_exception(ValueError, 'Use month_delta for 
days > 28') + log_exception(ValueError, "Use month_delta for days > 28") if month_delta > 12: - log_exception(ValueError, 'Use year_delta for months > 12') + log_exception(ValueError, "Use year_delta for months > 12") s_date = date_from_int(start_date) if year_delta > 0: s_date = datetime.datetime( - s_date.year + year_delta if operator == '+' else s_date.year - year_delta, + s_date.year + year_delta if operator == "+" else s_date.year - year_delta, s_date.month, - s_date.day + s_date.day, ) if month_delta > 0: month_range = [12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] - new_index = s_date.month + month_delta if operator == '+' else s_date.month - month_delta + new_index = ( + s_date.month + month_delta + if operator == "+" + else s_date.month - month_delta + ) new_month = month_range[(new_index % 12)] new_year = s_date.year @@ -101,13 +123,11 @@ def date_math(start_date: int, operator: Literal['+', '-'], day_delta: int = 0, elif new_index < 1: new_year -= 1 - s_date = datetime.datetime( - new_year, - new_month, - s_date.day - ) - fd = s_date + datetime.timedelta(days=day_delta if operator == '+' else day_delta * -1) - return f'{str(fd.year).zfill(4)}{str(fd.month).zfill(2)}{str(fd.day).zfill(2)}' + s_date = datetime.datetime(new_year, new_month, s_date.day) + fd = s_date + datetime.timedelta( + days=day_delta if operator == "+" else day_delta * -1 + ) + return f"{str(fd.year).zfill(4)}{str(fd.month).zfill(2)}{str(fd.day).zfill(2)}" def weeks_between(start_date_int: int, end_date_int: int) -> int: @@ -118,70 +138,124 @@ def weeks_between(start_date_int: int, end_date_int: int) -> int: async def store_defense_to_csv(season: int): - for position in ['c', '1b', '2b', '3b', 'ss', 'lf', 'cf', 'rf', 'of', 'p']: + for position in ["c", "1b", "2b", "3b", "ss", "lf", "cf", "rf", "of", "p"]: pos_df = cde.get_bbref_fielding_df(position, season) - pos_df.to_csv(f'{DATA_INPUT_FILE_PATH}defense_{position}.csv') + pos_df.to_csv(f"{DATA_INPUT_FILE_PATH}defense_{position}.csv") 
await asyncio.sleep(8) -def get_batting_result_series(plays: pd.DataFrame, event_type: str, pitcher_hand: Literal['r', 'l'], col_name: str) -> pd.Series: - this_series = plays[(plays.event_type == event_type) & (plays.pitcher_hand == pitcher_hand)].groupby('batter_id').count()['event_type'].astype(int).rename(col_name) +def get_batting_result_series( + plays: pd.DataFrame, event_type: str, pitcher_hand: Literal["r", "l"], col_name: str +) -> pd.Series: + this_series = ( + plays[(plays.event_type == event_type) & (plays.pitcher_hand == pitcher_hand)] + .groupby("batter_id") + .count()["event_type"] + .astype(int) + .rename(col_name) + ) return this_series -def get_pitching_result_series(plays: pd.DataFrame, event_type: str, batter_hand: Literal['r', 'l'], col_name: str) -> pd.Series: - this_series = plays[(plays.event_type == event_type) & (plays.batter_hand == batter_hand)].groupby('pitcher_id').count()['event_type'].astype(int).rename(col_name) +def get_pitching_result_series( + plays: pd.DataFrame, event_type: str, batter_hand: Literal["r", "l"], col_name: str +) -> pd.Series: + this_series = ( + plays[(plays.event_type == event_type) & (plays.batter_hand == batter_hand)] + .groupby("pitcher_id") + .count()["event_type"] + .astype(int) + .rename(col_name) + ) return this_series def get_run_stat_df(input_path: str): - run_data = pd.read_csv(f'{input_path}running.csv') #.set_index('Name-additional')) + run_data = pd.read_csv(f"{input_path}running.csv") # .set_index('Name-additional')) # if 'Player' in run_data: # run_data = run_data.rename(columns={'Player': 'Full Name'}) # if 'Name' in run_data: # run_data = run_data.rename(columns={'Name': 'Full Name'}) - if 'Player-additional' in run_data: - run_data = run_data.rename(columns={'Player-additional': 'key_bbref'}) - if 'Name-additional' in run_data: - run_data = run_data.rename(columns={'Name-additional': 'key_bbref'}) - - run_data = run_data[['key_bbref', 'Tm', 'ROE', 'XI', 'RS%', 'SBO', 'SB', 'CS', 'SB%', 'SB2', 
'CS2', 'SB3', 'CS3', 'SBH', 'CSH', 'PO', 'PCS', 'OOB', 'OOB1', 'OOB2', 'OOB3', 'OOBHm', 'BT', 'XBT%', '1stS', '1stS2', '1stS3', '1stD', '1stD3', '1stDH', '2ndS', '2ndS3', '2ndSH']] + if "Player-additional" in run_data: + run_data = run_data.rename(columns={"Player-additional": "key_bbref"}) + if "Name-additional" in run_data: + run_data = run_data.rename(columns={"Name-additional": "key_bbref"}) + + run_data = run_data[ + [ + "key_bbref", + "Tm", + "ROE", + "XI", + "RS%", + "SBO", + "SB", + "CS", + "SB%", + "SB2", + "CS2", + "SB3", + "CS3", + "SBH", + "CSH", + "PO", + "PCS", + "OOB", + "OOB1", + "OOB2", + "OOB3", + "OOBHm", + "BT", + "XBT%", + "1stS", + "1stS2", + "1stS3", + "1stD", + "1stD3", + "1stDH", + "2ndS", + "2ndS3", + "2ndSH", + ] + ] run_data = run_data.fillna(0) - return run_data.set_index('key_bbref') + return run_data.set_index("key_bbref") def get_periph_stat_df(input_path: str): - pit_data = pd.read_csv(f'{input_path}pitching.csv') - if 'Player-additional' in pit_data: - pit_data = pit_data.rename(columns={'Player-additional': 'key_bbref'}) - if 'Name-additional' in pit_data: - pit_data = pit_data.rename(columns={'Name-additional': 'key_bbref'}) - if 'Team' in pit_data: - pit_data = pit_data.rename(columns={'Team': 'Tm'}) - - pit_data = pit_data[['key_bbref', 'Tm', 'GF', 'SHO', 'SV', 'IP', 'BK', 'WP']] - + pit_data = pd.read_csv(f"{input_path}pitching.csv") + if "Player-additional" in pit_data: + pit_data = pit_data.rename(columns={"Player-additional": "key_bbref"}) + if "Name-additional" in pit_data: + pit_data = pit_data.rename(columns={"Name-additional": "key_bbref"}) + if "Team" in pit_data: + pit_data = pit_data.rename(columns={"Team": "Tm"}) + + pit_data = pit_data[["key_bbref", "Tm", "GF", "SHO", "SV", "IP", "BK", "WP"]] + pit_data = pit_data.fillna(0) return pit_data -def get_player_ids(plays: pd.DataFrame, which: Literal['batters', 'pitchers']) -> pd.DataFrame: - RETRO_PLAYERS = pd.read_csv(f'{RETRO_FILE_PATH}{PERSONNEL_FILENAME}') - id_key 
= 'batter_id' if which == 'batters' else 'pitcher_id' +def get_player_ids( + plays: pd.DataFrame, which: Literal["batters", "pitchers"] +) -> pd.DataFrame: + RETRO_PLAYERS = pd.read_csv(f"{RETRO_FILE_PATH}{PERSONNEL_FILENAME}") + id_key = "batter_id" if which == "batters" else "pitcher_id" players = pd.DataFrame() - unique_players = pd.Series(plays[id_key].unique()).to_frame('id') + unique_players = pd.Series(plays[id_key].unique()).to_frame("id") players = pd.merge( left=RETRO_PLAYERS, right=unique_players, - how='right', - left_on='id', - right_on='id' - ).rename(columns={'id': id_key}) + how="right", + left_on="id", + right_on="id", + ).rename(columns={"id": id_key}) - if PLAYER_DESCRIPTION not in ['Live', '1998']: - msg = f'Player description is *{PLAYER_DESCRIPTION}* so dropping players not in PROMO_INCLUSION_RETRO_IDS' + if PLAYER_DESCRIPTION not in ["Live", "1998"]: + msg = f"Player description is *{PLAYER_DESCRIPTION}* so dropping players not in PROMO_INCLUSION_RETRO_IDS" print(msg) logger.info(msg) # players = players.drop(players[players.index not in PROMO_INCLUSION_RETRO_IDS].index) @@ -189,351 +263,737 @@ def get_player_ids(plays: pd.DataFrame, which: Literal['batters', 'pitchers']) - def get_pids(row): # return get_all_pybaseball_ids([row[id_key]], 'retro', full_name=f'{row["use_name"]} {row["last_name"]}') - pull = pb.playerid_reverse_lookup([row[id_key]], key_type='retro') + pull = pb.playerid_reverse_lookup([row[id_key]], key_type="retro") if len(pull.values) == 0: - print(f'Could not find id {row[id_key]} in pybaseball lookup') - return pull.loc[0][['key_mlbam', 'key_retro', 'key_bbref', 'key_fangraphs']] - - players = players[[id_key, 'last_name', 'use_name']] + print(f"Could not find id {row[id_key]} in pybaseball lookup") + return pull.loc[0][["key_mlbam", "key_retro", "key_bbref", "key_fangraphs"]] + + players = players[[id_key, "last_name", "use_name"]] start_time = datetime.datetime.now() other_ids = players.apply(get_pids, axis=1) end_time = 
datetime.datetime.now() - print(f'ID lookup: {(end_time - start_time).total_seconds():.2f}s') + print(f"ID lookup: {(end_time - start_time).total_seconds():.2f}s") def clean_first(row): - return sanitize_name(row['use_name']) + return sanitize_name(row["use_name"]) + def clean_last(row): - return sanitize_name(row['last_name']) - - players['use_name'] = players.apply(clean_first, axis=1) - players['last_name'] = players.apply(clean_last, axis=1) + return sanitize_name(row["last_name"]) + + players["use_name"] = players.apply(clean_first, axis=1) + players["last_name"] = players.apply(clean_last, axis=1) players = pd.merge( - left=players, - right=other_ids, - left_on=id_key, - right_on='key_retro' + left=players, right=other_ids, left_on=id_key, right_on="key_retro" ) players = players.set_index(id_key) def get_bat_hand(row): - pa_vl = plays[(plays.batter_id == row['key_retro']) & (plays.pitcher_hand == 'l')].groupby('result_batter_hand').count()['game_id'].astype(int) - pa_vr = plays[(plays.batter_id == row['key_retro']) & (plays.pitcher_hand == 'r')].groupby('result_batter_hand').count()['game_id'].astype(int) + pa_vl = ( + plays[(plays.batter_id == row["key_retro"]) & (plays.pitcher_hand == "l")] + .groupby("result_batter_hand") + .count()["game_id"] + .astype(int) + ) + pa_vr = ( + plays[(plays.batter_id == row["key_retro"]) & (plays.pitcher_hand == "r")] + .groupby("result_batter_hand") + .count()["game_id"] + .astype(int) + ) - l_vs_l = 0 if 'l' not in pa_vl else pa_vl['l'] - l_vs_r = 0 if 'l' not in pa_vr else pa_vr['l'] - r_vs_l = 0 if 'r' not in pa_vl else pa_vl['r'] - r_vs_r = 0 if 'r' not in pa_vr else pa_vr['r'] + l_vs_l = 0 if "l" not in pa_vl else pa_vl["l"] + l_vs_r = 0 if "l" not in pa_vr else pa_vr["l"] + r_vs_l = 0 if "r" not in pa_vl else pa_vl["r"] + r_vs_r = 0 if "r" not in pa_vr else pa_vr["r"] # If player ONLY batted from one side (zero PAs from other side), classify as single-handed if sum([l_vs_l, l_vs_r]) == 0 and sum([r_vs_l, r_vs_r]) > 
0: - return 'R' + return "R" elif sum([l_vs_l, l_vs_r]) > 0 and sum([r_vs_l, r_vs_r]) == 0: - return 'L' + return "L" # If player batted from both sides (even if limited sample), they're a switch hitter # This correctly identifies switch hitters regardless of total PA count if sum([l_vs_l, l_vs_r]) > 0 and sum([r_vs_l, r_vs_r]) > 0: - return 'S' + return "S" # Fallback for edge cases (shouldn't reach here in normal flow) if sum([l_vs_l, l_vs_r]) > sum([r_vs_l, r_vs_r]): - return 'L' + return "L" else: - return 'R' - + return "R" + def get_pitch_hand(row): - first_event = plays.drop_duplicates('pitcher_id').loc[plays.pitcher_id == row['key_retro'], 'pitcher_hand'] + first_event = plays.drop_duplicates("pitcher_id").loc[ + plays.pitcher_id == row["key_retro"], "pitcher_hand" + ] return first_event.item() - if which == 'batters': - players['bat_hand'] = players.apply(get_bat_hand, axis=1) - elif which == 'pitchers': - players['pitch_hand'] = players.apply(get_pitch_hand, axis=1) + if which == "batters": + players["bat_hand"] = players.apply(get_bat_hand, axis=1) + elif which == "pitchers": + players["pitch_hand"] = players.apply(get_pitch_hand, axis=1) return players -def get_base_batting_df(file_path: str, start_date: int, end_date: int) -> list[pd.DataFrame, pd.DataFrame]: +def get_base_batting_df( + file_path: str, start_date: int, end_date: int +) -> list[pd.DataFrame, pd.DataFrame]: all_plays = load_retrosheet_csv(file_path) - all_plays['date'] = all_plays['game_id'].str[3:-1].astype(int) - date_plays = all_plays[(all_plays.date >= start_date) & (all_plays.date <= end_date)] + all_plays["date"] = all_plays["game_id"].str[3:-1].astype(int) + date_plays = all_plays[ + (all_plays.date >= start_date) & (all_plays.date <= end_date) + ] - all_player_ids = get_player_ids(all_plays, 'batters') + all_player_ids = get_player_ids(all_plays, "batters") - pal_series = date_plays[(date_plays.batter_event == 't') & (date_plays.pitcher_hand == 
'l')].groupby('batter_id').count()['event_type'].astype(int).rename('PA_vL') + pal_series = ( + date_plays[(date_plays.batter_event == "t") & (date_plays.pitcher_hand == "l")] + .groupby("batter_id") + .count()["event_type"] + .astype(int) + .rename("PA_vL") + ) bs = pd.concat([all_player_ids, pal_series], axis=1) - par_series = date_plays[(date_plays.batter_event == 't') & (date_plays.pitcher_hand == 'r')].groupby('batter_id').count()['event_type'].astype(int).rename('PA_vR') + par_series = ( + date_plays[(date_plays.batter_event == "t") & (date_plays.pitcher_hand == "r")] + .groupby("batter_id") + .count()["event_type"] + .astype(int) + .rename("PA_vR") + ) bs = pd.concat([bs, par_series], axis=1) - abl_series = date_plays[(date_plays.ab == 't') & (date_plays.pitcher_hand == 'l')].groupby('batter_id').count()['event_type'].astype(int).rename('AB_vL') + abl_series = ( + date_plays[(date_plays.ab == "t") & (date_plays.pitcher_hand == "l")] + .groupby("batter_id") + .count()["event_type"] + .astype(int) + .rename("AB_vL") + ) bs = pd.concat([bs, abl_series], axis=1) - abr_series = date_plays[(date_plays.ab == 't') & (date_plays.pitcher_hand == 'r')].groupby('batter_id').count()['event_type'].astype(int).rename('AB_vR') + abr_series = ( + date_plays[(date_plays.ab == "t") & (date_plays.pitcher_hand == "r")] + .groupby("batter_id") + .count()["event_type"] + .astype(int) + .rename("AB_vR") + ) bs = pd.concat([bs, abr_series], axis=1) - core_df = bs.dropna().query(f'PA_vL >= {MIN_PA_VL} & PA_vR >= {MIN_PA_VR}') - if LAST_WEEK_RATIO == 0.0 and LAST_TWOWEEKS_RATIO == 0.0 and LAST_MONTH_RATIO == 0.0: + core_df = bs.dropna().query(f"PA_vL >= {MIN_PA_VL} & PA_vR >= {MIN_PA_VR}") + if ( + LAST_WEEK_RATIO == 0.0 + and LAST_TWOWEEKS_RATIO == 0.0 + and LAST_MONTH_RATIO == 0.0 + ): return [date_plays, core_df] base_num_weeks = weeks_between(start_date, end_date) if LAST_WEEK_RATIO > 0: - new_start = date_math(end_date, '-', day_delta=7) - week_plays = date_plays[(date_plays.date 
>= int(new_start)) & (date_plays.date <= end_date)] + new_start = date_math(end_date, "-", day_delta=7) + week_plays = date_plays[ + (date_plays.date >= int(new_start)) & (date_plays.date <= end_date) + ] copies = round(base_num_weeks * LAST_WEEK_RATIO) for x in range(copies): date_plays = pd.concat([date_plays, week_plays], ignore_index=True) - + if LAST_TWOWEEKS_RATIO > 0: - new_start = date_math(end_date, '-', day_delta=14) - week_plays = date_plays[(date_plays.date >= int(new_start)) & (date_plays.date <= end_date)] + new_start = date_math(end_date, "-", day_delta=14) + week_plays = date_plays[ + (date_plays.date >= int(new_start)) & (date_plays.date <= end_date) + ] copies = round(base_num_weeks * LAST_TWOWEEKS_RATIO) for x in range(copies): date_plays = pd.concat([date_plays, week_plays], ignore_index=True) - + if LAST_MONTH_RATIO > 0: - new_start = date_math(end_date, '-', month_delta=1) - week_plays = date_plays[(date_plays.date >= int(new_start)) & (date_plays.date <= end_date)] + new_start = date_math(end_date, "-", month_delta=1) + week_plays = date_plays[ + (date_plays.date >= int(new_start)) & (date_plays.date <= end_date) + ] copies = round(base_num_weeks * LAST_MONTH_RATIO) for x in range(copies): date_plays = pd.concat([date_plays, week_plays], ignore_index=True) - core_df = core_df.drop(columns=['PA_vL', 'PA_vR', 'AB_vL', 'AB_vR']) + core_df = core_df.drop(columns=["PA_vL", "PA_vR", "AB_vL", "AB_vR"]) - pal_series = date_plays[(date_plays.batter_event == 't') & (date_plays.pitcher_hand == 'l')].groupby('batter_id').count()['event_type'].astype(int).rename('PA_vL') - core_df['PA_vL'] = pal_series + pal_series = ( + date_plays[(date_plays.batter_event == "t") & (date_plays.pitcher_hand == "l")] + .groupby("batter_id") + .count()["event_type"] + .astype(int) + .rename("PA_vL") + ) + core_df["PA_vL"] = pal_series - par_series = date_plays[(date_plays.batter_event == 't') & (date_plays.pitcher_hand == 
'r')].groupby('batter_id').count()['event_type'].astype(int).rename('PA_vR') - core_df['PA_vR'] = par_series + par_series = ( + date_plays[(date_plays.batter_event == "t") & (date_plays.pitcher_hand == "r")] + .groupby("batter_id") + .count()["event_type"] + .astype(int) + .rename("PA_vR") + ) + core_df["PA_vR"] = par_series - abl_series = date_plays[(date_plays.ab == 't') & (date_plays.pitcher_hand == 'l')].groupby('batter_id').count()['event_type'].astype(int).rename('AB_vL') - core_df['AB_vL'] = abl_series + abl_series = ( + date_plays[(date_plays.ab == "t") & (date_plays.pitcher_hand == "l")] + .groupby("batter_id") + .count()["event_type"] + .astype(int) + .rename("AB_vL") + ) + core_df["AB_vL"] = abl_series - abr_series = date_plays[(date_plays.ab == 't') & (date_plays.pitcher_hand == 'r')].groupby('batter_id').count()['event_type'].astype(int).rename('AB_vR') - core_df['AB_vR'] = abr_series + abr_series = ( + date_plays[(date_plays.ab == "t") & (date_plays.pitcher_hand == "r")] + .groupby("batter_id") + .count()["event_type"] + .astype(int) + .rename("AB_vR") + ) + core_df["AB_vR"] = abr_series return [date_plays, core_df] -def get_base_pitching_df(file_path: str, start_date: int, end_date: int) -> list[pd.DataFrame, pd.DataFrame]: +def get_base_pitching_df( + file_path: str, start_date: int, end_date: int +) -> list[pd.DataFrame, pd.DataFrame]: all_plays = load_retrosheet_csv(file_path) - all_plays['date'] = all_plays['game_id'].str[3:-1].astype(int) - date_plays = all_plays[(all_plays.date >= start_date) & (all_plays.date <= end_date)] + all_plays["date"] = all_plays["game_id"].str[3:-1].astype(int) + date_plays = all_plays[ + (all_plays.date >= start_date) & (all_plays.date <= end_date) + ] - ps = get_player_ids(all_plays, 'pitchers') + ps = get_player_ids(all_plays, "pitchers") - tbfl_series = date_plays[(date_plays.batter_event == 't') & (date_plays.batter_hand == 'l')].groupby('pitcher_id').count()['event_type'].astype(int).rename('TBF_vL') + 
tbfl_series = ( + date_plays[(date_plays.batter_event == "t") & (date_plays.batter_hand == "l")] + .groupby("pitcher_id") + .count()["event_type"] + .astype(int) + .rename("TBF_vL") + ) ps = pd.concat([ps, tbfl_series], axis=1) - tbfr_series = date_plays[(date_plays.batter_event == 't') & (date_plays.batter_hand == 'r')].groupby('pitcher_id').count()['event_type'].astype(int).rename('TBF_vR') + tbfr_series = ( + date_plays[(date_plays.batter_event == "t") & (date_plays.batter_hand == "r")] + .groupby("pitcher_id") + .count()["event_type"] + .astype(int) + .rename("TBF_vR") + ) ps = pd.concat([ps, tbfr_series], axis=1) - abl_series = date_plays[(date_plays.ab == 't') & (date_plays.batter_hand == 'l')].groupby('pitcher_id').count()['event_type'].astype(int).rename('AB_vL') + abl_series = ( + date_plays[(date_plays.ab == "t") & (date_plays.batter_hand == "l")] + .groupby("pitcher_id") + .count()["event_type"] + .astype(int) + .rename("AB_vL") + ) ps = pd.concat([ps, abl_series], axis=1) - abr_series = date_plays[(date_plays.ab == 't') & (date_plays.batter_hand == 'r')].groupby('pitcher_id').count()['event_type'].astype(int).rename('AB_vR') + abr_series = ( + date_plays[(date_plays.ab == "t") & (date_plays.batter_hand == "r")] + .groupby("pitcher_id") + .count()["event_type"] + .astype(int) + .rename("AB_vR") + ) ps = pd.concat([ps, abr_series], axis=1) - if PLAYER_DESCRIPTION in ['Live', '1998']: - core_df = ps.dropna().query(f'TBF_vL >= {MIN_TBF_VL} & TBF_vR >= {MIN_TBF_VR}') + if PLAYER_DESCRIPTION in ["Live", "1998"]: + core_df = ps.dropna().query(f"TBF_vL >= {MIN_TBF_VL} & TBF_vR >= {MIN_TBF_VR}") else: core_df = ps.dropna() - if LAST_WEEK_RATIO == 0.0 and LAST_TWOWEEKS_RATIO == 0.0 and LAST_MONTH_RATIO == 0.0: + if ( + LAST_WEEK_RATIO == 0.0 + and LAST_TWOWEEKS_RATIO == 0.0 + and LAST_MONTH_RATIO == 0.0 + ): return [date_plays, core_df] base_num_weeks = weeks_between(start_date, end_date) if LAST_WEEK_RATIO > 0: - new_start = date_math(end_date, '-', day_delta=7) 
- week_plays = date_plays[(date_plays.date >= int(new_start)) & (date_plays.date <= end_date)] + new_start = date_math(end_date, "-", day_delta=7) + week_plays = date_plays[ + (date_plays.date >= int(new_start)) & (date_plays.date <= end_date) + ] copies = round(base_num_weeks * LAST_WEEK_RATIO) for x in range(copies): date_plays = pd.concat([date_plays, week_plays], ignore_index=True) - + if LAST_TWOWEEKS_RATIO > 0: - new_start = date_math(end_date, '-', day_delta=14) - week_plays = date_plays[(date_plays.date >= int(new_start)) & (date_plays.date <= end_date)] + new_start = date_math(end_date, "-", day_delta=14) + week_plays = date_plays[ + (date_plays.date >= int(new_start)) & (date_plays.date <= end_date) + ] copies = round(base_num_weeks * LAST_TWOWEEKS_RATIO) for x in range(copies): date_plays = pd.concat([date_plays, week_plays], ignore_index=True) - + if LAST_MONTH_RATIO > 0: - new_start = date_math(end_date, '-', month_delta=1) - week_plays = date_plays[(date_plays.date >= int(new_start)) & (date_plays.date <= end_date)] + new_start = date_math(end_date, "-", month_delta=1) + week_plays = date_plays[ + (date_plays.date >= int(new_start)) & (date_plays.date <= end_date) + ] copies = round(base_num_weeks * LAST_MONTH_RATIO) for x in range(copies): date_plays = pd.concat([date_plays, week_plays], ignore_index=True) - core_df = core_df.drop(columns=['TBF_vL', 'TBF_vR', 'AB_vL', 'AB_vR']) + core_df = core_df.drop(columns=["TBF_vL", "TBF_vR", "AB_vL", "AB_vR"]) - tbfl_series = date_plays[(date_plays.batter_event == 't') & (date_plays.batter_hand == 'l')].groupby('pitcher_id').count()['event_type'].astype(int).rename('TBF_vL') - core_df['TBF_vL'] = tbfl_series + tbfl_series = ( + date_plays[(date_plays.batter_event == "t") & (date_plays.batter_hand == "l")] + .groupby("pitcher_id") + .count()["event_type"] + .astype(int) + .rename("TBF_vL") + ) + core_df["TBF_vL"] = tbfl_series - tbfr_series = date_plays[(date_plays.batter_event == 't') & (date_plays.batter_hand 
== 'r')].groupby('pitcher_id').count()['event_type'].astype(int).rename('TBF_vR') - core_df['TBF_vR'] = tbfr_series + tbfr_series = ( + date_plays[(date_plays.batter_event == "t") & (date_plays.batter_hand == "r")] + .groupby("pitcher_id") + .count()["event_type"] + .astype(int) + .rename("TBF_vR") + ) + core_df["TBF_vR"] = tbfr_series - abl_series = date_plays[(date_plays.ab == 't') & (date_plays.batter_hand == 'l')].groupby('pitcher_id').count()['event_type'].astype(int).rename('AB_vL') - core_df['AB_vL'] = abl_series + abl_series = ( + date_plays[(date_plays.ab == "t") & (date_plays.batter_hand == "l")] + .groupby("pitcher_id") + .count()["event_type"] + .astype(int) + .rename("AB_vL") + ) + core_df["AB_vL"] = abl_series - abr_series = date_plays[(date_plays.ab == 't') & (date_plays.batter_hand == 'r')].groupby('pitcher_id').count()['event_type'].astype(int).rename('AB_vR') - core_df['AB_vR'] = abr_series + abr_series = ( + date_plays[(date_plays.ab == "t") & (date_plays.batter_hand == "r")] + .groupby("pitcher_id") + .count()["event_type"] + .astype(int) + .rename("AB_vR") + ) + core_df["AB_vR"] = abr_series return [date_plays, core_df] def get_med_vL(row): - high = 0.9 - row['Hard%_vL'] - low = (row['SLG_vL'] - row['AVG_vL']) * 1.5 - return round(max(min(high, low),0.1), 5) + high = 0.9 - row["Hard%_vL"] + low = (row["SLG_vL"] - row["AVG_vL"]) * 1.5 + return round(max(min(high, low), 0.1), 5) + + def get_med_vR(row): - high = 0.9 - row['Hard%_vR'] - low = (row['SLG_vR'] - row['AVG_vR']) * 1.5 - return round(max(min(high, low),0.1), 5) + high = 0.9 - row["Hard%_vR"] + low = (row["SLG_vR"] - row["AVG_vR"]) * 1.5 + return round(max(min(high, low), 0.1), 5) -def get_batting_stats_by_date(retro_file_path, start_date: int, end_date: int) -> pd.DataFrame: +def get_batting_stats_by_date( + retro_file_path, start_date: int, end_date: int +) -> pd.DataFrame: start = datetime.datetime.now() - all_plays, batting_stats = get_base_batting_df(retro_file_path, start_date, 
end_date) - print(f'Get base dataframe: {(datetime.datetime.now() - start).total_seconds():.2f}s') + all_plays, batting_stats = get_base_batting_df( + retro_file_path, start_date, end_date + ) + print( + f"Get base dataframe: {(datetime.datetime.now() - start).total_seconds():.2f}s" + ) start = datetime.datetime.now() - all_player_ids = batting_stats['key_retro'] - logging.info(f'all_player_ids: {all_player_ids}') - all_plays = all_plays[all_plays['batter_id'].isin(all_player_ids)] - print(f'Shrink all_plays: {(datetime.datetime.now() - start).total_seconds():.2f}s') + all_player_ids = batting_stats["key_retro"] + logging.info(f"all_player_ids: {all_player_ids}") + all_plays = all_plays[all_plays["batter_id"].isin(all_player_ids)] + print(f"Shrink all_plays: {(datetime.datetime.now() - start).total_seconds():.2f}s") # Basic counting stats start = datetime.datetime.now() for event_type, vs_hand, col_name in [ - ('home run', 'r', 'HR_vR'), - ('home run', 'l', 'HR_vL'), - ('single', 'r', '1B_vR'), - ('single', 'l', '1B_vL'), - ('double', 'r', '2B_vR'), - ('double', 'l', '2B_vL'), - ('triple', 'r', '3B_vR'), - ('triple', 'l', '3B_vL'), - ('walk', 'r', 'BB_vR'), - ('walk', 'l', 'BB_vL'), - ('strikeout', 'r', 'SO_vR'), - ('strikeout', 'l', 'SO_vL'), - ('hit by pitch', 'r', 'HBP_vR'), - ('hit by pitch', 'l', 'HBP_vL') + ("home run", "r", "HR_vR"), + ("home run", "l", "HR_vL"), + ("single", "r", "1B_vR"), + ("single", "l", "1B_vL"), + ("double", "r", "2B_vR"), + ("double", "l", "2B_vL"), + ("triple", "r", "3B_vR"), + ("triple", "l", "3B_vL"), + ("walk", "r", "BB_vR"), + ("walk", "l", "BB_vL"), + ("strikeout", "r", "SO_vR"), + ("strikeout", "l", "SO_vL"), + ("hit by pitch", "r", "HBP_vR"), + ("hit by pitch", "l", "HBP_vL"), ]: - this_series = get_batting_result_series(all_plays, event_type, vs_hand, col_name) + this_series = get_batting_result_series( + all_plays, event_type, vs_hand, col_name + ) batting_stats[col_name] = this_series - print(f'Count basic stats: 
{(datetime.datetime.now() - start).total_seconds():.2f}s') - + print( + f"Count basic stats: {(datetime.datetime.now() - start).total_seconds():.2f}s" + ) + # Bespoke counting stats start = datetime.datetime.now() + def get_fb_vl(row): - return all_plays[(all_plays.batter_id == row['key_retro']) & (all_plays.batted_ball_type == 'f') & (all_plays.pitcher_hand == 'l')].count()['event_type'].astype(int) + return ( + all_plays[ + (all_plays.batter_id == row["key_retro"]) + & (all_plays.batted_ball_type == "f") + & (all_plays.pitcher_hand == "l") + ] + .count()["event_type"] + .astype(int) + ) + def get_fb_vr(row): - return all_plays[(all_plays.batter_id == row['key_retro']) & (all_plays.batted_ball_type == 'f') & (all_plays.pitcher_hand == 'r')].count()['event_type'].astype(int) - + return ( + all_plays[ + (all_plays.batter_id == row["key_retro"]) + & (all_plays.batted_ball_type == "f") + & (all_plays.pitcher_hand == "r") + ] + .count()["event_type"] + .astype(int) + ) + def get_gb_vl(row): - return all_plays[(all_plays.batter_id == row['key_retro']) & (all_plays.batted_ball_type == 'G') & (all_plays.pitcher_hand == 'l')].count()['event_type'].astype(int) + return ( + all_plays[ + (all_plays.batter_id == row["key_retro"]) + & (all_plays.batted_ball_type == "G") + & (all_plays.pitcher_hand == "l") + ] + .count()["event_type"] + .astype(int) + ) + def get_gb_vr(row): - return all_plays[(all_plays.batter_id == row['key_retro']) & (all_plays.batted_ball_type == 'G') & (all_plays.pitcher_hand == 'r')].count()['event_type'].astype(int) - + return ( + all_plays[ + (all_plays.batter_id == row["key_retro"]) + & (all_plays.batted_ball_type == "G") + & (all_plays.pitcher_hand == "r") + ] + .count()["event_type"] + .astype(int) + ) + def get_ld_vl(row): - return all_plays[(all_plays.batter_id == row['key_retro']) & (all_plays.batted_ball_type == 'l') & (all_plays.pitcher_hand == 'l')].count()['event_type'].astype(int) + return ( + all_plays[ + (all_plays.batter_id == 
row["key_retro"]) + & (all_plays.batted_ball_type == "l") + & (all_plays.pitcher_hand == "l") + ] + .count()["event_type"] + .astype(int) + ) + def get_ld_vr(row): - return all_plays[(all_plays.batter_id == row['key_retro']) & (all_plays.batted_ball_type == 'l') & (all_plays.pitcher_hand == 'r')].count()['event_type'].astype(int) - + return ( + all_plays[ + (all_plays.batter_id == row["key_retro"]) + & (all_plays.batted_ball_type == "l") + & (all_plays.pitcher_hand == "r") + ] + .count()["event_type"] + .astype(int) + ) + def get_gdp_vl(row): - dp = all_plays[(all_plays.batter_id == row['key_retro']) & (all_plays.batter_event == 't') & (all_plays.pitcher_hand == 'l') & (all_plays.dp == 't')].count()['event_type'].astype(int) - tp = all_plays[(all_plays.batter_id == row['key_retro']) & (all_plays.batter_event == 't') & (all_plays.pitcher_hand == 'l') & (all_plays.tp == 't')].count()['event_type'].astype(int) + dp = ( + all_plays[ + (all_plays.batter_id == row["key_retro"]) + & (all_plays.batter_event == "t") + & (all_plays.pitcher_hand == "l") + & (all_plays.dp == "t") + ] + .count()["event_type"] + .astype(int) + ) + tp = ( + all_plays[ + (all_plays.batter_id == row["key_retro"]) + & (all_plays.batter_event == "t") + & (all_plays.pitcher_hand == "l") + & (all_plays.tp == "t") + ] + .count()["event_type"] + .astype(int) + ) return dp + tp + def get_gdp_vr(row): - dp = all_plays[(all_plays.batter_id == row['key_retro']) & (all_plays.batter_event == 't') & (all_plays.pitcher_hand == 'r') & (all_plays.dp == 't')].count()['event_type'].astype(int) - tp = all_plays[(all_plays.batter_id == row['key_retro']) & (all_plays.batter_event == 't') & (all_plays.pitcher_hand == 'r') & (all_plays.tp == 't')].count()['event_type'].astype(int) + dp = ( + all_plays[ + (all_plays.batter_id == row["key_retro"]) + & (all_plays.batter_event == "t") + & (all_plays.pitcher_hand == "r") + & (all_plays.dp == "t") + ] + .count()["event_type"] + .astype(int) + ) + tp = ( + all_plays[ + 
(all_plays.batter_id == row["key_retro"]) + & (all_plays.batter_event == "t") + & (all_plays.pitcher_hand == "r") + & (all_plays.tp == "t") + ] + .count()["event_type"] + .astype(int) + ) return dp + tp - + def get_bunt(row): - return all_plays[(all_plays.batter_id == row['key_retro']) & (all_plays.bunt == 't')].count()['event_type'].astype(int) - - batting_stats['FB_vL'] = batting_stats.apply(get_fb_vl, axis=1) - batting_stats['FB_vR'] = batting_stats.apply(get_fb_vr, axis=1) + return ( + all_plays[ + (all_plays.batter_id == row["key_retro"]) & (all_plays.bunt == "t") + ] + .count()["event_type"] + .astype(int) + ) - batting_stats['GB_vL'] = batting_stats.apply(get_gb_vl, axis=1) - batting_stats['GB_vR'] = batting_stats.apply(get_gb_vr, axis=1) + batting_stats["FB_vL"] = batting_stats.apply(get_fb_vl, axis=1) + batting_stats["FB_vR"] = batting_stats.apply(get_fb_vr, axis=1) - batting_stats['LD_vL'] = batting_stats.apply(get_ld_vl, axis=1) - batting_stats['LD_vR'] = batting_stats.apply(get_ld_vr, axis=1) + batting_stats["GB_vL"] = batting_stats.apply(get_gb_vl, axis=1) + batting_stats["GB_vR"] = batting_stats.apply(get_gb_vr, axis=1) - batting_stats['GDP_vL'] = batting_stats.apply(get_gdp_vl, axis=1) - batting_stats['GDP_vR'] = batting_stats.apply(get_gdp_vr, axis=1) + batting_stats["LD_vL"] = batting_stats.apply(get_ld_vl, axis=1) + batting_stats["LD_vR"] = batting_stats.apply(get_ld_vr, axis=1) - batting_stats['Bunts'] = batting_stats.apply(get_bunt, axis=1) - print(f'Custom counting stats: {(datetime.datetime.now() - start).total_seconds():.2f}s') + batting_stats["GDP_vL"] = batting_stats.apply(get_gdp_vl, axis=1) + batting_stats["GDP_vR"] = batting_stats.apply(get_gdp_vr, axis=1) + + batting_stats["Bunts"] = batting_stats.apply(get_bunt, axis=1) + print( + f"Custom counting stats: {(datetime.datetime.now() - start).total_seconds():.2f}s" + ) # Infield Hit % - ifh_vl = all_plays[(all_plays.hit_val.str.contains('1|2|3')) & (all_plays.pitcher_hand == 'l') & 
(all_plays.hit_location.str.contains('1|2|3|4|5|6')) & (~all_plays.hit_location.str.contains('D', na=False))].groupby('batter_id').count()['event_type'].astype(int).rename('ifh_vL') - ifh_vr = all_plays[(all_plays.hit_val.str.contains('1|2|3')) & (all_plays.pitcher_hand == 'r') & (all_plays.hit_location.str.contains('1|2|3|4|5|6')) & (~all_plays.hit_location.str.contains('D', na=False))].groupby('batter_id').count()['event_type'].astype(int).rename('ifh_vR') - - batting_stats['ifh_vL'] = ifh_vl - batting_stats['ifh_vR'] = ifh_vr + ifh_vl = ( + all_plays[ + (all_plays.hit_val.str.contains("1|2|3")) + & (all_plays.pitcher_hand == "l") + & (all_plays.hit_location.str.contains("1|2|3|4|5|6")) + & (~all_plays.hit_location.str.contains("D", na=False)) + ] + .groupby("batter_id") + .count()["event_type"] + .astype(int) + .rename("ifh_vL") + ) + ifh_vr = ( + all_plays[ + (all_plays.hit_val.str.contains("1|2|3")) + & (all_plays.pitcher_hand == "r") + & (all_plays.hit_location.str.contains("1|2|3|4|5|6")) + & (~all_plays.hit_location.str.contains("D", na=False)) + ] + .groupby("batter_id") + .count()["event_type"] + .astype(int) + .rename("ifh_vR") + ) + + batting_stats["ifh_vL"] = ifh_vl + batting_stats["ifh_vR"] = ifh_vr def get_pull_vl(row): - pull_loc = '5|7' if row['bat_hand'] != 'L' else '3|9' - x = all_plays[(all_plays.batter_id == row['key_retro']) & (all_plays.pitcher_hand == 'l') & (all_plays.hit_location.str.contains(pull_loc))].count()['event_type'].astype(int) + pull_loc = "5|7" if row["bat_hand"] != "L" else "3|9" + x = ( + all_plays[ + (all_plays.batter_id == row["key_retro"]) + & (all_plays.pitcher_hand == "l") + & (all_plays.hit_location.str.contains(pull_loc)) + ] + .count()["event_type"] + .astype(int) + ) return x + def get_pull_vr(row): - pull_loc = '5|7' if row['bat_hand'] == 'R' else '3|9' - x = all_plays[(all_plays.batter_id == row['key_retro']) & (all_plays.pitcher_hand == 'r') & 
(all_plays.hit_location.str.contains(pull_loc))].count()['event_type'].astype(int) + pull_loc = "5|7" if row["bat_hand"] == "R" else "3|9" + x = ( + all_plays[ + (all_plays.batter_id == row["key_retro"]) + & (all_plays.pitcher_hand == "r") + & (all_plays.hit_location.str.contains(pull_loc)) + ] + .count()["event_type"] + .astype(int) + ) return x # Bespoke Queries - batting_stats['pull_vL'] = batting_stats.apply(get_pull_vl, axis=1) - batting_stats['pull_vR'] = batting_stats.apply(get_pull_vr, axis=1) + batting_stats["pull_vL"] = batting_stats.apply(get_pull_vl, axis=1) + batting_stats["pull_vR"] = batting_stats.apply(get_pull_vr, axis=1) - center_vl = all_plays[(all_plays.pitcher_hand == 'l') & (all_plays.hit_location.str.contains('1|4|6|8'))].groupby('batter_id').count()['event_type'].astype(int).rename('center_vl') - center_vr = all_plays[(all_plays.pitcher_hand == 'r') & (all_plays.hit_location.str.contains('1|4|6|8'))].groupby('batter_id').count()['event_type'].astype(int).rename('center_vr') + center_vl = ( + all_plays[ + (all_plays.pitcher_hand == "l") + & (all_plays.hit_location.str.contains("1|4|6|8")) + ] + .groupby("batter_id") + .count()["event_type"] + .astype(int) + .rename("center_vl") + ) + center_vr = ( + all_plays[ + (all_plays.pitcher_hand == "r") + & (all_plays.hit_location.str.contains("1|4|6|8")) + ] + .groupby("batter_id") + .count()["event_type"] + .astype(int) + .rename("center_vr") + ) - batting_stats['center_vL'] = center_vl - batting_stats['center_vR'] = center_vr + batting_stats["center_vL"] = center_vl + batting_stats["center_vR"] = center_vr - oppo_vl = all_plays[(all_plays.pitcher_hand == 'l') & (all_plays.hit_location.str.contains('5|7'))].groupby('batter_id').count()['event_type'].astype(int).rename('oppo_vL') - oppo_vr = all_plays[(all_plays.pitcher_hand == 'r') & (all_plays.hit_location.str.contains('5|7'))].groupby('batter_id').count()['event_type'].astype(int).rename('oppo_vR') + oppo_vl = ( + all_plays[ + 
(all_plays.pitcher_hand == "l") + & (all_plays.hit_location.str.contains("5|7")) + ] + .groupby("batter_id") + .count()["event_type"] + .astype(int) + .rename("oppo_vL") + ) + oppo_vr = ( + all_plays[ + (all_plays.pitcher_hand == "r") + & (all_plays.hit_location.str.contains("5|7")) + ] + .groupby("batter_id") + .count()["event_type"] + .astype(int) + .rename("oppo_vR") + ) + + batting_stats["oppo_vL"] = oppo_vl + batting_stats["oppo_vR"] = oppo_vr - batting_stats['oppo_vL'] = oppo_vl - batting_stats['oppo_vR'] = oppo_vr - # fill na to 0 following counting stats batting_stats = batting_stats.fillna(0) # Calculated Fields start = datetime.datetime.now() - batting_stats['H_vL'] = batting_stats['1B_vL'] + batting_stats['2B_vL'] + batting_stats['3B_vL'] + batting_stats['HR_vL'] - batting_stats['H_vR'] = batting_stats['1B_vR'] + batting_stats['2B_vR'] + batting_stats['3B_vR'] + batting_stats['HR_vR'] + batting_stats["H_vL"] = ( + batting_stats["1B_vL"] + + batting_stats["2B_vL"] + + batting_stats["3B_vL"] + + batting_stats["HR_vL"] + ) + batting_stats["H_vR"] = ( + batting_stats["1B_vR"] + + batting_stats["2B_vR"] + + batting_stats["3B_vR"] + + batting_stats["HR_vR"] + ) - batting_stats['AVG_vL'] = round(batting_stats['H_vL'] / batting_stats['AB_vL'], 5) - batting_stats['AVG_vR'] = round(batting_stats['H_vR'] / batting_stats['AB_vR'], 5) + batting_stats["AVG_vL"] = round(batting_stats["H_vL"] / batting_stats["AB_vL"], 5) + batting_stats["AVG_vR"] = round(batting_stats["H_vR"] / batting_stats["AB_vR"], 5) - batting_stats['OBP_vL'] = round((batting_stats['H_vL'] + batting_stats['BB_vL'] + batting_stats['HBP_vL']) / batting_stats['PA_vL'], 5) - batting_stats['OBP_vR'] = round((batting_stats['H_vR'] + batting_stats['BB_vR'] + batting_stats['HBP_vR']) / batting_stats['PA_vR'], 5) + batting_stats["OBP_vL"] = round( + (batting_stats["H_vL"] + batting_stats["BB_vL"] + batting_stats["HBP_vL"]) + / batting_stats["PA_vL"], + 5, + ) + batting_stats["OBP_vR"] = round( + 
(batting_stats["H_vR"] + batting_stats["BB_vR"] + batting_stats["HBP_vR"]) + / batting_stats["PA_vR"], + 5, + ) - batting_stats['SLG_vL'] = round((batting_stats['1B_vL'] + batting_stats['2B_vL'] * 2 + batting_stats['3B_vL'] * 3 + batting_stats['HR_vL'] * 4) / batting_stats['AB_vL'], 5) - batting_stats['SLG_vR'] = round((batting_stats['1B_vR'] + batting_stats['2B_vR'] * 2 + batting_stats['3B_vR'] * 3 + batting_stats['HR_vR'] * 4) / batting_stats['AB_vR'], 5) + batting_stats["SLG_vL"] = round( + ( + batting_stats["1B_vL"] + + batting_stats["2B_vL"] * 2 + + batting_stats["3B_vL"] * 3 + + batting_stats["HR_vL"] * 4 + ) + / batting_stats["AB_vL"], + 5, + ) + batting_stats["SLG_vR"] = round( + ( + batting_stats["1B_vR"] + + batting_stats["2B_vR"] * 2 + + batting_stats["3B_vR"] * 3 + + batting_stats["HR_vR"] * 4 + ) + / batting_stats["AB_vR"], + 5, + ) - batting_stats['HR/FB_vL'] = round(batting_stats['HR_vL'] / batting_stats['FB_vL'], 5) - batting_stats['HR/FB_vR'] = round(batting_stats['HR_vR'] / batting_stats['FB_vR'], 5) + batting_stats["HR/FB_vL"] = round( + batting_stats["HR_vL"] / batting_stats["FB_vL"], 5 + ) + batting_stats["HR/FB_vR"] = round( + batting_stats["HR_vR"] / batting_stats["FB_vR"], 5 + ) - batting_stats['FB%_vL'] = round(batting_stats['FB_vL'] / (batting_stats['FB_vL'] + batting_stats['GB_vL'] + batting_stats['LD_vL']), 5) - batting_stats['FB%_vR'] = round(batting_stats['FB_vR'] / (batting_stats['FB_vR'] + batting_stats['GB_vR'] + batting_stats['LD_vR']), 5) + batting_stats["FB%_vL"] = round( + batting_stats["FB_vL"] + / (batting_stats["FB_vL"] + batting_stats["GB_vL"] + batting_stats["LD_vL"]), + 5, + ) + batting_stats["FB%_vR"] = round( + batting_stats["FB_vR"] + / (batting_stats["FB_vR"] + batting_stats["GB_vR"] + batting_stats["LD_vR"]), + 5, + ) - batting_stats['GB%_vL'] = round(batting_stats['GB_vL'] / (batting_stats['FB_vL'] + batting_stats['GB_vL'] + batting_stats['LD_vL']), 5) - batting_stats['GB%_vR'] = round(batting_stats['GB_vR'] / 
(batting_stats['FB_vR'] + batting_stats['GB_vR'] + batting_stats['LD_vR']), 5) + batting_stats["GB%_vL"] = round( + batting_stats["GB_vL"] + / (batting_stats["FB_vL"] + batting_stats["GB_vL"] + batting_stats["LD_vL"]), + 5, + ) + batting_stats["GB%_vR"] = round( + batting_stats["GB_vR"] + / (batting_stats["FB_vR"] + batting_stats["GB_vR"] + batting_stats["LD_vR"]), + 5, + ) - batting_stats['LD%_vL'] = round(batting_stats['LD_vL'] / (batting_stats['FB_vL'] + batting_stats['GB_vL'] + batting_stats['LD_vL']), 5) - batting_stats['LD%_vR'] = round(batting_stats['LD_vR'] / (batting_stats['FB_vR'] + batting_stats['GB_vR'] + batting_stats['LD_vR']), 5) + batting_stats["LD%_vL"] = round( + batting_stats["LD_vL"] + / (batting_stats["FB_vL"] + batting_stats["GB_vL"] + batting_stats["LD_vL"]), + 5, + ) + batting_stats["LD%_vR"] = round( + batting_stats["LD_vR"] + / (batting_stats["FB_vR"] + batting_stats["GB_vR"] + batting_stats["LD_vR"]), + 5, + ) - batting_stats['Hard%_vL'] = round(0.2 + batting_stats['SLG_vL'] - batting_stats['AVG_vL'], 5) - batting_stats['Hard%_vR'] = round(0.2 + batting_stats['SLG_vR'] - batting_stats['AVG_vR'], 5) + batting_stats["Hard%_vL"] = round( + 0.2 + batting_stats["SLG_vL"] - batting_stats["AVG_vL"], 5 + ) + batting_stats["Hard%_vR"] = round( + 0.2 + batting_stats["SLG_vR"] - batting_stats["AVG_vR"], 5 + ) # def get_med_vL(row): # high = 0.9 - row['Hard%_vL'] @@ -544,148 +1004,367 @@ def get_batting_stats_by_date(retro_file_path, start_date: int, end_date: int) - # low = (row['SLG_vR'] - row['AVG_vR']) * 1.5 # return round(max(min(high, low),0.1), 5) - batting_stats['Med%_vL'] = batting_stats.apply(get_med_vL, axis=1) - batting_stats['Med%_vR'] = batting_stats.apply(get_med_vR, axis=1) + batting_stats["Med%_vL"] = batting_stats.apply(get_med_vL, axis=1) + batting_stats["Med%_vR"] = batting_stats.apply(get_med_vR, axis=1) - batting_stats['Soft%_vL'] = round(1 - batting_stats['Hard%_vL'] - batting_stats['Med%_vL'], 5) - batting_stats['Soft%_vR'] = 
round(1 - batting_stats['Hard%_vR'] - batting_stats['Med%_vR'], 5) + batting_stats["Soft%_vL"] = round( + 1 - batting_stats["Hard%_vL"] - batting_stats["Med%_vL"], 5 + ) + batting_stats["Soft%_vR"] = round( + 1 - batting_stats["Hard%_vR"] - batting_stats["Med%_vR"], 5 + ) - batting_stats['IFH%_vL'] = round(batting_stats['ifh_vL'] / batting_stats['H_vL'], 5) - batting_stats['IFH%_vR'] = round(batting_stats['ifh_vR'] / batting_stats['H_vR'], 5) + batting_stats["IFH%_vL"] = round(batting_stats["ifh_vL"] / batting_stats["H_vL"], 5) + batting_stats["IFH%_vR"] = round(batting_stats["ifh_vR"] / batting_stats["H_vR"], 5) - pull_val = round(batting_stats['pull_vL'] / (batting_stats['pull_vL'] + batting_stats['center_vL'] + batting_stats['oppo_vL']), 5) - batting_stats['Pull%_vL'] = pull_val.clip(0.1, 0.6) - pull_val = round(batting_stats['pull_vR'] / (batting_stats['pull_vR'] + batting_stats['center_vR'] + batting_stats['oppo_vR']), 5) - batting_stats['Pull%_vR'] = pull_val.clip(0.1, 0.6) + pull_val = round( + batting_stats["pull_vL"] + / ( + batting_stats["pull_vL"] + + batting_stats["center_vL"] + + batting_stats["oppo_vL"] + ), + 5, + ) + batting_stats["Pull%_vL"] = pull_val.clip(0.1, 0.6) + pull_val = round( + batting_stats["pull_vR"] + / ( + batting_stats["pull_vR"] + + batting_stats["center_vR"] + + batting_stats["oppo_vR"] + ), + 5, + ) + batting_stats["Pull%_vR"] = pull_val.clip(0.1, 0.6) - cent_val = round(batting_stats['center_vL'] / (batting_stats['pull_vL'] + batting_stats['center_vL'] + batting_stats['oppo_vL']), 5) - batting_stats['Cent%_vL'] = cent_val.clip(0.1, 0.6) - cent_val = round(batting_stats['center_vL'] / (batting_stats['pull_vR'] + batting_stats['center_vR'] + batting_stats['oppo_vR']), 5) - batting_stats['Cent%_vR'] = cent_val.clip(0.1, 0.6) + cent_val = round( + batting_stats["center_vL"] + / ( + batting_stats["pull_vL"] + + batting_stats["center_vL"] + + batting_stats["oppo_vL"] + ), + 5, + ) + batting_stats["Cent%_vL"] = cent_val.clip(0.1, 0.6) 
+ cent_val = round( + batting_stats["center_vL"] + / ( + batting_stats["pull_vR"] + + batting_stats["center_vR"] + + batting_stats["oppo_vR"] + ), + 5, + ) + batting_stats["Cent%_vR"] = cent_val.clip(0.1, 0.6) - batting_stats['Oppo%_vL'] = round(1 - batting_stats['Pull%_vL'] - batting_stats['Cent%_vL'], 5) - batting_stats['Oppo%_vR'] = round(1 - batting_stats['Pull%_vR'] - batting_stats['Cent%_vR'], 5) + batting_stats["Oppo%_vL"] = round( + 1 - batting_stats["Pull%_vL"] - batting_stats["Cent%_vL"], 5 + ) + batting_stats["Oppo%_vR"] = round( + 1 - batting_stats["Pull%_vR"] - batting_stats["Cent%_vR"], 5 + ) batting_stats = batting_stats.fillna(0) - print(f'Calculated fields: {(datetime.datetime.now() - start).total_seconds():.2f}s') + print( + f"Calculated fields: {(datetime.datetime.now() - start).total_seconds():.2f}s" + ) return batting_stats -def get_pitching_stats_by_date(retro_file_path, start_date: int, end_date: int) -> pd.DataFrame: +def get_pitching_stats_by_date( + retro_file_path, start_date: int, end_date: int +) -> pd.DataFrame: start = datetime.datetime.now() - all_plays, pitching_stats = get_base_pitching_df(retro_file_path, start_date, end_date) - print(f'Get base dataframe: {(datetime.datetime.now() - start).total_seconds():.2f}s') + all_plays, pitching_stats = get_base_pitching_df( + retro_file_path, start_date, end_date + ) + print( + f"Get base dataframe: {(datetime.datetime.now() - start).total_seconds():.2f}s" + ) start = datetime.datetime.now() - all_player_ids = pitching_stats['key_retro'] - all_plays = all_plays[all_plays['pitcher_id'].isin(all_player_ids)] - print(f'Shrink all_plays: {(datetime.datetime.now() - start).total_seconds():.2f}s') + all_player_ids = pitching_stats["key_retro"] + all_plays = all_plays[all_plays["pitcher_id"].isin(all_player_ids)] + print(f"Shrink all_plays: {(datetime.datetime.now() - start).total_seconds():.2f}s") # Basic counting stats start = datetime.datetime.now() for event_type, vs_hand, col_name in [ - 
('home run', 'r', 'HR_vR'), - ('home run', 'l', 'HR_vL'), - ('single', 'r', '1B_vR'), - ('single', 'l', '1B_vL'), - ('double', 'r', '2B_vR'), - ('double', 'l', '2B_vL'), - ('triple', 'r', '3B_vR'), - ('triple', 'l', '3B_vL'), - ('walk', 'r', 'BB_vR'), - ('walk', 'l', 'BB_vL'), - ('strikeout', 'r', 'SO_vR'), - ('strikeout', 'l', 'SO_vL'), - ('hit by pitch', 'r', 'HBP_vR'), - ('hit by pitch', 'l', 'HBP_vL'), - ('intentional walk', 'l', 'IBB_vL'), - ('intentional walk', 'r', 'IBB_vR') + ("home run", "r", "HR_vR"), + ("home run", "l", "HR_vL"), + ("single", "r", "1B_vR"), + ("single", "l", "1B_vL"), + ("double", "r", "2B_vR"), + ("double", "l", "2B_vL"), + ("triple", "r", "3B_vR"), + ("triple", "l", "3B_vL"), + ("walk", "r", "BB_vR"), + ("walk", "l", "BB_vL"), + ("strikeout", "r", "SO_vR"), + ("strikeout", "l", "SO_vL"), + ("hit by pitch", "r", "HBP_vR"), + ("hit by pitch", "l", "HBP_vL"), + ("intentional walk", "l", "IBB_vL"), + ("intentional walk", "r", "IBB_vR"), ]: - this_series = get_pitching_result_series(all_plays, event_type, vs_hand, col_name) + this_series = get_pitching_result_series( + all_plays, event_type, vs_hand, col_name + ) pitching_stats[col_name] = this_series - print(f'Count basic stats: {(datetime.datetime.now() - start).total_seconds():.2f}s') + print( + f"Count basic stats: {(datetime.datetime.now() - start).total_seconds():.2f}s" + ) pitching_stats = pitching_stats.fillna(0) - + # Bespoke counting stats start = datetime.datetime.now() + def get_fb_vl(row): - return all_plays[(all_plays.pitcher_id == row['key_retro']) & (all_plays.batted_ball_type == 'f') & (all_plays.batter_hand == 'l')].count()['event_type'].astype(int) + return ( + all_plays[ + (all_plays.pitcher_id == row["key_retro"]) + & (all_plays.batted_ball_type == "f") + & (all_plays.batter_hand == "l") + ] + .count()["event_type"] + .astype(int) + ) + def get_fb_vr(row): - return all_plays[(all_plays.pitcher_id == row['key_retro']) & (all_plays.batted_ball_type == 'f') & 
(all_plays.batter_hand == 'r')].count()['event_type'].astype(int) - + return ( + all_plays[ + (all_plays.pitcher_id == row["key_retro"]) + & (all_plays.batted_ball_type == "f") + & (all_plays.batter_hand == "r") + ] + .count()["event_type"] + .astype(int) + ) + def get_gb_vl(row): - return all_plays[(all_plays.pitcher_id == row['key_retro']) & (all_plays.batted_ball_type == 'G') & (all_plays.batter_hand == 'l')].count()['event_type'].astype(int) + return ( + all_plays[ + (all_plays.pitcher_id == row["key_retro"]) + & (all_plays.batted_ball_type == "G") + & (all_plays.batter_hand == "l") + ] + .count()["event_type"] + .astype(int) + ) + def get_gb_vr(row): - return all_plays[(all_plays.pitcher_id == row['key_retro']) & (all_plays.batted_ball_type == 'G') & (all_plays.batter_hand == 'r')].count()['event_type'].astype(int) - + return ( + all_plays[ + (all_plays.pitcher_id == row["key_retro"]) + & (all_plays.batted_ball_type == "G") + & (all_plays.batter_hand == "r") + ] + .count()["event_type"] + .astype(int) + ) + def get_ld_vl(row): - return all_plays[(all_plays.pitcher_id == row['key_retro']) & (all_plays.batted_ball_type == 'l') & (all_plays.pitcher_hand == 'l')].count()['event_type'].astype(int) + return ( + all_plays[ + (all_plays.pitcher_id == row["key_retro"]) + & (all_plays.batted_ball_type == "l") + & (all_plays.pitcher_hand == "l") + ] + .count()["event_type"] + .astype(int) + ) + def get_ld_vr(row): - return all_plays[(all_plays.pitcher_id == row['key_retro']) & (all_plays.batted_ball_type == 'l') & (all_plays.pitcher_hand == 'r')].count()['event_type'].astype(int) - - pitching_stats['FB_vL'] = pitching_stats.apply(get_fb_vl, axis=1) - pitching_stats['FB_vR'] = pitching_stats.apply(get_fb_vr, axis=1) + return ( + all_plays[ + (all_plays.pitcher_id == row["key_retro"]) + & (all_plays.batted_ball_type == "l") + & (all_plays.pitcher_hand == "r") + ] + .count()["event_type"] + .astype(int) + ) - pitching_stats['GB_vL'] = pitching_stats.apply(get_gb_vl, axis=1) 
- pitching_stats['GB_vR'] = pitching_stats.apply(get_gb_vr, axis=1) + pitching_stats["FB_vL"] = pitching_stats.apply(get_fb_vl, axis=1) + pitching_stats["FB_vR"] = pitching_stats.apply(get_fb_vr, axis=1) - pitching_stats['LD_vL'] = pitching_stats.apply(get_ld_vl, axis=1) - pitching_stats['LD_vR'] = pitching_stats.apply(get_ld_vr, axis=1) + pitching_stats["GB_vL"] = pitching_stats.apply(get_gb_vl, axis=1) + pitching_stats["GB_vR"] = pitching_stats.apply(get_gb_vr, axis=1) - pitching_stats['H_vL'] = pitching_stats['1B_vL'] + pitching_stats['2B_vL'] + pitching_stats['3B_vL'] + pitching_stats['HR_vL'] - pitching_stats['H_vR'] = pitching_stats['1B_vR'] + pitching_stats['2B_vR'] + pitching_stats['3B_vR'] + pitching_stats['HR_vR'] + pitching_stats["LD_vL"] = pitching_stats.apply(get_ld_vl, axis=1) + pitching_stats["LD_vR"] = pitching_stats.apply(get_ld_vr, axis=1) - print(f'Custom counting stats: {(datetime.datetime.now() - start).total_seconds():.2f}s') + pitching_stats["H_vL"] = ( + pitching_stats["1B_vL"] + + pitching_stats["2B_vL"] + + pitching_stats["3B_vL"] + + pitching_stats["HR_vL"] + ) + pitching_stats["H_vR"] = ( + pitching_stats["1B_vR"] + + pitching_stats["2B_vR"] + + pitching_stats["3B_vR"] + + pitching_stats["HR_vR"] + ) + + print( + f"Custom counting stats: {(datetime.datetime.now() - start).total_seconds():.2f}s" + ) # Calculated Fields """ Oppo%_vL & R """ start = datetime.datetime.now() - pitching_stats['AVG_vL'] = round(pitching_stats['H_vL'] / pitching_stats['AB_vL'], 5) - pitching_stats['AVG_vR'] = round(pitching_stats['H_vR'] / pitching_stats['AB_vR'], 5) + pitching_stats["AVG_vL"] = round( + pitching_stats["H_vL"] / pitching_stats["AB_vL"], 5 + ) + pitching_stats["AVG_vR"] = round( + pitching_stats["H_vR"] / pitching_stats["AB_vR"], 5 + ) - pitching_stats['OBP_vL'] = round((pitching_stats['H_vL'] + pitching_stats['BB_vL'] + pitching_stats['HBP_vL'] + pitching_stats['IBB_vL']) / pitching_stats['TBF_vL'], 5) - pitching_stats['OBP_vR'] = 
round((pitching_stats['H_vR'] + pitching_stats['BB_vR'] + pitching_stats['HBP_vR'] + pitching_stats['IBB_vR']) / pitching_stats['TBF_vR'], 5) + pitching_stats["OBP_vL"] = round( + ( + pitching_stats["H_vL"] + + pitching_stats["BB_vL"] + + pitching_stats["HBP_vL"] + + pitching_stats["IBB_vL"] + ) + / pitching_stats["TBF_vL"], + 5, + ) + pitching_stats["OBP_vR"] = round( + ( + pitching_stats["H_vR"] + + pitching_stats["BB_vR"] + + pitching_stats["HBP_vR"] + + pitching_stats["IBB_vR"] + ) + / pitching_stats["TBF_vR"], + 5, + ) - pitching_stats['SLG_vL'] = round((pitching_stats['1B_vL'] + pitching_stats['2B_vL'] * 2 + pitching_stats['3B_vL'] * 3 + pitching_stats['HR_vL'] * 4) / pitching_stats['AB_vL'], 5) - pitching_stats['SLG_vR'] = round((pitching_stats['1B_vR'] + pitching_stats['2B_vR'] * 2 + pitching_stats['3B_vR'] * 3 + pitching_stats['HR_vR'] * 4) / pitching_stats['AB_vR'], 5) + pitching_stats["SLG_vL"] = round( + ( + pitching_stats["1B_vL"] + + pitching_stats["2B_vL"] * 2 + + pitching_stats["3B_vL"] * 3 + + pitching_stats["HR_vL"] * 4 + ) + / pitching_stats["AB_vL"], + 5, + ) + pitching_stats["SLG_vR"] = round( + ( + pitching_stats["1B_vR"] + + pitching_stats["2B_vR"] * 2 + + pitching_stats["3B_vR"] * 3 + + pitching_stats["HR_vR"] * 4 + ) + / pitching_stats["AB_vR"], + 5, + ) - pitching_stats['HR/FB_vL'] = round(pitching_stats['HR_vL'] / pitching_stats['FB_vL'], 5) - pitching_stats['HR/FB_vR'] = round(pitching_stats['HR_vR'] / pitching_stats['FB_vR'], 5) + pitching_stats["HR/FB_vL"] = round( + pitching_stats["HR_vL"] / pitching_stats["FB_vL"], 5 + ) + pitching_stats["HR/FB_vR"] = round( + pitching_stats["HR_vR"] / pitching_stats["FB_vR"], 5 + ) - pitching_stats['Hard%_vL'] = round(0.2 + pitching_stats['SLG_vL'] - pitching_stats['AVG_vL'], 5) - pitching_stats['Hard%_vR'] = round(0.2 + pitching_stats['SLG_vR'] - pitching_stats['AVG_vR'], 5) + pitching_stats["Hard%_vL"] = round( + 0.2 + pitching_stats["SLG_vL"] - pitching_stats["AVG_vL"], 5 + ) + 
pitching_stats["Hard%_vR"] = round( + 0.2 + pitching_stats["SLG_vR"] - pitching_stats["AVG_vR"], 5 + ) - pitching_stats['Med%_vL'] = pitching_stats.apply(get_med_vL, axis=1) - pitching_stats['Med%_vR'] = pitching_stats.apply(get_med_vR, axis=1) + pitching_stats["Med%_vL"] = pitching_stats.apply(get_med_vL, axis=1) + pitching_stats["Med%_vR"] = pitching_stats.apply(get_med_vR, axis=1) - pitching_stats['Soft%_vL'] = round(1 - pitching_stats['Hard%_vL'] - pitching_stats['Med%_vL'], 5) - pitching_stats['Soft%_vR'] = round(1 - pitching_stats['Hard%_vR'] - pitching_stats['Med%_vR'], 5) + pitching_stats["Soft%_vL"] = round( + 1 - pitching_stats["Hard%_vL"] - pitching_stats["Med%_vL"], 5 + ) + pitching_stats["Soft%_vR"] = round( + 1 - pitching_stats["Hard%_vR"] - pitching_stats["Med%_vR"], 5 + ) - pitching_stats['FB%_vL'] = round(pitching_stats['FB_vL'] / (pitching_stats['FB_vL'] + pitching_stats['GB_vL'] + pitching_stats['LD_vL']), 5) - pitching_stats['FB%_vR'] = round(pitching_stats['FB_vR'] / (pitching_stats['FB_vR'] + pitching_stats['GB_vR'] + pitching_stats['LD_vR']), 5) + pitching_stats["FB%_vL"] = round( + pitching_stats["FB_vL"] + / (pitching_stats["FB_vL"] + pitching_stats["GB_vL"] + pitching_stats["LD_vL"]), + 5, + ) + pitching_stats["FB%_vR"] = round( + pitching_stats["FB_vR"] + / (pitching_stats["FB_vR"] + pitching_stats["GB_vR"] + pitching_stats["LD_vR"]), + 5, + ) - pitching_stats['GB%_vL'] = round(pitching_stats['GB_vL'] / (pitching_stats['FB_vL'] + pitching_stats['GB_vL'] + pitching_stats['LD_vL']), 5) - pitching_stats['GB%_vR'] = round(pitching_stats['GB_vR'] / (pitching_stats['FB_vR'] + pitching_stats['GB_vR'] + pitching_stats['LD_vR']), 5) + pitching_stats["GB%_vL"] = round( + pitching_stats["GB_vL"] + / (pitching_stats["FB_vL"] + pitching_stats["GB_vL"] + pitching_stats["LD_vL"]), + 5, + ) + pitching_stats["GB%_vR"] = round( + pitching_stats["GB_vR"] + / (pitching_stats["FB_vR"] + pitching_stats["GB_vR"] + pitching_stats["LD_vR"]), + 5, + ) def 
get_oppo_vl(row): - count = all_plays[(all_plays.pitcher_id == row['key_retro']) & (all_plays.batter_hand == 'l') & (all_plays.hit_location.str.contains('5|7'))].count()['event_type'].astype(int) - denom = all_plays[(all_plays.pitcher_id == row['key_retro']) & (all_plays.batter_hand == 'l') & (all_plays.batter_event == 't')].count()['event_type'].astype(int) - return round(count / denom, 5) - def get_oppo_vr(row): - count = all_plays[(all_plays.pitcher_id == row['key_retro']) & (all_plays.batter_hand == 'r') & (all_plays.hit_location.str.contains('3|9'))].count()['event_type'].astype(int) - denom = all_plays[(all_plays.pitcher_id == row['key_retro']) & (all_plays.batter_hand == 'r') & (all_plays.batter_event == 't')].count()['event_type'].astype(int) + count = ( + all_plays[ + (all_plays.pitcher_id == row["key_retro"]) + & (all_plays.batter_hand == "l") + & (all_plays.hit_location.str.contains("5|7")) + ] + .count()["event_type"] + .astype(int) + ) + denom = ( + all_plays[ + (all_plays.pitcher_id == row["key_retro"]) + & (all_plays.batter_hand == "l") + & (all_plays.batter_event == "t") + ] + .count()["event_type"] + .astype(int) + ) return round(count / denom, 5) - pitching_stats['Oppo%_vL'] = pitching_stats.apply(get_oppo_vl, axis=1) - pitching_stats['Oppo%_vR'] = pitching_stats.apply(get_oppo_vr, axis=1) + def get_oppo_vr(row): + count = ( + all_plays[ + (all_plays.pitcher_id == row["key_retro"]) + & (all_plays.batter_hand == "r") + & (all_plays.hit_location.str.contains("3|9")) + ] + .count()["event_type"] + .astype(int) + ) + denom = ( + all_plays[ + (all_plays.pitcher_id == row["key_retro"]) + & (all_plays.batter_hand == "r") + & (all_plays.batter_event == "t") + ] + .count()["event_type"] + .astype(int) + ) + return round(count / denom, 5) + + pitching_stats["Oppo%_vL"] = pitching_stats.apply(get_oppo_vl, axis=1) + pitching_stats["Oppo%_vR"] = pitching_stats.apply(get_oppo_vr, axis=1) pitching_stats = pitching_stats.fillna(0) - print(f'Calculated fields: 
{(datetime.datetime.now() - start).total_seconds():.2f}s') + print( + f"Calculated fields: {(datetime.datetime.now() - start).total_seconds():.2f}s" + ) return pitching_stats @@ -693,74 +1372,94 @@ def get_pitching_stats_by_date(retro_file_path, start_date: int, end_date: int) def calc_batting_cards(bs: pd.DataFrame, season_pct: float) -> pd.DataFrame: def create_batting_card(row): steal_data = cba.stealing( - chances=int(row['SBO']), - sb2s=int(row['SB2']), - cs2s=int(row['CS2']), - sb3s=int(row['SB3']), - cs3s=int(row['CS3']), - season_pct=1.0 + chances=int(row["SBO"]), + sb2s=int(row["SB2"]), + cs2s=int(row["CS2"]), + sb3s=int(row["SB3"]), + cs3s=int(row["CS3"]), + season_pct=1.0, + ) + y = pd.DataFrame( + { + "key_bbref": [row["key_bbref"]], + "steal_low": [steal_data[0]], + "steal_high": [steal_data[1]], + "steal_auto": [steal_data[2]], + "steal_jump": [steal_data[3]], + "hit_and_run": [ + cba.hit_and_run( + row["AB_vL"], + row["AB_vR"], + row["H_vL"], + row["H_vR"], + row["HR_vL"], + row["HR_vR"], + row["SO_vL"], + row["SO_vR"], + ) + ], + "bunt": [cba.bunting(row["Bunts"], season_pct)], + "running": [cba.running(row["XBT%"])], + "hand": [row["bat_hand"]], + } ) - y = pd.DataFrame({ - 'key_bbref': [row['key_bbref']], - 'steal_low': [steal_data[0]], - 'steal_high': [steal_data[1]], - 'steal_auto': [steal_data[2]], - 'steal_jump': [steal_data[3]], - 'hit_and_run': [cba.hit_and_run( - row['AB_vL'], row['AB_vR'], row['H_vL'], row['H_vR'], - row['HR_vL'], row['HR_vR'], row['SO_vL'], row['SO_vR'] - )], - 'bunt': [cba.bunting(row['Bunts'], season_pct)], - 'running': [cba.running(row['XBT%'])], - 'hand': [row['bat_hand']], - }) return y.loc[0] all_cards = bs.apply(create_batting_card, axis=1) - all_cards = all_cards.set_index('key_bbref') + all_cards = all_cards.set_index("key_bbref") return all_cards def calc_pitching_cards(ps: pd.DataFrame, season_pct: float) -> pd.DataFrame: def create_pitching_card(row): - pow_data = cde.pow_ratings(row['IP'], row['GS'], 
row['G']) - y = pd.DataFrame({ - "key_bbref": [row['key_bbref']], - "balk": [cpi.balks(row['BK'], row['IP'], season_pct)], - "wild_pitch": [cpi.wild_pitches(row['WP'], row['IP'], season_pct)], - "hold": [cde.hold_pitcher(str(row['caught_stealing_perc']), int(row['pickoffs']), season_pct)], - "starter_rating": [pow_data[0]], - "relief_rating": [pow_data[1]], - "closer_rating": [cpi.closer_rating(int(row['GF']), int(row['SV']), int(row['G']))], - "batting": [f'#1W{row["pitch_hand"].upper()}-C'] - }) + pow_data = cde.pow_ratings(row["IP"], row["GS"], row["G"]) + y = pd.DataFrame( + { + "key_bbref": [row["key_bbref"]], + "balk": [cpi.balks(row["BK"], row["IP"], season_pct)], + "wild_pitch": [cpi.wild_pitches(row["WP"], row["IP"], season_pct)], + "hold": [ + cde.hold_pitcher( + str(row["caught_stealing_perc"]), + int(row["pickoffs"]), + season_pct, + ) + ], + "starter_rating": [pow_data[0]], + "relief_rating": [pow_data[1]], + "closer_rating": [ + cpi.closer_rating(int(row["GF"]), int(row["SV"]), int(row["G"])) + ], + "batting": [f'#1W{row["pitch_hand"].upper()}-C'], + } + ) return y.loc[0] - + all_cards = ps.apply(create_pitching_card, axis=1) - all_cards = all_cards.set_index('key_bbref') + all_cards = all_cards.set_index("key_bbref") return all_cards def calc_batter_ratings(bs: pd.DataFrame) -> pd.DataFrame: def create_batting_rating(row): - if row['key_bbref'] == 'galaran01': + if row["key_bbref"] == "galaran01": pass ratings = cba.get_batter_ratings(row) - ops_vl = ratings[0]['obp'] + ratings[0]['slg'] - ops_vr = ratings[1]['obp'] + ratings[1]['slg'] + ops_vl = ratings[0]["obp"] + ratings[0]["slg"] + ops_vr = ratings[1]["obp"] + ratings[1]["slg"] total_ops = (ops_vl + ops_vr + min(ops_vr, ops_vl)) / 3 - + def calc_cost(total_ops, base_cost, base_ops, max_delta) -> int: delta = ((total_ops - base_ops) / 0.1) * 2 if delta < 1: delta = (max_delta * (1 - (total_ops / base_ops))) * -0.1 - + final_cost = base_cost + (max_delta * delta) return round(final_cost) - + if 
total_ops >= 1.2: rarity_id = 99 cost = calc_cost(total_ops, base_cost=2400, base_ops=1.215, max_delta=810) @@ -780,31 +1479,33 @@ def calc_batter_ratings(bs: pd.DataFrame) -> pd.DataFrame: rarity_id = 5 cost = calc_cost(total_ops, base_cost=10, base_ops=0.61, max_delta=8) - x = pd.DataFrame({ - 'key_bbref': [row['key_bbref']], - 'ratings_vL': [ratings[0]], - 'ratings_vR': [ratings[1]], - 'ops_vL': ops_vl, - 'ops_vR': ops_vr, - 'total_ops': total_ops, - 'rarity_id': rarity_id, - 'cost': cost - }) + x = pd.DataFrame( + { + "key_bbref": [row["key_bbref"]], + "ratings_vL": [ratings[0]], + "ratings_vR": [ratings[1]], + "ops_vL": ops_vl, + "ops_vR": ops_vr, + "total_ops": total_ops, + "rarity_id": rarity_id, + "cost": cost, + } + ) return x.loc[0] all_ratings = bs.apply(create_batting_rating, axis=1) - all_ratings = all_ratings.set_index('key_bbref') + all_ratings = all_ratings.set_index("key_bbref") return all_ratings def calc_pitcher_ratings(ps: pd.DataFrame) -> pd.DataFrame: def create_pitching_rating(row): - row['pitchingcard_id'] = row['key_fangraphs'] - row['pitch_hand'] = row['pitch_hand'].upper() + row["pitchingcard_id"] = row["key_fangraphs"] + row["pitch_hand"] = row["pitch_hand"].upper() ratings = cpi.get_pitcher_ratings(row) - ops_vl = ratings[0]['obp'] + ratings[0]['slg'] - ops_vr = ratings[1]['obp'] + ratings[1]['slg'] + ops_vl = ratings[0]["obp"] + ratings[0]["slg"] + ops_vr = ratings[1]["obp"] + ratings[1]["slg"] total_ops = (ops_vl + ops_vr + min(ops_vr, ops_vl)) / 3 def calc_cost(total_ops, base_cost, base_ops, max_delta) -> int: @@ -815,8 +1516,8 @@ def calc_pitcher_ratings(ps: pd.DataFrame) -> pd.DataFrame: final_cost = base_cost + (max_delta * delta) return round(final_cost) - - if row['starter_rating'] > 3: + + if row["starter_rating"] > 3: if total_ops <= 0.4: rarity_id = 99 cost = calc_cost(total_ops, 2400, 0.38, 810) @@ -854,285 +1555,357 @@ def calc_pitcher_ratings(ps: pd.DataFrame) -> pd.DataFrame: else: rarity_id = 5 cost = 
calc_cost(total_ops, 10, 0.7, 8) - - x = pd.DataFrame({ - 'key_bbref': [row['key_bbref']], - 'ratings_vL': [ratings[0]], - 'ratings_vR': [ratings[1]], - 'ops_vL': ops_vl, - 'ops_vR': ops_vr, - 'total_ops': total_ops, - 'rarity_id': rarity_id, - 'cost': cost - }) + + x = pd.DataFrame( + { + "key_bbref": [row["key_bbref"]], + "ratings_vL": [ratings[0]], + "ratings_vR": [ratings[1]], + "ops_vL": ops_vl, + "ops_vR": ops_vr, + "total_ops": total_ops, + "rarity_id": rarity_id, + "cost": cost, + } + ) return x.loc[0] all_ratings = ps.apply(create_pitching_rating, axis=1) - all_ratings = all_ratings.set_index('key_bbref') + all_ratings = all_ratings.set_index("key_bbref") return all_ratings def calc_positions(bs: pd.DataFrame) -> pd.DataFrame: - df_c = pd.read_csv(f'{DATA_INPUT_FILE_PATH}defense_c.csv').set_index('key_bbref') - df_1b = pd.read_csv(f'{DATA_INPUT_FILE_PATH}defense_1b.csv').set_index('key_bbref') - df_2b = pd.read_csv(f'{DATA_INPUT_FILE_PATH}defense_2b.csv').set_index('key_bbref') - df_3b = pd.read_csv(f'{DATA_INPUT_FILE_PATH}defense_3b.csv').set_index('key_bbref') - df_ss = pd.read_csv(f'{DATA_INPUT_FILE_PATH}defense_ss.csv').set_index('key_bbref') - df_lf = pd.read_csv(f'{DATA_INPUT_FILE_PATH}defense_lf.csv').set_index('key_bbref') - df_cf = pd.read_csv(f'{DATA_INPUT_FILE_PATH}defense_cf.csv').set_index('key_bbref') - df_rf = pd.read_csv(f'{DATA_INPUT_FILE_PATH}defense_rf.csv').set_index('key_bbref') - df_of = pd.read_csv(f'{DATA_INPUT_FILE_PATH}defense_of.csv').set_index('key_bbref') + df_c = pd.read_csv(f"{DATA_INPUT_FILE_PATH}defense_c.csv").set_index("key_bbref") + df_1b = pd.read_csv(f"{DATA_INPUT_FILE_PATH}defense_1b.csv").set_index("key_bbref") + df_2b = pd.read_csv(f"{DATA_INPUT_FILE_PATH}defense_2b.csv").set_index("key_bbref") + df_3b = pd.read_csv(f"{DATA_INPUT_FILE_PATH}defense_3b.csv").set_index("key_bbref") + df_ss = pd.read_csv(f"{DATA_INPUT_FILE_PATH}defense_ss.csv").set_index("key_bbref") + df_lf = 
pd.read_csv(f"{DATA_INPUT_FILE_PATH}defense_lf.csv").set_index("key_bbref") + df_cf = pd.read_csv(f"{DATA_INPUT_FILE_PATH}defense_cf.csv").set_index("key_bbref") + df_rf = pd.read_csv(f"{DATA_INPUT_FILE_PATH}defense_rf.csv").set_index("key_bbref") + df_of = pd.read_csv(f"{DATA_INPUT_FILE_PATH}defense_of.csv").set_index("key_bbref") season_pct = 1.0 all_pos = [] def process_pos(row): no_data = True - for pos_df, position in [(df_1b, '1b'), (df_2b, '2b'), (df_3b, '3b'), (df_ss, 'ss')]: - if row['key_bbref'] in pos_df.index: - logger.info(f'Running {position} stats for {row["use_name"]} {row["last_name"]}') + for pos_df, position in [ + (df_1b, "1b"), + (df_2b, "2b"), + (df_3b, "3b"), + (df_ss, "ss"), + ]: + if row["key_bbref"] in pos_df.index: + logger.info( + f'Running {position} stats for {row["use_name"]} {row["last_name"]}' + ) try: - if 'bis_runs_total' in pos_df.columns: - average_range = (int(pos_df.at[row["key_bbref"], 'tz_runs_total']) + - int(pos_df.at[row["key_bbref"], 'bis_runs_total']) + - min( - int(pos_df.at[row["key_bbref"], 'tz_runs_total']), - int(pos_df.at[row["key_bbref"], 'bis_runs_total']) - )) / 3 - else: - average_range = pos_df.at[row["key_bbref"], 'tz_runs_total'] - - if float(pos_df.at[row["key_bbref"], 'Inn_def']) >= 10.0: - all_pos.append({ - "key_bbref": row['key_bbref'], - "position": position.upper(), - "innings": float(pos_df.at[row["key_bbref"], 'Inn_def']), - "range": cde.get_if_range( - pos_code=position, - tz_runs=round(average_range), - r_dp=0, - season_pct=season_pct - ), - "error": cde.get_any_error( - pos_code=position, - errors=int(pos_df.at[row["key_bbref"], 'E_def']), - chances=int(pos_df.at[row["key_bbref"], 'chances']), - season_pct=season_pct + if "bis_runs_total" in pos_df.columns: + average_range = ( + int(pos_df.at[row["key_bbref"], "tz_runs_total"]) + + int(pos_df.at[row["key_bbref"], "bis_runs_total"]) + + min( + int(pos_df.at[row["key_bbref"], "tz_runs_total"]), + int(pos_df.at[row["key_bbref"], "bis_runs_total"]), 
) - }) + ) / 3 + else: + average_range = pos_df.at[row["key_bbref"], "tz_runs_total"] + + if float(pos_df.at[row["key_bbref"], "Inn_def"]) >= 10.0: + all_pos.append( + { + "key_bbref": row["key_bbref"], + "position": position.upper(), + "innings": float( + pos_df.at[row["key_bbref"], "Inn_def"] + ), + "range": cde.get_if_range( + pos_code=position, + tz_runs=round(average_range), + r_dp=0, + season_pct=season_pct, + ), + "error": cde.get_any_error( + pos_code=position, + errors=int(pos_df.at[row["key_bbref"], "E_def"]), + chances=int(pos_df.at[row["key_bbref"], "chances"]), + season_pct=season_pct, + ), + } + ) no_data = False except Exception as e: - logger.info(f'Infield position failed: {e}') + logger.info(f"Infield position failed: {e}") of_arms = [] of_payloads = [] - for pos_df, position in [(df_lf, 'lf'), (df_cf, 'cf'), (df_rf, 'rf')]: + for pos_df, position in [(df_lf, "lf"), (df_cf, "cf"), (df_rf, "rf")]: if row["key_bbref"] in pos_df.index: try: - if 'bis_runs_total' in pos_df.columns: - average_range = (int(pos_df.at[row["key_bbref"], 'tz_runs_total']) + - int(pos_df.at[row["key_bbref"], 'bis_runs_total']) + - min( - int(pos_df.at[row["key_bbref"], 'tz_runs_total']), - int(pos_df.at[row["key_bbref"], 'bis_runs_total']) - )) / 3 - else: - average_range = pos_df.at[row["key_bbref"], 'tz_runs_total'] - - if float(pos_df.at[row["key_bbref"], 'Inn_def']) >= 10.0: - of_payloads.append({ - "key_bbref": row['key_bbref'], - "position": position.upper(), - "innings": float(pos_df.at[row["key_bbref"], 'Inn_def']), - "range": cde.get_of_range( - pos_code=position, - tz_runs=round(average_range), - season_pct=season_pct + if "bis_runs_total" in pos_df.columns: + average_range = ( + int(pos_df.at[row["key_bbref"], "tz_runs_total"]) + + int(pos_df.at[row["key_bbref"], "bis_runs_total"]) + + min( + int(pos_df.at[row["key_bbref"], "tz_runs_total"]), + int(pos_df.at[row["key_bbref"], "bis_runs_total"]), ) - }) - of_run_rating = 'bis_runs_outfield' if 'bis_runs_outfield' 
in pos_df.columns else 'tz_runs_total' + ) / 3 + else: + average_range = pos_df.at[row["key_bbref"], "tz_runs_total"] + + if float(pos_df.at[row["key_bbref"], "Inn_def"]) >= 10.0: + of_payloads.append( + { + "key_bbref": row["key_bbref"], + "position": position.upper(), + "innings": float( + pos_df.at[row["key_bbref"], "Inn_def"] + ), + "range": cde.get_of_range( + pos_code=position, + tz_runs=round(average_range), + season_pct=season_pct, + ), + } + ) + of_run_rating = ( + "bis_runs_outfield" + if "bis_runs_outfield" in pos_df.columns + else "tz_runs_total" + ) of_arms.append(int(pos_df.at[row["key_bbref"], of_run_rating])) no_data = False except Exception as e: - logger.info(f'Outfield position failed: {e}') + logger.info(f"Outfield position failed: {e}") - if row["key_bbref"] in df_of.index and len(of_arms) > 0 and len(of_payloads) > 0: + if ( + row["key_bbref"] in df_of.index + and len(of_arms) > 0 + and len(of_payloads) > 0 + ): try: error_rating = cde.get_any_error( pos_code=position, - errors=int(df_of.at[row["key_bbref"], 'E_def']), - chances=int(df_of.at[row["key_bbref"], 'chances']), - season_pct=season_pct + errors=int(df_of.at[row["key_bbref"], "E_def"]), + chances=int(df_of.at[row["key_bbref"], "chances"]), + season_pct=season_pct, ) arm_rating = cde.arm_outfield(of_arms) for f in of_payloads: - f['error'] = error_rating - f['arm'] = arm_rating + f["error"] = error_rating + f["arm"] = arm_rating all_pos.append(f) no_data = False except Exception as e: - logger.info(f'Outfield position failed: {e}') + logger.info(f"Outfield position failed: {e}") if row["key_bbref"] in df_c.index: try: - run_rating = 'bis_runs_catcher_sb' if 'bis_runs_catcher_sb' in df_c else 'tz_runs_catcher' - - if df_c.at[row["key_bbref"], 'SB'] + df_c.at[row["key_bbref"], 'CS'] == 0: + run_rating = ( + "bis_runs_catcher_sb" + if "bis_runs_catcher_sb" in df_c + else "tz_runs_catcher" + ) + + if ( + df_c.at[row["key_bbref"], "SB"] + df_c.at[row["key_bbref"], "CS"] + == 0 + ): 
arm_rating = 3 else: arm_rating = cde.arm_catcher( - cs_pct=df_c.at[row["key_bbref"], 'caught_stealing_perc'], + cs_pct=df_c.at[row["key_bbref"], "caught_stealing_perc"], raa=int(df_c.at[row["key_bbref"], run_rating]), - season_pct=season_pct + season_pct=season_pct, ) - if float(df_c.at[row["key_bbref"], 'Inn_def']) >= 10.0: - all_pos.append({ - "key_bbref": row['key_bbref'], - "position": 'C', - "innings": float(df_c.at[row["key_bbref"], 'Inn_def']), - "range": cde.range_catcher( - rs_value=int(df_c.at[row["key_bbref"], 'tz_runs_catcher']), - season_pct=season_pct - ), - "error": cde.get_any_error( - pos_code='c', - errors=int(df_c.at[row["key_bbref"], 'E_def']), - chances=int(df_c.at[row["key_bbref"], 'chances']), - season_pct=season_pct - ), - "arm": arm_rating, - "pb": cde.pb_catcher( - pb=int(df_c.at[row["key_bbref"], 'PB']), - innings=int(float(df_c.at[row["key_bbref"], 'Inn_def'])), - season_pct=season_pct - ), - "overthrow": cde.ot_catcher( - errors=int(df_c.at[row["key_bbref"], 'E_def']), - chances=int(df_c.at[row["key_bbref"], 'chances']), - season_pct=season_pct - ) - }) + if float(df_c.at[row["key_bbref"], "Inn_def"]) >= 10.0: + all_pos.append( + { + "key_bbref": row["key_bbref"], + "position": "C", + "innings": float(df_c.at[row["key_bbref"], "Inn_def"]), + "range": cde.range_catcher( + rs_value=int( + df_c.at[row["key_bbref"], "tz_runs_catcher"] + ), + season_pct=season_pct, + ), + "error": cde.get_any_error( + pos_code="c", + errors=int(df_c.at[row["key_bbref"], "E_def"]), + chances=int(df_c.at[row["key_bbref"], "chances"]), + season_pct=season_pct, + ), + "arm": arm_rating, + "pb": cde.pb_catcher( + pb=int(df_c.at[row["key_bbref"], "PB"]), + innings=int( + float(df_c.at[row["key_bbref"], "Inn_def"]) + ), + season_pct=season_pct, + ), + "overthrow": cde.ot_catcher( + errors=int(df_c.at[row["key_bbref"], "E_def"]), + chances=int(df_c.at[row["key_bbref"], "chances"]), + season_pct=season_pct, + ), + } + ) no_data = False except Exception as e: - 
logger.info(f'Catcher position failed: {e}') + logger.info(f"Catcher position failed: {e}") if no_data: - all_pos.append({ - "key_bbref": row['key_bbref'], - "position": 'DH', - "innings": row['PA_vL'] + row['PA_vR'] - }) + all_pos.append( + { + "key_bbref": row["key_bbref"], + "position": "DH", + "innings": row["PA_vL"] + row["PA_vR"], + } + ) bs.apply(process_pos, axis=1) pos_df = pd.DataFrame(all_pos) - pos_df = pos_df.set_index('key_bbref') + pos_df = pos_df.set_index("key_bbref") return pos_df def calc_pitcher_defense(ps: pd.DataFrame) -> pd.DataFrame: - df_p = pd.read_csv(f'{DATA_INPUT_FILE_PATH}defense_p.csv').set_index('key_bbref') + df_p = pd.read_csv(f"{DATA_INPUT_FILE_PATH}defense_p.csv").set_index("key_bbref") all_pos = [] def process_def(row): - if 'bis_runs_total' in df_p: - range_val = cde.range_pitcher(rs_value=int(df_p.at[row['key_bbref'], 'bis_runs_total'])) - else: - range_val = cde.range_pitcher(rf_per9_value=df_p.at[row['key_bbref'], 'range_factor_per_nine']) - - if row['key_bbref'] in df_p.index: - all_pos.append({ - 'key_bbref': row['key_bbref'], - 'position': 'P', - 'innings': float(df_p.at[row['key_bbref'], 'Inn_def']), - 'range': range_val, - 'error': cde.get_any_error( - pos_code='p', - errors=int(df_p.at[row["key_bbref"], 'E_def']), - chances=int(df_p.at[row["key_bbref"], 'chances']), - season_pct=1.0 - ) - }) + if "bis_runs_total" in df_p: + range_val = cde.range_pitcher( + rs_value=int(df_p.at[row["key_bbref"], "bis_runs_total"]) + ) else: - all_pos.append({ - "key_bbref": int(row['key_bbref']), - "position": 'P', - "innings": 1, - "range": 5, - "error": 51 - }) + range_val = cde.range_pitcher( + rf_per9_value=df_p.at[row["key_bbref"], "range_factor_per_nine"] + ) + + if row["key_bbref"] in df_p.index: + all_pos.append( + { + "key_bbref": row["key_bbref"], + "position": "P", + "innings": float(df_p.at[row["key_bbref"], "Inn_def"]), + "range": range_val, + "error": cde.get_any_error( + pos_code="p", + 
errors=int(df_p.at[row["key_bbref"], "E_def"]), + chances=int(df_p.at[row["key_bbref"], "chances"]), + season_pct=1.0, + ), + } + ) + else: + all_pos.append( + { + "key_bbref": int(row["key_bbref"]), + "position": "P", + "innings": 1, + "range": 5, + "error": 51, + } + ) ps.apply(process_def, axis=1) pos_df = pd.DataFrame(all_pos) - pos_df = pos_df.set_index('key_bbref') + pos_df = pos_df.set_index("key_bbref") return pos_df -async def get_or_post_players(bstat_df: pd.DataFrame = None, bat_rat_df: pd.DataFrame = None, def_rat_df: pd.DataFrame = None, pstat_df: pd.DataFrame = None, pit_rat_df: pd.DataFrame = None) -> pd.DataFrame: +async def get_or_post_players( + bstat_df: pd.DataFrame = None, + bat_rat_df: pd.DataFrame = None, + def_rat_df: pd.DataFrame = None, + pstat_df: pd.DataFrame = None, + pit_rat_df: pd.DataFrame = None, +) -> pd.DataFrame: all_players = [] - player_deltas = [['player_id', 'player_name', 'old-cost', 'new-cost', 'old-rarity', 'new-rarity']] - new_players = [['player_id', 'player_name', 'cost', 'rarity', 'pos1']] + player_deltas = [ + ["player_id", "player_name", "old-cost", "new-cost", "old-rarity", "new-rarity"] + ] + new_players = [["player_id", "player_name", "cost", "rarity", "pos1"]] async def player_search(bbref_id: str): - p_query = await db_get('players', params=[('bbref_id', bbref_id), ('cardset_id', CARDSET_ID)]) - if p_query['count'] > 0: - return p_query['players'][0] + p_query = await db_get( + "players", params=[("bbref_id", bbref_id), ("cardset_id", CARDSET_ID)] + ) + if p_query["count"] > 0: + return p_query["players"][0] else: return None - + async def mlb_search_or_post(retro_id: int): - mlb_query = await db_get('mlbplayers', params=[('key_retro', retro_id)]) - if mlb_query['count'] > 0: - return mlb_query['players'][0] + mlb_query = await db_get("mlbplayers", params=[("key_retro", retro_id)]) + if mlb_query["count"] > 0: + return mlb_query["players"][0] else: mlb_player = await db_post( - 'mlbplayers/one', + 
"mlbplayers/one", payload={ - 'first_name': row['use_name'], - 'last_name': row['last_name'], - 'key_mlbam': row['key_mlbam'], - 'key_fangraphs': row['key_fangraphs'], - 'key_bbref': row['key_bbref'], - 'key_retro': row['key_retro'] - } + "first_name": row["use_name"], + "last_name": row["last_name"], + "key_mlbam": row["key_mlbam"], + "key_fangraphs": row["key_fangraphs"], + "key_bbref": row["key_bbref"], + "key_retro": row["key_retro"], + "offense_col": int( + row.get( + "offense_col", + hash_offense_col(f"{row['use_name']} {row['last_name']}"), + ) + ), + }, ) return mlb_player def new_player_payload(row, ratings_df: pd.DataFrame): return { - 'p_name': f'{row["use_name"]} {row["last_name"]}', - 'cost': f'{ratings_df.loc[row['key_bbref']]["cost"]}', - 'image': f'change-me', - 'mlbclub': CLUB_LIST[row['Tm']], - 'franchise': FRANCHISE_LIST[row['Tm']], - 'cardset_id': CARDSET_ID, - 'set_num': int(float(row['key_fangraphs'])), - 'rarity_id': int(ratings_df.loc[row['key_bbref']]['rarity_id']), - 'description': PLAYER_DESCRIPTION, - 'bbref_id': row['key_bbref'], - 'fangr_id': int(float(row['key_fangraphs'])), - 'mlbplayer_id': mlb_player['id'] + "p_name": f'{row["use_name"]} {row["last_name"]}', + "cost": f'{ratings_df.loc[row['key_bbref']]["cost"]}', + "image": f"change-me", + "mlbclub": CLUB_LIST[row["Tm"]], + "franchise": FRANCHISE_LIST[row["Tm"]], + "cardset_id": CARDSET_ID, + "set_num": int(float(row["key_fangraphs"])), + "rarity_id": int(ratings_df.loc[row["key_bbref"]]["rarity_id"]), + "description": PLAYER_DESCRIPTION, + "bbref_id": row["key_bbref"], + "fangr_id": int(float(row["key_fangraphs"])), + "mlbplayer_id": mlb_player["id"], } def get_player_record_pos(def_rat_df: pd.DataFrame, row) -> list[str]: all_pos = [None, None, None, None, None, None, None, None] try: count = 0 - all_pos_df = def_rat_df.loc[row['key_bbref']].sort_values(by='innings', ascending=False) + all_pos_df = def_rat_df.loc[row["key_bbref"]].sort_values( + by="innings", ascending=False + ) 
for index, pos_row in all_pos_df.iterrows(): all_pos[count] = pos_row.position count += 1 except KeyError: - logger.info(f'No positions found for {row['use_name']} {row['last_name']}') - all_pos[0] = 'DH' + logger.info(f"No positions found for {row['use_name']} {row['last_name']}") + all_pos[0] = "DH" except TypeError: - logger.info(f'Only one position found for {row['use_name']} {row['last_name']}') - all_pos[0] = def_rat_df.loc[row['key_bbref']].position - + logger.info( + f"Only one position found for {row['use_name']} {row['last_name']}" + ) + all_pos[0] = def_rat_df.loc[row["key_bbref"]].position + return all_pos dev_count = 0 @@ -1141,185 +1914,257 @@ async def get_or_post_players(bstat_df: pd.DataFrame = None, bat_rat_df: pd.Data if dev_count < 0: break - p_search = await player_search(row['key_bbref']) + p_search = await player_search(row["key_bbref"]) if p_search is not None: - if 'id' in p_search: - player_id = p_search['id'] + if "id" in p_search: + player_id = p_search["id"] else: - player_id = p_search['player_id'] + player_id = p_search["player_id"] # Update positions for existing players too all_pos = get_player_record_pos(def_rat_df, row) patch_params = [ - ('cost', f'{bat_rat_df.loc[row['key_bbref']]["cost"]}'), - ('rarity_id', int(bat_rat_df.loc[row['key_bbref']]['rarity_id'])), - ('image', f'{CARD_BASE_URL}{player_id}/battingcard{urllib.parse.quote("?d=")}{RELEASE_DIRECTORY}') + ("cost", f'{bat_rat_df.loc[row['key_bbref']]["cost"]}'), + ("rarity_id", int(bat_rat_df.loc[row["key_bbref"]]["rarity_id"])), + ( + "image", + f'{CARD_BASE_URL}{player_id}/battingcard{urllib.parse.quote("?d=")}{RELEASE_DIRECTORY}', + ), ] # Add position updates - set all 8 slots to clear any old positions for x in enumerate(all_pos): - patch_params.append((f'pos_{x[0] + 1}', x[1])) + patch_params.append((f"pos_{x[0] + 1}", x[1])) - new_player = await db_patch('players', object_id=player_id, params=patch_params) - new_player['bbref_id'] = row['key_bbref'] + new_player = 
await db_patch( + "players", object_id=player_id, params=patch_params + ) + new_player["bbref_id"] = row["key_bbref"] all_players.append(new_player) - player_deltas.append([ - new_player['player_id'], new_player['p_name'], p_search['cost'], new_player['cost'], p_search['rarity']['name'], new_player['rarity']['name'] - ]) + player_deltas.append( + [ + new_player["player_id"], + new_player["p_name"], + p_search["cost"], + new_player["cost"], + p_search["rarity"]["name"], + new_player["rarity"]["name"], + ] + ) else: - mlb_player = await mlb_search_or_post(row['key_retro']) - + mlb_player = await mlb_search_or_post(row["key_retro"]) + player_payload = new_player_payload(row, bat_rat_df) - - all_pos = get_player_record_pos(def_rat_df, row) + + all_pos = get_player_record_pos(def_rat_df, row) for x in enumerate(all_pos): - player_payload[f'pos_{x[0] + 1}'] = x[1] + player_payload[f"pos_{x[0] + 1}"] = x[1] - new_player = await db_post('players', payload=player_payload) + new_player = await db_post("players", payload=player_payload) - if 'id' in new_player: - player_id = new_player['id'] + if "id" in new_player: + player_id = new_player["id"] else: - player_id = new_player['player_id'] + player_id = new_player["player_id"] - new_player = await db_patch('players', object_id=player_id, params=[('image', f'{CARD_BASE_URL}{player_id}/battingcard{urllib.parse.quote("?d=")}{RELEASE_DIRECTORY}')]) - if 'paperdex' in new_player: - del new_player['paperdex'] + new_player = await db_patch( + "players", + object_id=player_id, + params=[ + ( + "image", + f'{CARD_BASE_URL}{player_id}/battingcard{urllib.parse.quote("?d=")}{RELEASE_DIRECTORY}', + ) + ], + ) + if "paperdex" in new_player: + del new_player["paperdex"] # all_bbref_ids.append(row['key_bbref']) # all_player_ids.append(player_id) - new_player['bbref_id'] = row['key_bbref'] + new_player["bbref_id"] = row["key_bbref"] all_players.append(new_player) - new_players.append([new_player['player_id'], new_player['p_name'], 
new_player['cost'], new_player['rarity']['name'], new_player['pos_1']]) + new_players.append( + [ + new_player["player_id"], + new_player["p_name"], + new_player["cost"], + new_player["rarity"]["name"], + new_player["pos_1"], + ] + ) dev_count += 1 elif pstat_df is not None and pit_rat_df is not None and def_rat_df is not None: - starter_index = pstat_df.columns.get_loc('starter_rating') - closer_index = pstat_df.columns.get_loc('closer_rating') + starter_index = pstat_df.columns.get_loc("starter_rating") + closer_index = pstat_df.columns.get_loc("closer_rating") for index, row in pstat_df.iterrows(): if dev_count < 0: break - p_search = await player_search(row['key_bbref']) + p_search = await player_search(row["key_bbref"]) if p_search is not None: - if 'id' in p_search: - player_id = p_search['id'] + if "id" in p_search: + player_id = p_search["id"] else: - player_id = p_search['player_id'] + player_id = p_search["player_id"] # Determine pitcher positions based on ratings patch_params = [ - ('cost', f'{pit_rat_df.loc[row['key_bbref']]["cost"]}'), - ('rarity_id', int(pit_rat_df.loc[row['key_bbref']]['rarity_id'])), - ('image', f'{CARD_BASE_URL}{player_id}/pitchingcard{urllib.parse.quote("?d=")}{RELEASE_DIRECTORY}') + ("cost", f'{pit_rat_df.loc[row['key_bbref']]["cost"]}'), + ("rarity_id", int(pit_rat_df.loc[row["key_bbref"]]["rarity_id"])), + ( + "image", + f'{CARD_BASE_URL}{player_id}/pitchingcard{urllib.parse.quote("?d=")}{RELEASE_DIRECTORY}', + ), ] - player_index = pstat_df.index[pstat_df['key_bbref'] == row['key_bbref']].tolist() + player_index = pstat_df.index[ + pstat_df["key_bbref"] == row["key_bbref"] + ].tolist() stat_row = pstat_df.iloc[player_index] starter_rating = stat_row.iat[0, starter_index] if starter_rating >= 4: - patch_params.append(('pos_1', 'SP')) + patch_params.append(("pos_1", "SP")) # Clear other position slots for i in range(2, 9): - patch_params.append((f'pos_{i}', None)) + patch_params.append((f"pos_{i}", None)) else: - 
patch_params.append(('pos_1', 'RP')) + patch_params.append(("pos_1", "RP")) closer_rating = stat_row.iat[0, closer_index] if not pd.isna(closer_rating): - patch_params.append(('pos_2', 'CP')) + patch_params.append(("pos_2", "CP")) # Clear remaining position slots for i in range(3, 9): - patch_params.append((f'pos_{i}', None)) + patch_params.append((f"pos_{i}", None)) else: # Clear remaining position slots for i in range(2, 9): - patch_params.append((f'pos_{i}', None)) + patch_params.append((f"pos_{i}", None)) - new_player = await db_patch('players', object_id=player_id, params=patch_params) - new_player['bbref_id'] = row['key_bbref'] + new_player = await db_patch( + "players", object_id=player_id, params=patch_params + ) + new_player["bbref_id"] = row["key_bbref"] all_players.append(new_player) - player_deltas.append([ - new_player['player_id'], new_player['p_name'], p_search['cost'], new_player['cost'], p_search['rarity']['name'], new_player['rarity']['name'] - ]) + player_deltas.append( + [ + new_player["player_id"], + new_player["p_name"], + p_search["cost"], + new_player["cost"], + p_search["rarity"]["name"], + new_player["rarity"]["name"], + ] + ) else: - mlb_player = await mlb_search_or_post(row['key_retro']) - + mlb_player = await mlb_search_or_post(row["key_retro"]) + player_payload = new_player_payload(row, pit_rat_df) - player_index = pstat_df.index[pstat_df['key_bbref'] == row['key_bbref']].tolist() + player_index = pstat_df.index[ + pstat_df["key_bbref"] == row["key_bbref"] + ].tolist() stat_row = pstat_df.iloc[player_index] - + starter_rating = stat_row.iat[0, starter_index] if starter_rating >= 4: - player_payload['pos_1'] = 'SP' + player_payload["pos_1"] = "SP" else: - player_payload['pos_1'] = 'RP' + player_payload["pos_1"] = "RP" closer_rating = stat_row.iat[0, closer_index] if not pd.isna(closer_rating): - player_payload['pos_2'] = 'CP' + player_payload["pos_2"] = "CP" - new_player = await db_post('players', payload=player_payload) + new_player = 
await db_post("players", payload=player_payload) - if 'id' in new_player: - player_id = new_player['id'] + if "id" in new_player: + player_id = new_player["id"] else: - player_id = new_player['player_id'] + player_id = new_player["player_id"] - new_player = await db_patch('players', object_id=player_id, params=[('image', f'{CARD_BASE_URL}{player_id}/pitchingcard{urllib.parse.quote("?d=")}{RELEASE_DIRECTORY}')]) - if 'paperdex' in new_player: - del new_player['paperdex'] + new_player = await db_patch( + "players", + object_id=player_id, + params=[ + ( + "image", + f'{CARD_BASE_URL}{player_id}/pitchingcard{urllib.parse.quote("?d=")}{RELEASE_DIRECTORY}', + ) + ], + ) + if "paperdex" in new_player: + del new_player["paperdex"] - new_player['bbref_id'] = row['key_bbref'] + new_player["bbref_id"] = row["key_bbref"] all_players.append(new_player) - new_players.append([new_player['player_id'], new_player['p_name'], new_player['cost'], new_player['rarity']['name'], new_player['pos_1']]) - + new_players.append( + [ + new_player["player_id"], + new_player["p_name"], + new_player["cost"], + new_player["rarity"]["name"], + new_player["pos_1"], + ] + ) + dev_count += 1 else: - raise KeyError(f'Could not get players - not enough stat DFs were supplied') + raise KeyError(f"Could not get players - not enough stat DFs were supplied") - pd.DataFrame(player_deltas[1:], columns=player_deltas[0]).to_csv(f'{"batter" if bstat_df is not None else "pitcher"}-deltas.csv') - pd.DataFrame(new_players[1:], columns=new_players[0]).to_csv(f'new-{"batter" if bstat_df is not None else "pitcher"}s.csv') + pd.DataFrame(player_deltas[1:], columns=player_deltas[0]).to_csv( + f'{"batter" if bstat_df is not None else "pitcher"}-deltas.csv' + ) + pd.DataFrame(new_players[1:], columns=new_players[0]).to_csv( + f'new-{"batter" if bstat_df is not None else "pitcher"}s.csv' + ) - players_df = pd.DataFrame(all_players).set_index('bbref_id') + players_df = pd.DataFrame(all_players).set_index("bbref_id") return 
players_df async def post_batting_cards(cards_df: pd.DataFrame): all_cards = [] - cards_df.apply(lambda x: all_cards.append({ - 'player_id': int(x["player_id"]), - 'steal_low': x['steal_low'], - 'steal_high': x['steal_high'], - 'steal_auto': x['steal_auto'], - 'steal_jump': x['steal_jump'], - 'bunting': x['bunt'], - 'hit_and_run': x['hit_and_run'], - 'running': x['running'], - 'hand': x['hand'] - }), axis=1) - resp = await db_put('battingcards', payload={'cards': all_cards}, timeout=6) + cards_df.apply( + lambda x: all_cards.append( + { + "player_id": int(x["player_id"]), + "steal_low": x["steal_low"], + "steal_high": x["steal_high"], + "steal_auto": x["steal_auto"], + "steal_jump": x["steal_jump"], + "bunting": x["bunt"], + "hit_and_run": x["hit_and_run"], + "running": x["running"], + "hand": x["hand"], + } + ), + axis=1, + ) + resp = await db_put("battingcards", payload={"cards": all_cards}, timeout=6) if resp is not None: pass else: - log_exception(ValueError, 'Unable to post batting cards') - - bc_query = await db_get('battingcards', params=[('cardset_id', CARDSET_ID)]) - if bc_query['count'] > 0: - bc_data = bc_query['cards'] - + log_exception(ValueError, "Unable to post batting cards") + + bc_query = await db_get("battingcards", params=[("cardset_id", CARDSET_ID)]) + if bc_query["count"] > 0: + bc_data = bc_query["cards"] + for line in bc_data: - line['player_id'] = line['player']['player_id'] - line['key_bbref'] = line['player']['bbref_id'] - line['battingcard_id'] = line['id'] + line["player_id"] = line["player"]["player_id"] + line["key_bbref"] = line["player"]["bbref_id"] + line["battingcard_id"] = line["id"] return pd.DataFrame(bc_data) else: - log_exception(ValueError, 'Unable to pull newly posted batting cards') + log_exception(ValueError, "Unable to pull newly posted batting cards") async def post_pitching_cards(cards_df: pd.DataFrame): all_cards = [] + def get_closer_rating(raw_rating): try: if pd.isnull(raw_rating): @@ -1329,83 +2174,96 @@ async def 
post_pitching_cards(cards_df: pd.DataFrame): except AttributeError: return None - cards_df.apply(lambda x: all_cards.append({ - 'player_id': int(x['player_id']), - 'balk': x['balk'], - 'wild_pitch': x['wild_pitch'], - 'hold': x['hold'], - 'starter_rating': x['starter_rating'], - 'relief_rating': x['relief_rating'], - 'closer_rating': get_closer_rating(x['closer_rating']), - 'batting': x['batting'], - 'hand': x['pitch_hand'].upper() - }), axis=1) - resp = await db_put('pitchingcards', payload={'cards': all_cards}, timeout=6) + cards_df.apply( + lambda x: all_cards.append( + { + "player_id": int(x["player_id"]), + "balk": x["balk"], + "wild_pitch": x["wild_pitch"], + "hold": x["hold"], + "starter_rating": x["starter_rating"], + "relief_rating": x["relief_rating"], + "closer_rating": get_closer_rating(x["closer_rating"]), + "batting": x["batting"], + "hand": x["pitch_hand"].upper(), + } + ), + axis=1, + ) + resp = await db_put("pitchingcards", payload={"cards": all_cards}, timeout=6) if resp is not None: pass else: - log_exception(ValueError, 'Unable to post pitcher cards') - - pc_query = await db_get('pitchingcards', params=[('cardset_id', CARDSET_ID)]) - if pc_query['count'] > 0: - pc_data = pc_query['cards'] - if PLAYER_DESCRIPTION.lower() not in ['live', '1998']: - pc_data = [x for x in pc_query['cards'] if x['player']['mlbplayer']['key_retro'] in PROMO_INCLUSION_RETRO_IDS] + log_exception(ValueError, "Unable to post pitcher cards") + + pc_query = await db_get("pitchingcards", params=[("cardset_id", CARDSET_ID)]) + if pc_query["count"] > 0: + pc_data = pc_query["cards"] + if PLAYER_DESCRIPTION.lower() not in ["live", "1998"]: + pc_data = [ + x + for x in pc_query["cards"] + if x["player"]["mlbplayer"]["key_retro"] in PROMO_INCLUSION_RETRO_IDS + ] for line in pc_data: - line['player_id'] = line['player']['player_id'] - line['key_bbref'] = line['player']['bbref_id'] - line['pitchingcard_id'] = line['id'] + line["player_id"] = line["player"]["player_id"] + 
line["key_bbref"] = line["player"]["bbref_id"] + line["pitchingcard_id"] = line["id"] return pd.DataFrame(pc_data) else: - log_exception(ValueError, 'Unable to pull newly posted pitcher cards') + log_exception(ValueError, "Unable to pull newly posted pitcher cards") async def post_batting_ratings(ratings_df: pd.DataFrame): all_ratings = [] def append_ratings(row): - vl = row['ratings_vL'] - vl['player_id'] = row['player_id'] - vl['battingcard_id'] = row['battingcard_id'] + vl = row["ratings_vL"] + vl["player_id"] = row["player_id"] + vl["battingcard_id"] = row["battingcard_id"] - vr = row['ratings_vR'] - vr['player_id'] = row['player_id'] - vr['battingcard_id'] = row['battingcard_id'] + vr = row["ratings_vR"] + vr["player_id"] = row["player_id"] + vr["battingcard_id"] = row["battingcard_id"] all_ratings.append(vl) all_ratings.append(vr) ratings_df.apply(append_ratings, axis=1) - resp = await db_put('battingcardratings', payload={'ratings': all_ratings}, timeout=6) + resp = await db_put( + "battingcardratings", payload={"ratings": all_ratings}, timeout=6 + ) if resp is not None: return True else: - log_exception(ValueError, 'Unable to post batting ratings') - + log_exception(ValueError, "Unable to post batting ratings") + async def post_pitching_ratings(ratings_df: pd.DataFrame): all_ratings = [] def append_ratings(row): - vl = row['ratings_vL'] - vl['player_id'] = row['player_id'] - vl['pitchingcard_id'] = row['pitchingcard_id'] + vl = row["ratings_vL"] + vl["player_id"] = row["player_id"] + vl["pitchingcard_id"] = row["pitchingcard_id"] - vr = row['ratings_vR'] - vr['player_id'] = row['player_id'] - vr['pitchingcard_id'] = row['pitchingcard_id'] + vr = row["ratings_vR"] + vr["player_id"] = row["player_id"] + vr["pitchingcard_id"] = row["pitchingcard_id"] all_ratings.append(vl) all_ratings.append(vr) ratings_df.apply(append_ratings, axis=1) - resp = await db_put('pitchingcardratings', payload={'ratings': all_ratings}, timeout=6) + resp = await db_put( + 
"pitchingcardratings", payload={"ratings": all_ratings}, timeout=6 + ) if resp is not None: return True else: - log_exception(ValueError, 'Unable to post pitching ratings') + log_exception(ValueError, "Unable to post pitching ratings") async def post_positions(pos_df: pd.DataFrame, delete_existing: bool = False): @@ -1413,58 +2271,63 @@ async def post_positions(pos_df: pd.DataFrame, delete_existing: bool = False): # (e.g., DH positions from buggy runs where outfielders had no defensive positions) # Only delete on the first call (batters), not the second call (pitchers) if delete_existing: - player_ids = pos_df['player_id'].unique().tolist() - logger.info(f'Deleting existing cardpositions for {len(player_ids)} players in current run') - existing_positions = await db_get('cardpositions', params=[('cardset_id', CARDSET_ID)]) - if existing_positions and existing_positions.get('count', 0) > 0: + player_ids = pos_df["player_id"].unique().tolist() + logger.info( + f"Deleting existing cardpositions for {len(player_ids)} players in current run" + ) + existing_positions = await db_get( + "cardpositions", params=[("cardset_id", CARDSET_ID)] + ) + if existing_positions and existing_positions.get("count", 0) > 0: deleted_count = 0 - for pos in existing_positions['positions']: + for pos in existing_positions["positions"]: # Only delete positions for players being processed in this run - if pos['player']['player_id'] in player_ids: + if pos["player"]["player_id"] in player_ids: try: - await db_delete('cardpositions', object_id=pos['id'], timeout=1) + await db_delete("cardpositions", object_id=pos["id"], timeout=1) deleted_count += 1 except Exception as e: - logger.warning(f'Failed to delete cardposition {pos["id"]}: {e}') - logger.info(f'Deleted {deleted_count} positions for players in current run') + logger.warning( + f'Failed to delete cardposition {pos["id"]}: {e}' + ) + logger.info(f"Deleted {deleted_count} positions for players in current run") all_pos = [] def 
append_positions(row): clean_row = row.dropna() new_val = clean_row.to_dict() - new_val['player_id'] = int(row['player_id']) + new_val["player_id"] = int(row["player_id"]) all_pos.append(new_val) + pos_df.apply(append_positions, axis=1) - resp = await db_put('cardpositions', payload={'positions': all_pos}, timeout=6) + resp = await db_put("cardpositions", payload={"positions": all_pos}, timeout=6) if resp is not None: return True else: - log_exception(ValueError, 'Unable to post positions') + log_exception(ValueError, "Unable to post positions") -async def post_batter_data(bs: pd.DataFrame, bc: pd.DataFrame, br: pd.DataFrame, dr: pd.DataFrame) -> int: +async def post_batter_data( + bs: pd.DataFrame, bc: pd.DataFrame, br: pd.DataFrame, dr: pd.DataFrame +) -> int: all_players = await get_or_post_players(bstat_df=bs, bat_rat_df=br, def_rat_df=dr) - + # Post Batting Cards bc = pd.merge( - left=bc, - right=all_players, - how='left', - left_on='key_bbref', - right_on='bbref_id' + left=bc, right=all_players, how="left", left_on="key_bbref", right_on="bbref_id" ) bc = await post_batting_cards(bc) - + # Post Batting Ratings # Only merge the columns we need to avoid corrupting dict columns in br br = pd.merge( left=br, - right=bc[['key_bbref', 'player_id', 'battingcard_id']], - how='left', - left_on='key_bbref', - right_on='key_bbref' + right=bc[["key_bbref", "player_id", "battingcard_id"]], + how="left", + left_on="key_bbref", + right_on="key_bbref", ) br = await post_batting_ratings(br) @@ -1472,23 +2335,21 @@ async def post_batter_data(bs: pd.DataFrame, bc: pd.DataFrame, br: pd.DataFrame, dr = pd.merge( left=dr, right=all_players, - how='right', # 'left', - left_on='key_bbref', - right_on='bbref_id' + how="right", # 'left', + left_on="key_bbref", + right_on="bbref_id", ) await post_positions(dr, delete_existing=True) # Delete on first call (batters) return len(all_players) -async def post_pitcher_data(ps: pd.DataFrame, pc: pd.DataFrame, pr: pd.DataFrame, dr: pd.DataFrame) 
-> int: +async def post_pitcher_data( + ps: pd.DataFrame, pc: pd.DataFrame, pr: pd.DataFrame, dr: pd.DataFrame +) -> int: all_players = await get_or_post_players(pstat_df=ps, pit_rat_df=pr, def_rat_df=dr) ps = pd.merge( - left=all_players, - right=ps, - how='left', - left_on='bbref_id', - right_on='key_bbref' + left=all_players, right=ps, how="left", left_on="bbref_id", right_on="key_bbref" ) # Post Pitching Cards @@ -1498,35 +2359,41 @@ async def post_pitcher_data(ps: pd.DataFrame, pc: pd.DataFrame, pr: pd.DataFrame # Only merge the columns we need to avoid corrupting dict columns in pr pr = pd.merge( left=pr, - right=pc[['key_bbref', 'player_id', 'pitchingcard_id']], - how='left', - left_on='key_bbref', - right_on='key_bbref' + right=pc[["key_bbref", "player_id", "pitchingcard_id"]], + how="left", + left_on="key_bbref", + right_on="key_bbref", ) pr = await post_pitching_ratings(pr) # Post Positions dr = pd.merge( - left=all_players, - right=dr, - how='left', - left_on='bbref_id', - right_on='key_bbref' + left=all_players, right=dr, how="left", left_on="bbref_id", right_on="key_bbref" ) - await post_positions(dr, delete_existing=False) # Don't delete on second call (pitchers) + await post_positions( + dr, delete_existing=False + ) # Don't delete on second call (pitchers) return len(all_players) - -async def run_batters(data_input_path: str, start_date: int, end_date: int, post_data: bool = False, season_pct: float = 1.0): - print(f'Running the batter calcs...') + +async def run_batters( + data_input_path: str, + start_date: int, + end_date: int, + post_data: bool = False, + season_pct: float = 1.0, +): + print(f"Running the batter calcs...") # batter_start = datetime.datetime.now() # Get batting stats - batting_stats = get_batting_stats_by_date(f'{RETRO_FILE_PATH}{EVENTS_FILENAME}', start_date=start_date, end_date=end_date) + batting_stats = get_batting_stats_by_date( + f"{RETRO_FILE_PATH}{EVENTS_FILENAME}", start_date=start_date, end_date=end_date + ) bs_len = 
len(batting_stats) - # end_calc = datetime.datetime.now() + # end_calc = datetime.datetime.now() # print(f'Combined batting stats: {(end_calc - batter_start).total_seconds():.2f}s\n') running_start = datetime.datetime.now() @@ -1536,74 +2403,91 @@ async def run_batters(data_input_path: str, start_date: int, end_date: int, post batting_stats = pd.merge( left=batting_stats, right=running_stats, - how='left', - left_on='key_bbref', - right_on='key_bbref' + how="left", + left_on="key_bbref", + right_on="key_bbref", ) # Handle players who played for multiple teams - keep only highest-level combined totals # Players traded during season have multiple rows: one per team + one combined (2TM, 3TM, etc.) # Prefer: 3TM > 2TM > TOT > individual teams - duplicated_mask = batting_stats['key_bbref'].duplicated(keep=False) + duplicated_mask = batting_stats["key_bbref"].duplicated(keep=False) if duplicated_mask.any(): # Sort by Tm (descending) to prioritize higher-numbered combined totals (3TM > 2TM) # Then drop duplicates, keeping only the first (highest priority) row per player - batting_stats = batting_stats.sort_values('Tm', ascending=False) - batting_stats = batting_stats.drop_duplicates(subset='key_bbref', keep='first') + batting_stats = batting_stats.sort_values("Tm", ascending=False) + batting_stats = batting_stats.drop_duplicates(subset="key_bbref", keep="first") logger.info("Removed team-specific rows for traded batters") bs_len = len(batting_stats) # Update length after removing duplicates end_calc = datetime.datetime.now() - print(f'Running stats: {(end_calc - running_start).total_seconds():.2f}s') + print(f"Running stats: {(end_calc - running_start).total_seconds():.2f}s") if len(batting_stats) != bs_len: - raise DataMismatchError(f'retrosheet_data - run_batters - We started with {bs_len} batting lines and have {len(batting_stats)} after merging with running_stats') - + raise DataMismatchError( + f"retrosheet_data - run_batters - We started with {bs_len} batting lines 
and have {len(batting_stats)} after merging with running_stats" + ) + + # Resolve offense_col for card layout builder + batting_stats = await resolve_offense_cols(batting_stats, api_available=post_data) + # Calculate batting cards card_start = datetime.datetime.now() all_batting_cards = calc_batting_cards(batting_stats, season_pct) card_end = datetime.datetime.now() - print(f'Create batting cards: {(card_end - card_start).total_seconds():.2f}s') + print(f"Create batting cards: {(card_end - card_start).total_seconds():.2f}s") # Calculate batting ratings rating_start = datetime.datetime.now() - batting_stats['battingcard_id'] = batting_stats['key_fangraphs'] + batting_stats["battingcard_id"] = batting_stats["key_fangraphs"] all_batting_ratings = calc_batter_ratings(batting_stats) rating_end = datetime.datetime.now() - print(f'Create batting ratings: {(rating_end - rating_start).total_seconds():.2f}s') + print(f"Create batting ratings: {(rating_end - rating_start).total_seconds():.2f}s") # Calculate defense ratings defense_start = datetime.datetime.now() all_defense_ratings = calc_positions(batting_stats) defense_end = datetime.datetime.now() - print(f'Create defense ratings: {(defense_end - defense_start).total_seconds():.2f}s') + print( + f"Create defense ratings: {(defense_end - defense_start).total_seconds():.2f}s" + ) # Post all data if post_data: - print(f'Posting player data...') + print(f"Posting player data...") post_start = datetime.datetime.now() - num_players = await post_batter_data(batting_stats, all_batting_cards, all_batting_ratings, all_defense_ratings) + num_players = await post_batter_data( + batting_stats, all_batting_cards, all_batting_ratings, all_defense_ratings + ) post_end = datetime.datetime.now() - print(f'Post player data: {(post_end - post_start).total_seconds()}s') + print(f"Post player data: {(post_end - post_start).total_seconds()}s") - post_msg = f'Posted {num_players} players to the database' + post_msg = f"Posted {num_players} 
players to the database" logger.info(post_msg) print(post_msg) else: - post_msg = f'{batting_stats.index.size} total batters\n\nPlayers are NOT being posted to the database' + post_msg = f"{batting_stats.index.size} total batters\n\nPlayers are NOT being posted to the database" logger.warning(post_msg) print(post_msg) return batting_stats -async def run_pitchers(data_input_path: str, start_date: int, end_date: int, post_data: bool = False, season_pct: float = 1.0): +async def run_pitchers( + data_input_path: str, + start_date: int, + end_date: int, + post_data: bool = False, + season_pct: float = 1.0, +): # Get pitching stats - pitching_stats = get_pitching_stats_by_date(f'{RETRO_FILE_PATH}{EVENTS_FILENAME}', start_date=start_date, end_date=end_date) + pitching_stats = get_pitching_stats_by_date( + f"{RETRO_FILE_PATH}{EVENTS_FILENAME}", start_date=start_date, end_date=end_date + ) # Get peripheral stats start_time = datetime.datetime.now() @@ -1612,42 +2496,47 @@ async def run_pitchers(data_input_path: str, start_date: int, end_date: int, pos pitching_stats = pd.merge( left=pitching_stats, right=periph_stats, - how='left', - left_on='key_bbref', - right_on='key_bbref' + how="left", + left_on="key_bbref", + right_on="key_bbref", ) # Handle players who played for multiple teams - keep only highest-level combined totals # Players traded during season have multiple rows: one per team + one combined (2TM, 3TM, etc.) 
# Prefer: 3TM > 2TM > TOT > individual teams - duplicated_mask = pitching_stats['key_bbref'].duplicated(keep=False) + duplicated_mask = pitching_stats["key_bbref"].duplicated(keep=False) if duplicated_mask.any(): # Sort by Tm (descending) to prioritize higher-numbered combined totals (3TM > 2TM) # Then drop duplicates, keeping only the first (highest priority) row per player - pitching_stats = pitching_stats.sort_values('Tm', ascending=False) - pitching_stats = pitching_stats.drop_duplicates(subset='key_bbref', keep='first') + pitching_stats = pitching_stats.sort_values("Tm", ascending=False) + pitching_stats = pitching_stats.drop_duplicates( + subset="key_bbref", keep="first" + ) logger.info(f"Removed team-specific rows for traded players") end_time = datetime.datetime.now() - print(f'Peripheral stats: {(end_time - start_time).total_seconds():.2f}s') + print(f"Peripheral stats: {(end_time - start_time).total_seconds():.2f}s") # Calculate defense ratings start_time = datetime.datetime.now() - df_p = pd.read_csv(f'{DATA_INPUT_FILE_PATH}defense_p.csv').set_index('key_bbref') + df_p = pd.read_csv(f"{DATA_INPUT_FILE_PATH}defense_p.csv").set_index("key_bbref") # Drop 'Tm' from defense data to avoid column name conflicts (we already have it from periph_stats) - if 'Tm' in df_p.columns: - df_p = df_p.drop(columns=['Tm']) + if "Tm" in df_p.columns: + df_p = df_p.drop(columns=["Tm"]) pitching_stats = pd.merge( left=pitching_stats, right=df_p, - how='left', - left_on='key_bbref', - right_on='key_bbref' + how="left", + left_on="key_bbref", + right_on="key_bbref", ) pitching_stats = pitching_stats.fillna(0) all_defense_ratings = calc_pitcher_defense(pitching_stats) end_time = datetime.datetime.now() - print(f'Defense stats: {(end_time - start_time).total_seconds():.2f}s') + print(f"Defense stats: {(end_time - start_time).total_seconds():.2f}s") + + # Resolve offense_col for card layout builder + pitching_stats = await resolve_offense_cols(pitching_stats, 
api_available=post_data) # Calculate pitching cards start_time = datetime.datetime.now() @@ -1655,43 +2544,48 @@ async def run_pitchers(data_input_path: str, start_date: int, end_date: int, pos pitching_stats = pd.merge( left=pitching_stats, right=all_pitching_cards, - how='left', - left_on='key_bbref', - right_on='key_bbref' + how="left", + left_on="key_bbref", + right_on="key_bbref", ) end_time = datetime.datetime.now() - print(f'Pit cards: {(end_time - start_time).total_seconds():.2f}s') + print(f"Pit cards: {(end_time - start_time).total_seconds():.2f}s") # Calculate pitching card ratings start_time = datetime.datetime.now() all_pitching_ratings = calc_pitcher_ratings(pitching_stats) end_time = datetime.datetime.now() - print(f'Pit ratings: {(end_time - start_time).total_seconds():.2f}s') + print(f"Pit ratings: {(end_time - start_time).total_seconds():.2f}s") # Post all data if post_data: - print(f'\nPosting player data...') + print(f"\nPosting player data...") post_start = datetime.datetime.now() - num_players = await post_pitcher_data(pitching_stats, all_pitching_cards, all_pitching_ratings, all_defense_ratings) + num_players = await post_pitcher_data( + pitching_stats, + all_pitching_cards, + all_pitching_ratings, + all_defense_ratings, + ) post_end = datetime.datetime.now() - print(f'Post player data: {(post_end - post_start).total_seconds()}s') + print(f"Post player data: {(post_end - post_start).total_seconds()}s") - post_msg = f'\nPosted {num_players} pitchers to the database' + post_msg = f"\nPosted {num_players} pitchers to the database" logger.info(post_msg) print(post_msg) else: - post_msg = f'{pitching_stats.index.size} total pitchers\n\nPlayers are NOT being posted to the database' + post_msg = f"{pitching_stats.index.size} total pitchers\n\nPlayers are NOT being posted to the database" logger.warning(post_msg) print(post_msg) - + return pitching_stats async def main(args): - if len(PROMO_INCLUSION_RETRO_IDS) > 0 and PLAYER_DESCRIPTION == 'Live': - 
msg = f'Player description is set to *Live*, but there are {len(PROMO_INCLUSION_RETRO_IDS)} IDs in the promo inclusion list. Clear the promo list or change the player description.' - log_exception(ValueError, msg=msg, level='error') + if len(PROMO_INCLUSION_RETRO_IDS) > 0 and PLAYER_DESCRIPTION == "Live": + msg = f"Player description is set to *Live*, but there are {len(PROMO_INCLUSION_RETRO_IDS)} IDs in the promo inclusion list. Clear the promo list or change the player description." + log_exception(ValueError, msg=msg, level="error") # Temporarily commented out for Ryan Zimmerman full season run # if weeks_between(START_DATE, END_DATE) > 5 and len(PROMO_INCLUSION_RETRO_IDS) > 0: @@ -1699,21 +2593,33 @@ async def main(args): # log_exception(ValueError, msg=msg, level='error') batter_start = datetime.datetime.now() - batting_stats = await run_batters(f'{DATA_INPUT_FILE_PATH}', start_date=START_DATE, end_date=END_DATE, post_data=POST_DATA, season_pct=SEASON_PCT) - batting_stats.to_csv(f'batting_stats.csv') + batting_stats = await run_batters( + f"{DATA_INPUT_FILE_PATH}", + start_date=START_DATE, + end_date=END_DATE, + post_data=POST_DATA, + season_pct=SEASON_PCT, + ) + batting_stats.to_csv(f"batting_stats.csv") batter_end = datetime.datetime.now() - print(f'\nBatter time: {(batter_end - batter_start).total_seconds():.2f}s\n') + print(f"\nBatter time: {(batter_end - batter_start).total_seconds():.2f}s\n") pitcher_start = datetime.datetime.now() - pitching_stats = await run_pitchers(f'{DATA_INPUT_FILE_PATH}', start_date=START_DATE, end_date=END_DATE, post_data=POST_DATA, season_pct=SEASON_PCT) - pitching_stats.to_csv(f'pitching_stats.csv') + pitching_stats = await run_pitchers( + f"{DATA_INPUT_FILE_PATH}", + start_date=START_DATE, + end_date=END_DATE, + post_data=POST_DATA, + season_pct=SEASON_PCT, + ) + pitching_stats.to_csv(f"pitching_stats.csv") pitcher_end = datetime.datetime.now() - print(f'\nPitcher time: {(pitcher_end - pitcher_start).total_seconds():.2f}s') + 
print(f"\nPitcher time: {(pitcher_end - pitcher_start).total_seconds():.2f}s") - print(f'Total: {(pitcher_end - batter_start).total_seconds():.2f}s\n\nDone!') + print(f"Total: {(pitcher_end - batter_start).total_seconds():.2f}s\n\nDone!") # await store_defense_to_csv(1998) -if __name__ == '__main__': +if __name__ == "__main__": asyncio.run(main(sys.argv[1:]))