paper-dynasty-card-creation/pull_pitching_stats.py
Cal Corum 0a17745389 Run black and ruff across entire codebase
Standardize formatting with black and apply ruff auto-fixes.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-08 14:24:33 -05:00

339 lines
11 KiB
Python

#!/usr/bin/env python3
"""
Baseball Reference Pitching Stats Scraper
This script scrapes the Player Standard Pitching table from Baseball Reference
and saves it as a CSV file in the specified cardset directory.
Usage:
python pull_pitching_stats.py --year 2025 --cardset-name "2025 Live Cardset"
"""
import argparse
import logging
import sys
from pathlib import Path
from typing import Dict, List
import polars as pl
import requests
from bs4 import BeautifulSoup
# Configure logging
# Root-handler setup: timestamped, level-tagged messages at INFO and above.
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)
# Module-level logger named after this module, per logging convention.
logger = logging.getLogger(__name__)
class PitchingStatsScraper:
    """Scraper for Baseball Reference pitching statistics.

    Downloads the season-wide Player Standard Pitching page, extracts the
    stats table, reshapes it into the cardset CSV schema, and writes the
    result to ``data-input/<cardset>/pitching.csv``.
    """

    def __init__(self, year: int):
        """Initialize the scraper with the target year.

        Args:
            year: Season to scrape (e.g. 2025).
        """
        self.year = year
        self.base_url = "https://www.baseball-reference.com"
        self.pitching_url = (
            f"{self.base_url}/leagues/majors/{year}-standard-pitching.shtml"
        )
        # One shared HTTP session; a browser-like User-Agent is sent because
        # the site tends to reject the default python-requests client string.
        self.session = requests.Session()
        self.session.headers.update(
            {
                "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
            }
        )

    def fetch_page(self) -> BeautifulSoup:
        """Fetch the Baseball Reference pitching stats page.

        Returns:
            Parsed HTML document for the season's standard-pitching page.

        Raises:
            requests.RequestException: On network failure or non-2xx status.
        """
        logger.info("Fetching pitching stats from: %s", self.pitching_url)
        try:
            response = self.session.get(self.pitching_url, timeout=30)
            response.raise_for_status()
        except requests.RequestException as e:
            logger.error("Failed to fetch page: %s", e)
            raise
        soup = BeautifulSoup(response.content, "html.parser")
        logger.info("Successfully fetched and parsed the page")
        return soup

    def extract_pitching_table(self, soup: BeautifulSoup) -> List[Dict[str, str]]:
        """Extract the players_standard_pitching table from the page.

        Args:
            soup: Parsed pitching-stats page.

        Returns:
            One dict per player row, keyed by column header, with an extra
            "PlayerID" key holding the Baseball Reference player slug.

        Raises:
            ValueError: If the table, its headers, or its body are missing.
        """
        logger.info("Extracting players_standard_pitching table")

        table = soup.find("table", {"id": "players_standard_pitching"})
        if not table:
            raise ValueError("Could not find table with ID 'players_standard_pitching'")

        # Fix: guard the missing-<thead> case explicitly; the previous code
        # raised AttributeError on .find("tr") instead of the intended error.
        thead = table.find("thead")
        header_row = thead.find("tr") if thead else None
        if not header_row:
            raise ValueError("Could not find table headers")

        headers = [
            th.get_text(strip=True)
            for th in header_row.find_all(["th", "td"])
            if th.get_text(strip=True)
        ]
        logger.info("Found %d columns: %s", len(headers), headers)

        tbody = table.find("tbody")
        if not tbody:
            raise ValueError("Could not find table body")

        data_rows: List[Dict[str, str]] = []
        for row in tbody.find_all("tr"):
            # Skip the repeated header rows that BBRef embeds inside <tbody>.
            first_th = row.find("th")
            if first_th and "thead" in (first_th.get("class") or []):
                continue

            row_data: Dict[str, str] = {}
            player_id = ""
            for i, cell in enumerate(row.find_all(["td", "th"])):
                if i >= len(headers):
                    break  # stray cells beyond the header count carry no data
                row_data[headers[i]] = cell.get_text(strip=True)
                # The Player cell links to the player's profile page; mine the
                # link for the unique player slug.
                if headers[i] == "Player":
                    player_id = self._extract_player_id(cell)
            row_data["PlayerID"] = player_id

            # Only keep rows that carry player data (non-empty first column).
            if row_data and row_data.get(headers[0], "").strip():
                data_rows.append(row_data)

        logger.info("Extracted %d player records", len(data_rows))
        return data_rows

    @staticmethod
    def _extract_player_id(cell) -> str:
        """Return the player slug from a Player cell's profile link.

        Hrefs look like "/players/c/crocega01.shtml"; the slug is the final
        path component minus the ".shtml" suffix. Returns "" when no usable
        link is present.
        """
        link = cell.find("a")
        href = link.get("href") if link else None
        if href and "/players/" in href:
            filename = href.split("/")[-1]
            if filename.endswith(".shtml"):
                return filename[: -len(".shtml")]
        return ""

    def map_to_expected_format(self, raw_data: List[Dict[str, str]]) -> pl.DataFrame:
        """Map the scraped data to the expected CSV format.

        Args:
            raw_data: Rows produced by ``extract_pitching_table``.

        Returns:
            DataFrame with exactly the 38 cardset columns; columns absent
            from the scrape are filled with empty strings.
        """
        logger.info("Mapping data to expected format")

        # Baseball Reference column -> cardset column. Everything except
        # Player and PlayerID passes through under the same name.
        # Based on 2024 Season Cardset structure and actual Baseball Reference columns found:
        # ['Rk', 'Player', 'Age', 'Team', 'Lg', 'WAR', 'W', 'L', 'W-L%', 'ERA', 'G', 'GS', 'GF', 'CG', 'SHO', 'SV',
        # 'IP', 'H', 'R', 'ER', 'HR', 'BB', 'IBB', 'SO', 'HBP', 'BK', 'WP', 'BF', 'ERA+', 'FIP', 'WHIP', 'H9', 'HR9', 'BB9', 'SO9', 'SO/BB', 'Awards']
        column_mapping = {
            "Rk": "Rk",
            "Player": "Name",
            "Age": "Age",
            "Team": "Team",
            "Lg": "Lg",
            "WAR": "WAR",
            "W": "W",
            "L": "L",
            "W-L%": "W-L%",
            "ERA": "ERA",
            "G": "G",
            "GS": "GS",
            "GF": "GF",
            "CG": "CG",
            "SHO": "SHO",
            "SV": "SV",
            "IP": "IP",
            "H": "H",
            "R": "R",
            "ER": "ER",
            "HR": "HR",
            "BB": "BB",
            "IBB": "IBB",
            "SO": "SO",
            "HBP": "HBP",
            "BK": "BK",
            "WP": "WP",
            "BF": "BF",
            "ERA+": "ERA+",
            "FIP": "FIP",
            "WHIP": "WHIP",
            "H9": "H9",
            "HR9": "HR9",
            "BB9": "BB9",
            "SO9": "SO9",
            "SO/BB": "SO/BB",
            "Awards": "Awards",
            "PlayerID": "Name-additional",  # Map extracted player ID to Name-additional
        }

        # Expected columns in output (based on 2024 Season Cardset/pitching.csv) - 38 total columns
        expected_columns = [
            "Rk",
            "Name",
            "Age",
            "Team",
            "Lg",
            "WAR",
            "W",
            "L",
            "W-L%",
            "ERA",
            "G",
            "GS",
            "GF",
            "CG",
            "SHO",
            "SV",
            "IP",
            "H",
            "R",
            "ER",
            "HR",
            "BB",
            "IBB",
            "SO",
            "HBP",
            "BK",
            "WP",
            "BF",
            "ERA+",
            "FIP",
            "WHIP",
            "H9",
            "HR9",
            "BB9",
            "SO9",
            "SO/BB",
            "Awards",
            "Name-additional",
        ]

        if not raw_data:
            logger.warning("No data to process")
            return pl.DataFrame(schema={col: pl.Utf8 for col in expected_columns})

        df = pl.DataFrame(raw_data)
        logger.info(
            "Created DataFrame with %d rows and %d columns", df.height, df.width
        )

        # Fix: invert the mapping once instead of re-scanning the whole dict
        # for every output column (the previous reverse lookup was quadratic).
        source_column = {ours: theirs for theirs, ours in column_mapping.items()}

        output_data = {}
        for col in expected_columns:
            br_col = source_column.get(col)
            if br_col and br_col in df.columns:
                # Map from scraped data using the Baseball Reference column name.
                output_data[col] = df[br_col].to_list()
            else:
                # Column not available in scraped data; keep the schema stable
                # by filling with empty strings.
                logger.warning(
                    "Column '%s' not found in scraped data, filling with empty values",
                    col,
                )
                output_data[col] = [""] * len(raw_data)

        result_df = pl.DataFrame(output_data)
        logger.info(
            "Mapped to expected format: %d rows, %d columns",
            result_df.height,
            result_df.width,
        )
        return result_df

    def save_to_csv(self, df: pl.DataFrame, cardset_name: str) -> str:
        """Save the DataFrame to CSV in the appropriate directory.

        Args:
            df: Pitching stats already in the cardset schema.
            cardset_name: Directory name under data-input/ (e.g. "2025 Live Cardset").

        Returns:
            The written file path as a string.
        """
        # Resolve the project root so output lands in <root>/data-input no
        # matter which directory the script was launched from.
        current_path = Path.cwd()
        if current_path.name == "pull-pitching-stats":
            # Running from the plans subdirectory: climb up to the project root.
            project_root = current_path.parent.parent.parent
        else:
            # Otherwise assume we are already at the project root.
            project_root = current_path

        output_dir = project_root / "data-input" / cardset_name
        output_dir.mkdir(parents=True, exist_ok=True)
        output_path = output_dir / "pitching.csv"

        logger.info("Saving to: %s", output_path)
        df.write_csv(output_path)
        logger.info("Successfully saved %d records to %s", df.height, output_path)
        return str(output_path)

    def scrape_and_save(self, cardset_name: str) -> str:
        """Complete workflow: scrape, process, and save pitching stats.

        Args:
            cardset_name: Directory name under data-input/ for the output CSV.

        Returns:
            Path of the written pitching.csv as a string.
        """
        logger.info("Starting pitching stats scrape for year %s", self.year)
        soup = self.fetch_page()
        raw_data = self.extract_pitching_table(soup)
        df = self.map_to_expected_format(raw_data)
        output_path = self.save_to_csv(df, cardset_name)
        logger.info("Pitching stats scraping completed successfully")
        return output_path
def main():
    """Command-line entry point: parse arguments, run the scraper, exit 1 on failure."""
    arg_parser = argparse.ArgumentParser(
        description="Scrape Baseball Reference pitching statistics"
    )
    arg_parser.add_argument(
        "--year", type=int, required=True, help="Year to scrape (e.g., 2025)"
    )
    arg_parser.add_argument(
        "--cardset-name",
        type=str,
        required=True,
        help='Name of the cardset directory (e.g., "2025 Live Cardset")',
    )
    arg_parser.add_argument(
        "--verbose", action="store_true", help="Enable verbose logging"
    )
    options = arg_parser.parse_args()

    # Verbose mode lowers the root logger threshold to DEBUG.
    if options.verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    try:
        scraper = PitchingStatsScraper(options.year)
        saved_path = scraper.scrape_and_save(options.cardset_name)
        print(f"Successfully saved pitching stats to: {saved_path}")
    except Exception as e:
        # Top-level boundary: log the failure and signal it via exit status.
        logger.error(f"Script failed: {e}")
        sys.exit(1)


if __name__ == "__main__":
    main()