#!/usr/bin/env python3
"""
Baseball Reference Pitching Stats Scraper

This script scrapes the Player Standard Pitching table from Baseball Reference
and saves it as a CSV file in the specified cardset directory.

Usage:
    python pull_pitching_stats.py --year 2025 --cardset-name "2025 Live Cardset"
"""

import argparse
import logging
import sys
from pathlib import Path
from typing import Dict, List

import polars as pl
import requests
from bs4 import BeautifulSoup

# Configure logging
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)


class PitchingStatsScraper:
    """Scraper for Baseball Reference pitching statistics."""

    def __init__(self, year: int):
        """Initialize the scraper with the target year.

        Args:
            year: Season to scrape (e.g. 2025); used to build the page URL.
        """
        self.year = year
        self.base_url = "https://www.baseball-reference.com"
        self.pitching_url = (
            f"{self.base_url}/leagues/majors/{year}-standard-pitching.shtml"
        )

        # Reuse one HTTP session so the browser-like User-Agent (and the
        # underlying connection) is shared across requests.
        self.session = requests.Session()
        self.session.headers.update(
            {
                "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
            }
        )

    def fetch_page(self) -> BeautifulSoup:
        """Fetch and parse the Baseball Reference pitching stats page.

        Returns:
            The parsed HTML document.

        Raises:
            requests.RequestException: If the HTTP request fails or the
                server returns an error status.
        """
        logger.info("Fetching pitching stats from: %s", self.pitching_url)

        try:
            response = self.session.get(self.pitching_url, timeout=30)
            response.raise_for_status()
        except requests.RequestException as e:
            logger.error("Failed to fetch page: %s", e)
            raise

        soup = BeautifulSoup(response.content, "html.parser")
        logger.info("Successfully fetched and parsed the page")
        return soup

    @staticmethod
    def _extract_player_id(cell) -> str:
        """Return the player ID linked from a Player table cell.

        Hrefs look like "/players/c/crocega01.shtml"; the ID is the filename
        with the ".shtml" suffix removed. Returns "" when the cell has no
        usable player link.
        """
        link = cell.find("a")
        if not (link and link.get("href")):
            return ""
        href = link.get("href")
        if "/players/" not in href:
            return ""
        filename = href.split("/")[-1]
        if filename.endswith(".shtml"):
            return filename[: -len(".shtml")]
        return ""

    def extract_pitching_table(self, soup: BeautifulSoup) -> List[Dict[str, str]]:
        """Extract the players_standard_pitching table from the page.

        Args:
            soup: Parsed page returned by :meth:`fetch_page`.

        Returns:
            One dict per player row, keyed by the table's column headers,
            plus a synthetic "PlayerID" key holding the BR player ID.

        Raises:
            ValueError: If the expected table structure is missing.
        """
        logger.info("Extracting players_standard_pitching table")

        # Find the table by ID
        table = soup.find("table", {"id": "players_standard_pitching"})
        if not table:
            raise ValueError(
                "Could not find table with ID 'players_standard_pitching'"
            )

        # Extract headers. Guard the thead lookup so a missing <thead>
        # raises the intended ValueError instead of an AttributeError.
        thead = table.find("thead")
        header_row = thead.find("tr") if thead else None
        if not header_row:
            raise ValueError("Could not find table headers")

        headers = []
        for th in header_row.find_all(["th", "td"]):
            header_text = th.get_text(strip=True)
            if header_text:
                headers.append(header_text)

        logger.info("Found %d columns: %s", len(headers), headers)

        # Extract data rows
        tbody = table.find("tbody")
        if not tbody:
            raise ValueError("Could not find table body")

        data_rows = []
        for row in tbody.find_all("tr"):
            # Baseball Reference repeats the header inside tbody every few
            # rows; those rows mark their <th> with class "thead".
            first_th = row.find("th")
            if first_th and "thead" in (first_th.get("class") or []):
                continue

            row_data: Dict[str, str] = {}
            player_id = ""

            for i, cell in enumerate(row.find_all(["td", "th"])):
                if i >= len(headers):
                    continue
                row_data[headers[i]] = cell.get_text(strip=True)
                # The Player cell carries the link we mine for the BR ID.
                if headers[i] == "Player":
                    player_id = self._extract_player_id(cell)

            # Add player ID to row data
            row_data["PlayerID"] = player_id

            # Only keep rows that have data in the first column (rank);
            # blank-rank rows are spacers, not players.
            if row_data and row_data.get(headers[0], "").strip():
                data_rows.append(row_data)

        logger.info("Extracted %d player records", len(data_rows))
        return data_rows

    def map_to_expected_format(self, raw_data: List[Dict[str, str]]) -> pl.DataFrame:
        """Map the scraped data to the expected CSV format.

        Args:
            raw_data: Rows produced by :meth:`extract_pitching_table`.

        Returns:
            A DataFrame with exactly the 38 expected columns; columns
            missing from the scrape are filled with empty strings.
        """
        logger.info("Mapping data to expected format")

        # Baseball Reference column -> output column. Most names pass through
        # unchanged; "Player" becomes "Name" and the extracted "PlayerID"
        # becomes "Name-additional", matching the 2024 Season Cardset layout.
        # Actual BR columns observed:
        # ['Rk', 'Player', 'Age', 'Team', 'Lg', 'WAR', 'W', 'L', 'W-L%',
        #  'ERA', 'G', 'GS', 'GF', 'CG', 'SHO', 'SV', 'IP', 'H', 'R', 'ER',
        #  'HR', 'BB', 'IBB', 'SO', 'HBP', 'BK', 'WP', 'BF', 'ERA+', 'FIP',
        #  'WHIP', 'H9', 'HR9', 'BB9', 'SO9', 'SO/BB', 'Awards']
        column_mapping = {
            "Rk": "Rk",
            "Player": "Name",
            "Age": "Age",
            "Team": "Team",
            "Lg": "Lg",
            "WAR": "WAR",
            "W": "W",
            "L": "L",
            "W-L%": "W-L%",
            "ERA": "ERA",
            "G": "G",
            "GS": "GS",
            "GF": "GF",
            "CG": "CG",
            "SHO": "SHO",
            "SV": "SV",
            "IP": "IP",
            "H": "H",
            "R": "R",
            "ER": "ER",
            "HR": "HR",
            "BB": "BB",
            "IBB": "IBB",
            "SO": "SO",
            "HBP": "HBP",
            "BK": "BK",
            "WP": "WP",
            "BF": "BF",
            "ERA+": "ERA+",
            "FIP": "FIP",
            "WHIP": "WHIP",
            "H9": "H9",
            "HR9": "HR9",
            "BB9": "BB9",
            "SO9": "SO9",
            "SO/BB": "SO/BB",
            "Awards": "Awards",
            "PlayerID": "Name-additional",  # Map extracted player ID to Name-additional
        }

        # Expected columns in output (based on 2024 Season Cardset/pitching.csv) - 38 total
        expected_columns = [
            "Rk",
            "Name",
            "Age",
            "Team",
            "Lg",
            "WAR",
            "W",
            "L",
            "W-L%",
            "ERA",
            "G",
            "GS",
            "GF",
            "CG",
            "SHO",
            "SV",
            "IP",
            "H",
            "R",
            "ER",
            "HR",
            "BB",
            "IBB",
            "SO",
            "HBP",
            "BK",
            "WP",
            "BF",
            "ERA+",
            "FIP",
            "WHIP",
            "H9",
            "HR9",
            "BB9",
            "SO9",
            "SO/BB",
            "Awards",
            "Name-additional",
        ]

        if not raw_data:
            logger.warning("No data to process")
            return pl.DataFrame(schema={col: pl.Utf8 for col in expected_columns})

        df = pl.DataFrame(raw_data)
        logger.info(
            "Created DataFrame with %d rows and %d columns", df.height, df.width
        )

        # Invert the mapping once instead of scanning it for every output
        # column (the original did an O(n^2) reverse lookup).
        br_col_for = {out_col: br_col for br_col, out_col in column_mapping.items()}

        output_data = {}
        for col in expected_columns:
            br_col = br_col_for.get(col)
            if br_col and br_col in df.columns:
                # Map from scraped data using the Baseball Reference column name
                output_data[col] = df[br_col].to_list()
            else:
                # Column not available in scraped data, fill with empty strings
                logger.warning(
                    "Column '%s' not found in scraped data, filling with empty values",
                    col,
                )
                output_data[col] = [""] * len(raw_data)

        result_df = pl.DataFrame(output_data)
        logger.info(
            "Mapped to expected format: %d rows, %d columns",
            result_df.height,
            result_df.width,
        )
        return result_df

    def save_to_csv(self, df: pl.DataFrame, cardset_name: str) -> str:
        """Save the DataFrame to CSV in the appropriate directory.

        Args:
            df: Mapped pitching stats.
            cardset_name: Cardset directory name under data-input/.

        Returns:
            The output file path as a string.
        """
        # Resolve the project root so output always lands in
        # <root>/data-input/<cardset>/pitching.csv regardless of cwd.
        current_path = Path.cwd()
        if current_path.name == "pull-pitching-stats":
            # We're running from the plans subdirectory, go up to the project root
            project_root = current_path.parent.parent.parent
        else:
            # We're already in the project root
            project_root = current_path

        output_dir = project_root / "data-input" / cardset_name
        output_dir.mkdir(parents=True, exist_ok=True)

        output_path = output_dir / "pitching.csv"
        logger.info("Saving to: %s", output_path)

        df.write_csv(output_path)
        logger.info("Successfully saved %d records to %s", df.height, output_path)
        return str(output_path)

    def scrape_and_save(self, cardset_name: str) -> str:
        """Complete workflow: scrape, process, and save pitching stats.

        Args:
            cardset_name: Cardset directory name under data-input/.

        Returns:
            Path to the written CSV file.
        """
        logger.info("Starting pitching stats scrape for year %d", self.year)

        soup = self.fetch_page()
        raw_data = self.extract_pitching_table(soup)
        df = self.map_to_expected_format(raw_data)
        output_path = self.save_to_csv(df, cardset_name)

        logger.info("Pitching stats scraping completed successfully")
        return output_path


def main():
    """Main entry point for the script."""
    parser = argparse.ArgumentParser(
        description="Scrape Baseball Reference pitching statistics"
    )
    parser.add_argument(
        "--year", type=int, required=True, help="Year to scrape (e.g., 2025)"
    )
    parser.add_argument(
        "--cardset-name",
        type=str,
        required=True,
        help='Name of the cardset directory (e.g., "2025 Live Cardset")',
    )
    parser.add_argument(
        "--verbose", action="store_true", help="Enable verbose logging"
    )

    args = parser.parse_args()

    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    try:
        scraper = PitchingStatsScraper(args.year)
        output_path = scraper.scrape_and_save(args.cardset_name)
        print(f"Successfully saved pitching stats to: {output_path}")
    except Exception as e:
        # Top-level boundary: log and exit non-zero for shell callers.
        logger.error("Script failed: %s", e)
        sys.exit(1)


if __name__ == "__main__":
    main()