#!/usr/bin/env python3
"""
Baseball Reference Pitching Stats Scraper

This script scrapes the Player Standard Pitching table from Baseball Reference
and saves it as a CSV file in the specified cardset directory.

Usage:
    python pull_pitching_stats.py --year 2025 --cardset-name "2025 Live Cardset"
"""

import argparse
import logging
import sys
from pathlib import Path
from typing import Dict, List

import polars as pl
import requests
from bs4 import BeautifulSoup

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


class PitchingStatsScraper:
    """Scraper for Baseball Reference pitching statistics."""

    def __init__(self, year: int):
        """Initialize the scraper with the target year."""
        self.year = year
        self.base_url = "https://www.baseball-reference.com"
        self.pitching_url = f"{self.base_url}/leagues/majors/{year}-standard-pitching.shtml"

        # HTTP session with a browser-like User-Agent for requests
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })

    def fetch_page(self) -> BeautifulSoup:
        """Fetch the Baseball Reference pitching stats page."""
        logger.info(f"Fetching pitching stats from: {self.pitching_url}")

        try:
            response = self.session.get(self.pitching_url, timeout=30)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            logger.info("Successfully fetched and parsed the page")
            return soup
        except requests.RequestException as e:
            logger.error(f"Failed to fetch page: {e}")
            raise

    def extract_pitching_table(self, soup: BeautifulSoup) -> List[Dict[str, str]]:
        """Extract the players_standard_pitching table from the page."""
        logger.info("Extracting players_standard_pitching table")

        # Find the table by ID
        table = soup.find('table', {'id': 'players_standard_pitching'})
        if not table:
            raise ValueError("Could not find table with ID 'players_standard_pitching'")

        # Extract headers
        header_row = table.find('thead').find('tr')
        if not header_row:
            raise ValueError("Could not find table headers")

        headers = []
        for th in header_row.find_all(['th', 'td']):
            header_text = th.get_text(strip=True)
            if header_text:
                headers.append(header_text)

        logger.info(f"Found {len(headers)} columns: {headers}")

        # Extract data rows
        tbody = table.find('tbody')
        if not tbody:
            raise ValueError("Could not find table body")

        data_rows = []
        for row in tbody.find_all('tr'):
            # Skip repeated header rows that appear inside tbody
            first_th = row.find('th')
            if first_th and 'thead' in (first_th.get('class') or []):
                continue

            row_data = {}
            player_id = ""
            cells = row.find_all(['td', 'th'])

            for i, cell in enumerate(cells):
                if i < len(headers):
                    cell_text = cell.get_text(strip=True)
                    row_data[headers[i]] = cell_text

                    # Extract the player ID from the Player column (usually index 1)
                    if headers[i] == 'Player':
                        # Look for a link in this cell
                        link = cell.find('a')
                        if link and link.get('href'):
                            href = link.get('href')
                            # The href looks like "/players/c/crocega01.shtml";
                            # take the filename and strip the .shtml extension
                            if '/players/' in href:
                                filename = href.split('/')[-1]
                                if filename.endswith('.shtml'):
                                    player_id = filename[:-len('.shtml')]

            # Add the player ID to the row data
            row_data['PlayerID'] = player_id

            # Only keep rows that have data in the first column (skips blank rows)
            if row_data and row_data.get(headers[0], '').strip():
                data_rows.append(row_data)

        logger.info(f"Extracted {len(data_rows)} player records")
        return data_rows
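
    # Illustrative shape of one extracted row (values hypothetical; the
    # PlayerID matches the href example above):
    #   {'Rk': '1', 'Player': '...', 'Age': '26', ..., 'PlayerID': 'crocega01'}
    # Every value stays a string here; numeric typing is left to downstream
    # consumers of the CSV.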

    def map_to_expected_format(self, raw_data: List[Dict[str, str]]) -> pl.DataFrame:
        """Map the scraped data to the expected CSV format."""
        logger.info("Mapping data to expected format")

        # Column mapping from Baseball Reference to our format, based on the
        # 2024 Season Cardset structure and the actual Baseball Reference
        # columns found:
        # ['Rk', 'Player', 'Age', 'Team', 'Lg', 'WAR', 'W', 'L', 'W-L%', 'ERA',
        #  'G', 'GS', 'GF', 'CG', 'SHO', 'SV', 'IP', 'H', 'R', 'ER', 'HR', 'BB',
        #  'IBB', 'SO', 'HBP', 'BK', 'WP', 'BF', 'ERA+', 'FIP', 'WHIP', 'H9',
        #  'HR9', 'BB9', 'SO9', 'SO/BB', 'Awards']
        # Most columns keep their names; only 'Player' and the extracted
        # 'PlayerID' are renamed.
        column_mapping = {col: col for col in [
            'Rk', 'Age', 'Team', 'Lg', 'WAR', 'W', 'L', 'W-L%', 'ERA', 'G',
            'GS', 'GF', 'CG', 'SHO', 'SV', 'IP', 'H', 'R', 'ER', 'HR', 'BB',
            'IBB', 'SO', 'HBP', 'BK', 'WP', 'BF', 'ERA+', 'FIP', 'WHIP',
            'H9', 'HR9', 'BB9', 'SO9', 'SO/BB', 'Awards',
        ]}
        column_mapping['Player'] = 'Name'
        column_mapping['PlayerID'] = 'Name-additional'  # extracted player ID

        # Expected columns in output (based on 2024 Season Cardset/pitching.csv),
        # 38 columns in total
        expected_columns = [
            'Rk', 'Name', 'Age', 'Team', 'Lg', 'WAR', 'W', 'L', 'W-L%', 'ERA',
            'G', 'GS', 'GF', 'CG', 'SHO', 'SV', 'IP', 'H', 'R', 'ER', 'HR', 'BB',
            'IBB', 'SO', 'HBP', 'BK', 'WP', 'BF', 'ERA+', 'FIP', 'WHIP',
            'H9', 'HR9', 'BB9', 'SO9', 'SO/BB', 'Awards', 'Name-additional'
        ]

        # Convert raw data to DataFrame
        if not raw_data:
            logger.warning("No data to process")
            return pl.DataFrame(schema={col: pl.Utf8 for col in expected_columns})

        df = pl.DataFrame(raw_data)
        logger.info(f"Created DataFrame with {df.height} rows and {df.width} columns")

        # Invert the mapping so each output column can be looked up directly
        reverse_mapping = {ours: br for br, ours in column_mapping.items()}

        # Create output DataFrame with the expected structure
        output_data = {}
        for col in expected_columns:
            br_col = reverse_mapping.get(col)
            if br_col and br_col in df.columns:
                # Map from scraped data using the Baseball Reference column name
                output_data[col] = df[br_col].to_list()
            else:
                # Column not available in scraped data; fill with empty strings
                logger.warning(f"Column '{col}' not found in scraped data, filling with empty values")
                output_data[col] = [''] * len(raw_data)

        result_df = pl.DataFrame(output_data)
        logger.info(f"Mapped to expected format: {result_df.height} rows, {result_df.width} columns")
        return result_df

    def save_to_csv(self, df: pl.DataFrame, cardset_name: str) -> str:
        """Save the DataFrame to CSV in the appropriate directory."""
        # Build the output path from the project root (where the data-input
        # directory lives) so the CSV lands in the right place regardless of
        # the current working directory.
        current_path = Path.cwd()
        if current_path.name == "pull-pitching-stats":
            # Running from the plans subdirectory; go up to the project root
            project_root = current_path.parent.parent.parent
        else:
            # Already in the project root
            project_root = current_path

        output_dir = project_root / "data-input" / cardset_name
        output_dir.mkdir(parents=True, exist_ok=True)

        output_path = output_dir / "pitching.csv"
        logger.info(f"Saving to: {output_path}")

        # Save to CSV
        df.write_csv(output_path)
        logger.info(f"Successfully saved {df.height} records to {output_path}")

        return str(output_path)
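
    # For example, running from the project root with
    # --cardset-name "2025 Live Cardset" writes the file to
    # data-input/2025 Live Cardset/pitching.csv (path shown for illustration;
    # the directory is created on demand).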
"""Complete workflow: scrape, process, and save pitching stats.""" logger.info(f"Starting pitching stats scrape for year {self.year}") # Fetch the page soup = self.fetch_page() # Extract the table data raw_data = self.extract_pitching_table(soup) # Map to expected format df = self.map_to_expected_format(raw_data) # Save to CSV output_path = self.save_to_csv(df, cardset_name) logger.info("Pitching stats scraping completed successfully") return output_path def main(): """Main entry point for the script.""" parser = argparse.ArgumentParser( description="Scrape Baseball Reference pitching statistics" ) parser.add_argument( '--year', type=int, required=True, help='Year to scrape (e.g., 2025)' ) parser.add_argument( '--cardset-name', type=str, required=True, help='Name of the cardset directory (e.g., "2025 Live Cardset")' ) parser.add_argument( '--verbose', action='store_true', help='Enable verbose logging' ) args = parser.parse_args() if args.verbose: logging.getLogger().setLevel(logging.DEBUG) try: scraper = PitchingStatsScraper(args.year) output_path = scraper.scrape_and_save(args.cardset_name) print(f"Successfully saved pitching stats to: {output_path}") except Exception as e: logger.error(f"Script failed: {e}") sys.exit(1) if __name__ == '__main__': main()