#!/usr/bin/env python3
"""
Baseball Reference Pitching Stats Scraper

This script scrapes the Player Standard Pitching table from Baseball Reference
and saves it as a CSV file in the specified cardset directory.

Usage:
    python pull_pitching_stats.py --year 2025 --cardset-name "2025 Live Cardset"
"""

import argparse
import logging
import sys
from pathlib import Path
from typing import Dict, List

import polars as pl
import requests
from bs4 import BeautifulSoup

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


class PitchingStatsScraper:
    """Scraper for Baseball Reference pitching statistics."""

    def __init__(self, year: int):
        """Initialize the scraper with the target year."""
        self.year = year
        self.base_url = "https://www.baseball-reference.com"
        self.pitching_url = f"{self.base_url}/leagues/majors/{year}-standard-pitching.shtml"
        # HTTP session for requests
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })

    def fetch_page(self) -> BeautifulSoup:
        """Fetch the Baseball Reference pitching stats page."""
        logger.info(f"Fetching pitching stats from: {self.pitching_url}")
        try:
            response = self.session.get(self.pitching_url, timeout=30)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            logger.info("Successfully fetched and parsed the page")
            return soup
        except requests.RequestException as e:
            logger.error(f"Failed to fetch page: {e}")
            raise
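
    # Note: Baseball Reference throttles heavy automated traffic, so a single
    # failed request is not unusual. If fetches start failing intermittently,
    # a simple retry with backoff around session.get is one option. A sketch
    # (the attempt count and delays are arbitrary choices, not values from
    # this project):
    #
    #   import time
    #   for attempt in range(3):
    #       try:
    #           response = self.session.get(self.pitching_url, timeout=30)
    #           response.raise_for_status()
    #           break
    #       except requests.RequestException:
    #           if attempt == 2:
    #               raise
    #           time.sleep(5 * (attempt + 1))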

    def extract_pitching_table(self, soup: BeautifulSoup) -> List[Dict[str, str]]:
        """Extract the players_standard_pitching table from the page."""
        logger.info("Extracting players_standard_pitching table")
        # Find the table by ID
        table = soup.find('table', {'id': 'players_standard_pitching'})
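        # Note: sports-reference pages sometimes embed tables inside HTML
        # comments, which a plain find() will not see. If the lookup above
        # ever starts returning None, a fallback along these lines (a sketch,
        # assuming the table keeps the same id inside the comment) can
        # recover it before giving up:
        #
        #   from bs4 import Comment
        #   for c in soup.find_all(string=lambda t: isinstance(t, Comment)):
        #       if 'players_standard_pitching' in c:
        #           table = BeautifulSoup(c, 'html.parser').find(
        #               'table', {'id': 'players_standard_pitching'})
        #           break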
        if not table:
            raise ValueError("Could not find table with ID 'players_standard_pitching'")

        # Extract headers
        header_row = table.find('thead').find('tr')
        if not header_row:
            raise ValueError("Could not find table headers")
        headers = []
        for th in header_row.find_all(['th', 'td']):
            header_text = th.get_text(strip=True)
            if header_text:
                headers.append(header_text)
        logger.info(f"Found {len(headers)} columns: {headers}")

        # Extract data rows
        tbody = table.find('tbody')
        if not tbody:
            raise ValueError("Could not find table body")
        data_rows = []
        for row in tbody.find_all('tr'):
            # Skip repeated header rows that appear inside tbody
            first_th = row.find('th')
            if first_th and 'thead' in (first_th.get('class') or []):
                continue
            row_data = {}
            player_id = ""
            cells = row.find_all(['td', 'th'])
            for i, cell in enumerate(cells):
                if i < len(headers):
                    cell_text = cell.get_text(strip=True)
                    row_data[headers[i]] = cell_text
                    # Extract the player ID from the link in the Player column
                    if headers[i] == 'Player':
                        link = cell.find('a')
                        if link and link.get('href'):
                            href = link.get('href')
                            # hrefs look like "/players/c/crocega01.shtml";
                            # the player ID is the filename with .shtml stripped
                            if '/players/' in href:
                                filename = href.split('/')[-1]
                                if filename.endswith('.shtml'):
                                    player_id = filename[:-6]
            # Add the extracted player ID to the row
            row_data['PlayerID'] = player_id
            # Keep only rows with data in the first column ('Rk'); this drops
            # blank separator rows
            if row_data and row_data.get(headers[0], '').strip():
                data_rows.append(row_data)
        logger.info(f"Extracted {len(data_rows)} player records")
        return data_rows
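
    # Each row returned above is a flat dict keyed by the scraped headers plus
    # 'PlayerID'. Illustrative shape (the player values here are made up):
    #
    #   {'Rk': '1', 'Player': 'Jane Doe', 'Age': '27', ..., 'PlayerID': 'doeja01'}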

    def map_to_expected_format(self, raw_data: List[Dict[str, str]]) -> pl.DataFrame:
        """Map the scraped data to the expected CSV format."""
        logger.info("Mapping data to expected format")
        # Column mapping from Baseball Reference names to our format, based on
        # the 2024 Season Cardset structure and the actual Baseball Reference
        # columns found:
        # ['Rk', 'Player', 'Age', 'Team', 'Lg', 'WAR', 'W', 'L', 'W-L%', 'ERA',
        #  'G', 'GS', 'GF', 'CG', 'SHO', 'SV', 'IP', 'H', 'R', 'ER', 'HR', 'BB',
        #  'IBB', 'SO', 'HBP', 'BK', 'WP', 'BF', 'ERA+', 'FIP', 'WHIP', 'H9',
        #  'HR9', 'BB9', 'SO9', 'SO/BB', 'Awards']
        # Most columns pass through unchanged; only 'Player' and the extracted
        # 'PlayerID' are renamed.
        column_mapping = {
            'Rk': 'Rk',
            'Player': 'Name',
            'Age': 'Age',
            'Team': 'Team',
            'Lg': 'Lg',
            'WAR': 'WAR',
            'W': 'W',
            'L': 'L',
            'W-L%': 'W-L%',
            'ERA': 'ERA',
            'G': 'G',
            'GS': 'GS',
            'GF': 'GF',
            'CG': 'CG',
            'SHO': 'SHO',
            'SV': 'SV',
            'IP': 'IP',
            'H': 'H',
            'R': 'R',
            'ER': 'ER',
            'HR': 'HR',
            'BB': 'BB',
            'IBB': 'IBB',
            'SO': 'SO',
            'HBP': 'HBP',
            'BK': 'BK',
            'WP': 'WP',
            'BF': 'BF',
            'ERA+': 'ERA+',
            'FIP': 'FIP',
            'WHIP': 'WHIP',
            'H9': 'H9',
            'HR9': 'HR9',
            'BB9': 'BB9',
            'SO9': 'SO9',
            'SO/BB': 'SO/BB',
            'Awards': 'Awards',
            'PlayerID': 'Name-additional',  # extracted player ID
        }

        # Expected columns in the output (based on
        # "2024 Season Cardset/pitching.csv"), 38 columns total
        expected_columns = [
            'Rk', 'Name', 'Age', 'Team', 'Lg', 'WAR', 'W', 'L', 'W-L%', 'ERA',
            'G', 'GS', 'GF', 'CG', 'SHO', 'SV', 'IP', 'H', 'R', 'ER',
            'HR', 'BB', 'IBB', 'SO', 'HBP', 'BK', 'WP', 'BF', 'ERA+', 'FIP',
            'WHIP', 'H9', 'HR9', 'BB9', 'SO9', 'SO/BB', 'Awards', 'Name-additional'
        ]

        # Convert raw data to a DataFrame
        if not raw_data:
            logger.warning("No data to process")
            return pl.DataFrame(schema={col: pl.Utf8 for col in expected_columns})
        df = pl.DataFrame(raw_data)
        logger.info(f"Created DataFrame with {df.height} rows and {df.width} columns")

        # Build the output column by column; invert column_mapping once to
        # look up the Baseball Reference name for each expected column
        reverse_mapping = {ours: br for br, ours in column_mapping.items()}
        output_data = {}
        for col in expected_columns:
            br_col = reverse_mapping.get(col)
            if br_col and br_col in df.columns:
                # Copy the column from the scraped data
                output_data[col] = df[br_col].to_list()
            else:
                # Column not available in the scraped data; fill with empty strings
                logger.warning(f"Column '{col}' not found in scraped data, filling with empty values")
                output_data[col] = [''] * len(raw_data)
        result_df = pl.DataFrame(output_data)
        logger.info(f"Mapped to expected format: {result_df.height} rows, {result_df.width} columns")
        return result_df
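
    # Every output column is kept as Utf8 so the CSV matches the reference
    # file byte-for-byte. If a downstream step needs real numeric types, a
    # polars cast is the usual approach; a sketch (strict=False turns
    # unparseable strings into nulls, and the chosen columns are just
    # examples):
    #
    #   typed = result_df.with_columns(
    #       pl.col('ERA').cast(pl.Float64, strict=False),
    #       pl.col('SO').cast(pl.Int64, strict=False),
    #   )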

    def save_to_csv(self, df: pl.DataFrame, cardset_name: str) -> str:
        """Save the DataFrame to CSV in the appropriate directory."""
        # Resolve an absolute output path under the project root (the
        # directory that contains data-input)
        current_path = Path.cwd()
        if current_path.name == "pull-pitching-stats":
            # Running from the plans subdirectory; go up to the project root
            project_root = current_path.parent.parent.parent
        else:
            # Already in the project root
            project_root = current_path
        output_dir = project_root / "data-input" / cardset_name
        output_dir.mkdir(parents=True, exist_ok=True)
        output_path = output_dir / "pitching.csv"
        logger.info(f"Saving to: {output_path}")
        df.write_csv(output_path)
        logger.info(f"Successfully saved {df.height} records to {output_path}")
        return str(output_path)

    def scrape_and_save(self, cardset_name: str) -> str:
        """Complete workflow: scrape, process, and save pitching stats."""
        logger.info(f"Starting pitching stats scrape for year {self.year}")
        # Fetch the page
        soup = self.fetch_page()
        # Extract the table data
        raw_data = self.extract_pitching_table(soup)
        # Map to the expected format
        df = self.map_to_expected_format(raw_data)
        # Save to CSV
        output_path = self.save_to_csv(df, cardset_name)
        logger.info("Pitching stats scraping completed successfully")
        return output_path


def main():
    """Main entry point for the script."""
    parser = argparse.ArgumentParser(
        description="Scrape Baseball Reference pitching statistics"
    )
    parser.add_argument(
        '--year',
        type=int,
        required=True,
        help='Year to scrape (e.g., 2025)'
    )
    parser.add_argument(
        '--cardset-name',
        type=str,
        required=True,
        help='Name of the cardset directory (e.g., "2025 Live Cardset")'
    )
    parser.add_argument(
        '--verbose',
        action='store_true',
        help='Enable verbose logging'
    )
    args = parser.parse_args()

    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    try:
        scraper = PitchingStatsScraper(args.year)
        output_path = scraper.scrape_and_save(args.cardset_name)
        print(f"Successfully saved pitching stats to: {output_path}")
    except Exception as e:
        logger.error(f"Script failed: {e}")
        sys.exit(1)


if __name__ == '__main__':
    main()
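
# The scraper can also be driven programmatically; a sketch (the cardset name
# is the one from the usage example above, not a requirement):
#
#   from pull_pitching_stats import PitchingStatsScraper
#
#   scraper = PitchingStatsScraper(2025)
#   csv_path = scraper.scrape_and_save("2025 Live Cardset")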