paper-dynasty-card-creation/pull_pitching_stats.py
Cal Corum 0a17745389 Run black and ruff across entire codebase
Standardize formatting with black and apply ruff auto-fixes.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-08 14:24:33 -05:00

339 lines
11 KiB
Python

#!/usr/bin/env python3
"""
Baseball Reference Pitching Stats Scraper
This script scrapes the Player Standard Pitching table from Baseball Reference
and saves it as a CSV file in the specified cardset directory.
Usage:
python pull_pitching_stats.py --year 2025 --cardset-name "2025 Live Cardset"
"""
import argparse
import logging
import sys
from pathlib import Path
from typing import Dict, List
import polars as pl
import requests
from bs4 import BeautifulSoup
# Configure logging
# Root-handler setup: timestamped, level-tagged messages at INFO and above.
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)
# Module-level logger named after this module, per logging convention.
logger = logging.getLogger(__name__)
class PitchingStatsScraper:
    """Scraper for Baseball Reference pitching statistics.

    Downloads the season-wide Player Standard Pitching page, extracts the
    stats table, reshapes it into the cardset CSV schema, and writes the
    result to ``data-input/<cardset>/pitching.csv``.
    """

    def __init__(self, year: int):
        """Initialize the scraper with the target year.

        Args:
            year: Season to scrape (e.g. 2025).
        """
        self.year = year
        self.base_url = "https://www.baseball-reference.com"
        self.pitching_url = (
            f"{self.base_url}/leagues/majors/{year}-standard-pitching.shtml"
        )
        # One shared HTTP session; a browser-like User-Agent is sent because
        # the site tends to reject the default python-requests client string.
        self.session = requests.Session()
        self.session.headers.update(
            {
                "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
            }
        )

    def fetch_page(self) -> BeautifulSoup:
        """Fetch the Baseball Reference pitching stats page.

        Returns:
            Parsed HTML document for the season's standard-pitching page.

        Raises:
            requests.RequestException: On network failure or non-2xx status.
        """
        logger.info("Fetching pitching stats from: %s", self.pitching_url)
        try:
            response = self.session.get(self.pitching_url, timeout=30)
            response.raise_for_status()
        except requests.RequestException as e:
            logger.error("Failed to fetch page: %s", e)
            raise
        soup = BeautifulSoup(response.content, "html.parser")
        logger.info("Successfully fetched and parsed the page")
        return soup

    def extract_pitching_table(self, soup: BeautifulSoup) -> List[Dict[str, str]]:
        """Extract the players_standard_pitching table from the page.

        Args:
            soup: Parsed pitching-stats page.

        Returns:
            One dict per player row, keyed by column header, with an extra
            "PlayerID" key holding the Baseball Reference player slug.

        Raises:
            ValueError: If the table, its headers, or its body are missing.
        """
        logger.info("Extracting players_standard_pitching table")

        table = soup.find("table", {"id": "players_standard_pitching"})
        if not table:
            raise ValueError("Could not find table with ID 'players_standard_pitching'")

        # Fix: guard the missing-<thead> case explicitly; the previous code
        # raised AttributeError on .find("tr") instead of the intended error.
        thead = table.find("thead")
        header_row = thead.find("tr") if thead else None
        if not header_row:
            raise ValueError("Could not find table headers")

        headers = [
            th.get_text(strip=True)
            for th in header_row.find_all(["th", "td"])
            if th.get_text(strip=True)
        ]
        logger.info("Found %d columns: %s", len(headers), headers)

        tbody = table.find("tbody")
        if not tbody:
            raise ValueError("Could not find table body")

        data_rows: List[Dict[str, str]] = []
        for row in tbody.find_all("tr"):
            # Skip the repeated header rows that BBRef embeds inside <tbody>.
            first_th = row.find("th")
            if first_th and "thead" in (first_th.get("class") or []):
                continue

            row_data: Dict[str, str] = {}
            player_id = ""
            for i, cell in enumerate(row.find_all(["td", "th"])):
                if i >= len(headers):
                    break  # stray cells beyond the header count carry no data
                row_data[headers[i]] = cell.get_text(strip=True)
                # The Player cell links to the player's profile page; mine the
                # link for the unique player slug.
                if headers[i] == "Player":
                    player_id = self._extract_player_id(cell)
            row_data["PlayerID"] = player_id

            # Only keep rows that carry player data (non-empty first column).
            if row_data and row_data.get(headers[0], "").strip():
                data_rows.append(row_data)

        logger.info("Extracted %d player records", len(data_rows))
        return data_rows

    @staticmethod
    def _extract_player_id(cell) -> str:
        """Return the player slug from a Player cell's profile link.

        Hrefs look like "/players/c/crocega01.shtml"; the slug is the final
        path component minus the ".shtml" suffix. Returns "" when no usable
        link is present.
        """
        link = cell.find("a")
        href = link.get("href") if link else None
        if href and "/players/" in href:
            filename = href.split("/")[-1]
            if filename.endswith(".shtml"):
                return filename[: -len(".shtml")]
        return ""

    def map_to_expected_format(self, raw_data: List[Dict[str, str]]) -> pl.DataFrame:
        """Map the scraped data to the expected CSV format.

        Args:
            raw_data: Rows produced by ``extract_pitching_table``.

        Returns:
            DataFrame with exactly the 38 cardset columns; columns absent
            from the scrape are filled with empty strings.
        """
        logger.info("Mapping data to expected format")

        # Baseball Reference column -> cardset column. Everything except
        # Player and PlayerID passes through under the same name.
        # Based on 2024 Season Cardset structure and actual Baseball Reference columns found:
        # ['Rk', 'Player', 'Age', 'Team', 'Lg', 'WAR', 'W', 'L', 'W-L%', 'ERA', 'G', 'GS', 'GF', 'CG', 'SHO', 'SV',
        # 'IP', 'H', 'R', 'ER', 'HR', 'BB', 'IBB', 'SO', 'HBP', 'BK', 'WP', 'BF', 'ERA+', 'FIP', 'WHIP', 'H9', 'HR9', 'BB9', 'SO9', 'SO/BB', 'Awards']
        column_mapping = {
            "Rk": "Rk",
            "Player": "Name",
            "Age": "Age",
            "Team": "Team",
            "Lg": "Lg",
            "WAR": "WAR",
            "W": "W",
            "L": "L",
            "W-L%": "W-L%",
            "ERA": "ERA",
            "G": "G",
            "GS": "GS",
            "GF": "GF",
            "CG": "CG",
            "SHO": "SHO",
            "SV": "SV",
            "IP": "IP",
            "H": "H",
            "R": "R",
            "ER": "ER",
            "HR": "HR",
            "BB": "BB",
            "IBB": "IBB",
            "SO": "SO",
            "HBP": "HBP",
            "BK": "BK",
            "WP": "WP",
            "BF": "BF",
            "ERA+": "ERA+",
            "FIP": "FIP",
            "WHIP": "WHIP",
            "H9": "H9",
            "HR9": "HR9",
            "BB9": "BB9",
            "SO9": "SO9",
            "SO/BB": "SO/BB",
            "Awards": "Awards",
            "PlayerID": "Name-additional",  # Map extracted player ID to Name-additional
        }

        # Expected columns in output (based on 2024 Season Cardset/pitching.csv) - 38 total columns
        expected_columns = [
            "Rk",
            "Name",
            "Age",
            "Team",
            "Lg",
            "WAR",
            "W",
            "L",
            "W-L%",
            "ERA",
            "G",
            "GS",
            "GF",
            "CG",
            "SHO",
            "SV",
            "IP",
            "H",
            "R",
            "ER",
            "HR",
            "BB",
            "IBB",
            "SO",
            "HBP",
            "BK",
            "WP",
            "BF",
            "ERA+",
            "FIP",
            "WHIP",
            "H9",
            "HR9",
            "BB9",
            "SO9",
            "SO/BB",
            "Awards",
            "Name-additional",
        ]

        if not raw_data:
            logger.warning("No data to process")
            return pl.DataFrame(schema={col: pl.Utf8 for col in expected_columns})

        df = pl.DataFrame(raw_data)
        logger.info(
            "Created DataFrame with %d rows and %d columns", df.height, df.width
        )

        # Fix: invert the mapping once instead of re-scanning the whole dict
        # for every output column (the previous reverse lookup was quadratic).
        source_column = {ours: theirs for theirs, ours in column_mapping.items()}

        output_data = {}
        for col in expected_columns:
            br_col = source_column.get(col)
            if br_col and br_col in df.columns:
                # Map from scraped data using the Baseball Reference column name.
                output_data[col] = df[br_col].to_list()
            else:
                # Column not available in scraped data; keep the schema stable
                # by filling with empty strings.
                logger.warning(
                    "Column '%s' not found in scraped data, filling with empty values",
                    col,
                )
                output_data[col] = [""] * len(raw_data)

        result_df = pl.DataFrame(output_data)
        logger.info(
            "Mapped to expected format: %d rows, %d columns",
            result_df.height,
            result_df.width,
        )
        return result_df

    def save_to_csv(self, df: pl.DataFrame, cardset_name: str) -> str:
        """Save the DataFrame to CSV in the appropriate directory.

        Args:
            df: Pitching stats already in the cardset schema.
            cardset_name: Directory name under data-input/ (e.g. "2025 Live Cardset").

        Returns:
            The written file path as a string.
        """
        # Resolve the project root so output lands in <root>/data-input no
        # matter which directory the script was launched from.
        current_path = Path.cwd()
        if current_path.name == "pull-pitching-stats":
            # Running from the plans subdirectory: climb up to the project root.
            project_root = current_path.parent.parent.parent
        else:
            # Otherwise assume we are already at the project root.
            project_root = current_path

        output_dir = project_root / "data-input" / cardset_name
        output_dir.mkdir(parents=True, exist_ok=True)
        output_path = output_dir / "pitching.csv"

        logger.info("Saving to: %s", output_path)
        df.write_csv(output_path)
        logger.info("Successfully saved %d records to %s", df.height, output_path)
        return str(output_path)

    def scrape_and_save(self, cardset_name: str) -> str:
        """Complete workflow: scrape, process, and save pitching stats.

        Args:
            cardset_name: Directory name under data-input/ for the output CSV.

        Returns:
            Path of the written pitching.csv as a string.
        """
        logger.info("Starting pitching stats scrape for year %s", self.year)
        soup = self.fetch_page()
        raw_data = self.extract_pitching_table(soup)
        df = self.map_to_expected_format(raw_data)
        output_path = self.save_to_csv(df, cardset_name)
        logger.info("Pitching stats scraping completed successfully")
        return output_path
def main():
    """Command-line entry point: parse arguments, run the scraper, exit 1 on failure."""
    arg_parser = argparse.ArgumentParser(
        description="Scrape Baseball Reference pitching statistics"
    )
    arg_parser.add_argument(
        "--year", type=int, required=True, help="Year to scrape (e.g., 2025)"
    )
    arg_parser.add_argument(
        "--cardset-name",
        type=str,
        required=True,
        help='Name of the cardset directory (e.g., "2025 Live Cardset")',
    )
    arg_parser.add_argument(
        "--verbose", action="store_true", help="Enable verbose logging"
    )
    options = arg_parser.parse_args()

    # Verbose mode lowers the root logger threshold to DEBUG.
    if options.verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    try:
        scraper = PitchingStatsScraper(options.year)
        saved_path = scraper.scrape_and_save(options.cardset_name)
        print(f"Successfully saved pitching stats to: {saved_path}")
    except Exception as e:
        # Top-level boundary: log the failure and signal it via exit status.
        logger.error(f"Script failed: {e}")
        sys.exit(1)


if __name__ == "__main__":
    main()