paper-dynasty-card-creation/scripts/fangraphs_scrape.py
Cal Corum 0a17745389 Run black and ruff across entire codebase
Standardize formatting with black and apply ruff auto-fixes.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-08 14:24:33 -05:00

141 lines
3.7 KiB
Python

import time
from io import StringIO

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
def get_fangraphs_splits_data():
    """Scrape eight FanGraphs splits leaderboards and return them as DataFrames.

    Fetches batting (vs LHP / vs RHP) and pitching (vs LHH / vs RHH) tables,
    each in both the "Standard" and "Batted Balls" stat groups, for the
    2025-03-01..2025-11-01 window.

    Returns:
        dict[str, pandas.DataFrame]: mapping of config name (e.g.
        "Batting_vLHP_Standard") to the scraped leaderboard table. Configs
        whose page produced no table are omitted from the result.

    Note:
        Requires a local Chrome/chromedriver install; the browser is always
        closed on exit via the try/finally.
    """
    # Initialize Chrome driver
    driver = webdriver.Chrome()

    # Query-string parameters shared by every request.
    base_params = {
        "autoPt": "false",
        "splitTeams": "false",
        "statType": "player",
        "startDate": "2025-03-01",
        "endDate": "2025-11-01",
        "players": "",
        "filter": "",
        "groupBy": "season",
        "sort": "22,1",
    }

    # The 8 leaderboard configurations: splitArr selects the handedness
    # split, statgroup selects Standard (1) vs Batted Balls (3).
    configs = [
        {
            "name": "Batting_vLHP_Standard",
            "position": "B",
            "splitArr": "1",
            "statgroup": "1",
        },
        {
            "name": "Batting_vLHP_BattedBalls",
            "position": "B",
            "splitArr": "1",
            "statgroup": "3",
        },
        {
            "name": "Batting_vRHP_Standard",
            "position": "B",
            "splitArr": "2",
            "statgroup": "1",
        },
        {
            "name": "Batting_vRHP_BattedBalls",
            "position": "B",
            "splitArr": "2",
            "statgroup": "3",
        },
        {
            "name": "Pitching_vLHH_Standard",
            "position": "P",
            "splitArr": "5",
            "statgroup": "1",
        },
        {
            "name": "Pitching_vLHH_BattedBalls",
            "position": "P",
            "splitArr": "5",
            "statgroup": "3",
        },
        {
            "name": "Pitching_vRHH_Standard",
            "position": "P",
            "splitArr": "6",
            "statgroup": "1",
        },
        {
            "name": "Pitching_vRHH_BattedBalls",
            "position": "P",
            "splitArr": "6",
            "statgroup": "3",
        },
    ]

    all_data = {}
    try:
        for config in configs:
            print(f"Fetching {config['name']}...")

            # Build the full query string for this configuration.
            # NOTE(review): values are not URL-encoded; all current params
            # are safe literals, but switch to urllib.parse.urlencode if
            # any value could contain reserved characters.
            url_params = {
                **base_params,
                "position": config["position"],
                "splitArr": config["splitArr"],
                "statgroup": config["statgroup"],
            }
            param_string = "&".join([f"{k}={v}" for k, v in url_params.items()])
            url = (
                f"https://www.fangraphs.com/leaders/splits-leaderboards?{param_string}"
            )

            driver.get(url)

            # Block (up to 10s) until at least one <table> exists in the DOM.
            # The element itself isn't needed, only the wait side effect.
            wait = WebDriverWait(driver, 10)
            wait.until(EC.presence_of_element_located((By.TAG_NAME, "table")))
            # The table element appears before its rows are filled in; give
            # the client-side JS a moment to populate the data.
            time.sleep(2)

            # Wrap in StringIO: passing a literal HTML string to read_html
            # directly is deprecated in pandas.
            tables = pd.read_html(StringIO(driver.page_source))
            if tables:
                # The page contains several small layout tables; the actual
                # leaderboard is the one with the most rows.
                df = max(tables, key=len)
                all_data[config["name"]] = df
                print(f" ✓ Got {len(df)} rows")
            else:
                print(" ✗ No tables found")
        return all_data
    finally:
        # Always release the browser, even on timeout/parse errors.
        driver.quit()
# Usage
if __name__ == "__main__":
    data = get_fangraphs_splits_data()

    # Persist each leaderboard to its own CSV, named after the config.
    for name, df in data.items():
        df.to_csv(f"{name}.csv", index=False)
        print(f"Saved {name}.csv")

    # Example: peek at the batting-vs-LHP standard table. Guarded with
    # .get() because the scraper omits configs whose page had no table,
    # so unconditional indexing could raise KeyError.
    # (Bug fix: the original printed a literal "\n" via "\\n".)
    sample = data.get("Batting_vLHP_Standard")
    if sample is not None:
        print("\nBatting vs LHP (Standard) - First 5 rows:")
        print(sample.head())