paper-dynasty-card-creation/scripts/fangraphs_scrape.py
Cal Corum 923edd0eeb Update 2005 Live cardset through mid-August (73% season)
790 players (397 batters, 393 pitchers) processed from Retrosheet data
through 2005-08-15 with 0.728 season percentage. Includes updated scouting
reports, card deltas, and FanGraphs scrape script.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-15 18:56:19 -06:00

96 lines
3.2 KiB
Python

import time
from io import StringIO
from urllib.parse import urlencode

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
def get_fangraphs_splits_data():
    """
    Scrape all 8 chunks of baseball splits data from FanGraphs.

    Drives a Chrome browser to the FanGraphs splits leaderboards once per
    (position, split, stat-group) configuration — batting vs LHP/RHP and
    pitching vs LHH/RHH, each in Standard and Batted-Balls flavors — and
    extracts the main data table from each page with pandas.

    Returns:
        dict: mapping of configuration name (e.g. 'Batting_vLHP_Standard')
        to the scraped pandas DataFrame. Configurations whose page yields
        no table are simply omitted from the result.
    """
    # NOTE(review): requires chromedriver on PATH; this function is pure
    # browser I/O and has no offline mode.
    driver = webdriver.Chrome()

    # Query-string parameters shared by every request.
    base_params = {
        'autoPt': 'false',
        'splitTeams': 'false',
        'statType': 'player',
        'startDate': '2025-03-01',
        'endDate': '2025-11-01',
        'players': '',
        'filter': '',
        'groupBy': 'season',
        'sort': '22,1'
    }

    # The 8 (position, splitArr, statgroup) combinations to fetch.
    configs = [
        {'name': 'Batting_vLHP_Standard', 'position': 'B', 'splitArr': '1', 'statgroup': '1'},
        {'name': 'Batting_vLHP_BattedBalls', 'position': 'B', 'splitArr': '1', 'statgroup': '3'},
        {'name': 'Batting_vRHP_Standard', 'position': 'B', 'splitArr': '2', 'statgroup': '1'},
        {'name': 'Batting_vRHP_BattedBalls', 'position': 'B', 'splitArr': '2', 'statgroup': '3'},
        {'name': 'Pitching_vLHH_Standard', 'position': 'P', 'splitArr': '5', 'statgroup': '1'},
        {'name': 'Pitching_vLHH_BattedBalls', 'position': 'P', 'splitArr': '5', 'statgroup': '3'},
        {'name': 'Pitching_vRHH_Standard', 'position': 'P', 'splitArr': '6', 'statgroup': '1'},
        {'name': 'Pitching_vRHH_BattedBalls', 'position': 'P', 'splitArr': '6', 'statgroup': '3'},
    ]

    all_data = {}
    try:
        for config in configs:
            print(f"Fetching {config['name']}...")

            url_params = {**base_params,
                          'position': config['position'],
                          'splitArr': config['splitArr'],
                          'statgroup': config['statgroup']}
            # urlencode percent-escapes values such as '22,1' instead of
            # pasting them into the URL raw.
            url = ("https://www.fangraphs.com/leaders/splits-leaderboards?"
                   + urlencode(url_params))
            driver.get(url)

            # Wait (up to 10 s) for a table element to exist, then give the
            # page's JS an extra moment to populate the rows.
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.TAG_NAME, "table")))
            time.sleep(2)

            # Wrap in StringIO: passing a raw HTML string to read_html is
            # deprecated since pandas 2.1.
            tables = pd.read_html(StringIO(driver.page_source))
            if tables:
                # The main data table is typically the largest one.
                df = max(tables, key=len)
                all_data[config['name']] = df
                print(f" ✓ Got {len(df)} rows")
            else:
                print(" ✗ No tables found")
        return all_data
    finally:
        # Always shut the browser down, even on an exception mid-loop.
        driver.quit()
# Usage
# Usage
if __name__ == "__main__":
    data = get_fangraphs_splits_data()

    # Persist each split to its own CSV file, keyed by configuration name.
    for name, df in data.items():
        df.to_csv(f"{name}.csv", index=False)
        print(f"Saved {name}.csv")

    # Example: inspect the batting-vs-LHP standard split.
    # Fixed: "\\n" printed a literal backslash-n; "\n" is a real newline.
    print("\nBatting vs LHP (Standard) - First 5 rows:")
    print(data['Batting_vLHP_Standard'].head())