paper-dynasty-card-creation/scripts/fangraphs_scrape.py
Cal Corum 923edd0eeb Update 2005 Live cardset through mid-August (73% season)
790 players (397 batters, 393 pitchers) processed from Retrosheet data
through 2005-08-15 with 0.728 season percentage. Includes updated scouting
reports, card deltas, and FanGraphs scrape script.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-15 18:56:19 -06:00

96 lines
3.2 KiB
Python

import time
from io import StringIO
from urllib.parse import urlencode

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
def get_fangraphs_splits_data():
    """
    Scrape all 8 chunks of baseball splits data from FanGraphs.

    Drives a Chrome browser to the FanGraphs splits leaderboards once per
    (position, split, stat-group) configuration — batting vs LHP/RHP and
    pitching vs LHH/RHH, each in Standard and Batted-Balls flavors — and
    extracts the main data table from each page with pandas.

    Returns:
        dict: mapping of configuration name (e.g. 'Batting_vLHP_Standard')
        to the scraped pandas DataFrame. Configurations whose page yields
        no table are simply omitted from the result.
    """
    # NOTE(review): requires chromedriver on PATH; this function is pure
    # browser I/O and has no offline mode.
    driver = webdriver.Chrome()

    # Query-string parameters shared by every request.
    base_params = {
        'autoPt': 'false',
        'splitTeams': 'false',
        'statType': 'player',
        'startDate': '2025-03-01',
        'endDate': '2025-11-01',
        'players': '',
        'filter': '',
        'groupBy': 'season',
        'sort': '22,1'
    }

    # The 8 (position, splitArr, statgroup) combinations to fetch.
    configs = [
        {'name': 'Batting_vLHP_Standard', 'position': 'B', 'splitArr': '1', 'statgroup': '1'},
        {'name': 'Batting_vLHP_BattedBalls', 'position': 'B', 'splitArr': '1', 'statgroup': '3'},
        {'name': 'Batting_vRHP_Standard', 'position': 'B', 'splitArr': '2', 'statgroup': '1'},
        {'name': 'Batting_vRHP_BattedBalls', 'position': 'B', 'splitArr': '2', 'statgroup': '3'},
        {'name': 'Pitching_vLHH_Standard', 'position': 'P', 'splitArr': '5', 'statgroup': '1'},
        {'name': 'Pitching_vLHH_BattedBalls', 'position': 'P', 'splitArr': '5', 'statgroup': '3'},
        {'name': 'Pitching_vRHH_Standard', 'position': 'P', 'splitArr': '6', 'statgroup': '1'},
        {'name': 'Pitching_vRHH_BattedBalls', 'position': 'P', 'splitArr': '6', 'statgroup': '3'},
    ]

    all_data = {}
    try:
        for config in configs:
            print(f"Fetching {config['name']}...")

            url_params = {**base_params,
                          'position': config['position'],
                          'splitArr': config['splitArr'],
                          'statgroup': config['statgroup']}
            # urlencode percent-escapes values such as '22,1' instead of
            # pasting them into the URL raw.
            url = ("https://www.fangraphs.com/leaders/splits-leaderboards?"
                   + urlencode(url_params))
            driver.get(url)

            # Wait (up to 10 s) for a table element to exist, then give the
            # page's JS an extra moment to populate the rows.
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.TAG_NAME, "table")))
            time.sleep(2)

            # Wrap in StringIO: passing a raw HTML string to read_html is
            # deprecated since pandas 2.1.
            tables = pd.read_html(StringIO(driver.page_source))
            if tables:
                # The main data table is typically the largest one.
                df = max(tables, key=len)
                all_data[config['name']] = df
                print(f" ✓ Got {len(df)} rows")
            else:
                print(" ✗ No tables found")
        return all_data
    finally:
        # Always shut the browser down, even on an exception mid-loop.
        driver.quit()
# Usage
# Usage
if __name__ == "__main__":
    data = get_fangraphs_splits_data()

    # Persist each split to its own CSV file, keyed by configuration name.
    for name, df in data.items():
        df.to_csv(f"{name}.csv", index=False)
        print(f"Saved {name}.csv")

    # Example: inspect the batting-vs-LHP standard split.
    # Fixed: "\\n" printed a literal backslash-n; "\n" is a real newline.
    print("\nBatting vs LHP (Standard) - First 5 rows:")
    print(data['Batting_vLHP_Standard'].head())