paper-dynasty-card-creation/scripts/fangraphs_scrape.py
Cal Corum 0a17745389 Run black and ruff across entire codebase
Standardize formatting with black and apply ruff auto-fixes.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-08 14:24:33 -05:00

141 lines
3.7 KiB
Python

import time
from io import StringIO

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
def get_fangraphs_splits_data():
    """Scrape eight FanGraphs splits leaderboards and return them as DataFrames.

    Fetches batting (vs LHP / vs RHP) and pitching (vs LHH / vs RHH) tables,
    each in both the "Standard" and "Batted Balls" stat groups, for the
    2025-03-01..2025-11-01 window.

    Returns:
        dict[str, pandas.DataFrame]: mapping of config name (e.g.
        "Batting_vLHP_Standard") to the scraped leaderboard table. Configs
        whose page produced no table are omitted from the result.

    Note:
        Requires a local Chrome/chromedriver install; the browser is always
        closed on exit via the try/finally.
    """
    # Initialize Chrome driver
    driver = webdriver.Chrome()

    # Query-string parameters shared by every request.
    base_params = {
        "autoPt": "false",
        "splitTeams": "false",
        "statType": "player",
        "startDate": "2025-03-01",
        "endDate": "2025-11-01",
        "players": "",
        "filter": "",
        "groupBy": "season",
        "sort": "22,1",
    }

    # The 8 leaderboard configurations: splitArr selects the handedness
    # split, statgroup selects Standard (1) vs Batted Balls (3).
    configs = [
        {
            "name": "Batting_vLHP_Standard",
            "position": "B",
            "splitArr": "1",
            "statgroup": "1",
        },
        {
            "name": "Batting_vLHP_BattedBalls",
            "position": "B",
            "splitArr": "1",
            "statgroup": "3",
        },
        {
            "name": "Batting_vRHP_Standard",
            "position": "B",
            "splitArr": "2",
            "statgroup": "1",
        },
        {
            "name": "Batting_vRHP_BattedBalls",
            "position": "B",
            "splitArr": "2",
            "statgroup": "3",
        },
        {
            "name": "Pitching_vLHH_Standard",
            "position": "P",
            "splitArr": "5",
            "statgroup": "1",
        },
        {
            "name": "Pitching_vLHH_BattedBalls",
            "position": "P",
            "splitArr": "5",
            "statgroup": "3",
        },
        {
            "name": "Pitching_vRHH_Standard",
            "position": "P",
            "splitArr": "6",
            "statgroup": "1",
        },
        {
            "name": "Pitching_vRHH_BattedBalls",
            "position": "P",
            "splitArr": "6",
            "statgroup": "3",
        },
    ]

    all_data = {}
    try:
        for config in configs:
            print(f"Fetching {config['name']}...")

            # Build the full query string for this configuration.
            # NOTE(review): values are not URL-encoded; all current params
            # are safe literals, but switch to urllib.parse.urlencode if
            # any value could contain reserved characters.
            url_params = {
                **base_params,
                "position": config["position"],
                "splitArr": config["splitArr"],
                "statgroup": config["statgroup"],
            }
            param_string = "&".join([f"{k}={v}" for k, v in url_params.items()])
            url = (
                f"https://www.fangraphs.com/leaders/splits-leaderboards?{param_string}"
            )

            driver.get(url)

            # Block (up to 10s) until at least one <table> exists in the DOM.
            # The element itself isn't needed, only the wait side effect.
            wait = WebDriverWait(driver, 10)
            wait.until(EC.presence_of_element_located((By.TAG_NAME, "table")))
            # The table element appears before its rows are filled in; give
            # the client-side JS a moment to populate the data.
            time.sleep(2)

            # Wrap in StringIO: passing a literal HTML string to read_html
            # directly is deprecated in pandas.
            tables = pd.read_html(StringIO(driver.page_source))
            if tables:
                # The page contains several small layout tables; the actual
                # leaderboard is the one with the most rows.
                df = max(tables, key=len)
                all_data[config["name"]] = df
                print(f" ✓ Got {len(df)} rows")
            else:
                print(" ✗ No tables found")
        return all_data
    finally:
        # Always release the browser, even on timeout/parse errors.
        driver.quit()
# Usage
if __name__ == "__main__":
    data = get_fangraphs_splits_data()

    # Persist each leaderboard to its own CSV, named after the config.
    for name, df in data.items():
        df.to_csv(f"{name}.csv", index=False)
        print(f"Saved {name}.csv")

    # Example: peek at the batting-vs-LHP standard table. Guarded with
    # .get() because the scraper omits configs whose page had no table,
    # so unconditional indexing could raise KeyError.
    # (Bug fix: the original printed a literal "\n" via "\\n".)
    sample = data.get("Batting_vLHP_Standard")
    if sample is not None:
        print("\nBatting vs LHP (Standard) - First 5 rows:")
        print(sample.head())