# Standardize formatting with black and apply ruff auto-fixes.
# Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
# 141 lines, 3.7 KiB, Python
import time
from io import StringIO
from urllib.parse import urlencode

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
|
|
|
|
|
|
def get_fangraphs_splits_data():
    """Scrape eight FanGraphs splits leaderboards into DataFrames.

    Starts a Chrome session, loads the splits-leaderboard page once per
    configuration (each a ``position`` / ``splitArr`` / ``statgroup``
    combination layered on the shared base query parameters), and keeps the
    table with the most rows from each page.

    Returns:
        dict: configuration name (e.g. ``"Batting_vLHP_Standard"``) mapped to
        the DataFrame parsed from that page. A configuration whose page
        produced no parseable table is reported on stdout and left out.

    Raises:
        selenium.common.exceptions.TimeoutException: if no ``<table>``
            element appears within 10 seconds of loading a page.
    """
    # Query parameters shared by every request.
    base_params = {
        "autoPt": "false",
        "splitTeams": "false",
        "statType": "player",
        "startDate": "2025-03-01",
        "endDate": "2025-11-01",
        "players": "",
        "filter": "",
        "groupBy": "season",
        "sort": "22,1",
    }

    # (name, position, splitArr, statgroup) for each of the 8 leaderboards.
    configs = [
        ("Batting_vLHP_Standard", "B", "1", "1"),
        ("Batting_vLHP_BattedBalls", "B", "1", "3"),
        ("Batting_vRHP_Standard", "B", "2", "1"),
        ("Batting_vRHP_BattedBalls", "B", "2", "3"),
        ("Pitching_vLHH_Standard", "P", "5", "1"),
        ("Pitching_vLHH_BattedBalls", "P", "5", "3"),
        ("Pitching_vRHH_Standard", "P", "6", "1"),
        ("Pitching_vRHH_BattedBalls", "P", "6", "3"),
    ]

    all_data = {}
    driver = webdriver.Chrome()

    try:
        # One wait object serves every page load in this session.
        wait = WebDriverWait(driver, 10)

        for name, position, split_arr, statgroup in configs:
            print(f"Fetching {name}...")

            url_params = {
                **base_params,
                "position": position,
                "splitArr": split_arr,
                "statgroup": statgroup,
            }
            # urlencode escapes values properly; "," is kept literal so the
            # "sort" parameter is sent exactly as before ("22,1").
            query = urlencode(url_params, safe=",")
            url = f"https://www.fangraphs.com/leaders/splits-leaderboards?{query}"

            driver.get(url)

            # Block until a <table> element exists in the DOM.
            wait.until(EC.presence_of_element_located((By.TAG_NAME, "table")))

            # Additional pause so the table has time to populate with data.
            time.sleep(2)

            # read_html raises ValueError when it finds no tables, and newer
            # pandas deprecates passing raw HTML text, hence the StringIO.
            try:
                tables = pd.read_html(StringIO(driver.page_source))
            except ValueError:
                tables = []

            if not tables:
                print(" ✗ No tables found")
                continue

            # The main data table is typically the largest one on the page.
            df = max(tables, key=len)
            all_data[name] = df
            print(f" ✓ Got {len(df)} rows")

        return all_data

    finally:
        # Always close the browser, even if a page load or parse failed.
        driver.quit()
|
|
|
|
|
|
# Usage
|
|
if __name__ == "__main__":
    data = get_fangraphs_splits_data()

    # Write one CSV per configuration that returned data.
    for name, df in data.items():
        df.to_csv(f"{name}.csv", index=False)
        print(f"Saved {name}.csv")

    # The DataFrames can also be used directly; preview one as an example.
    # Guarded with .get() so a missing key does not raise KeyError after the
    # CSV files have already been written.
    preview = data.get("Batting_vLHP_Standard")
    if preview is not None:
        print("\nBatting vs LHP (Standard) - First 5 rows:")
        print(preview.head())
|