790 players (397 batters, 393 pitchers) processed from Retrosheet data through 2005-08-15 with 0.728 season percentage. Includes updated scouting reports, card deltas, and FanGraphs scrape script. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
96 lines
3.2 KiB
Python
from selenium import webdriver
|
|
from selenium.webdriver.common.by import By
|
|
from selenium.webdriver.support.ui import WebDriverWait
|
|
from selenium.webdriver.support import expected_conditions as EC
|
|
import pandas as pd
|
|
import time
|
|
|
|
def get_fangraphs_splits_data():
    """
    Extract all 8 chunks of baseball splits data from FanGraphs.

    Scrapes four batting splits (vs LHP / vs RHP, standard + batted-ball
    stat groups) and four pitching splits (vs LHH / vs RHH) from the
    FanGraphs splits leaderboards using a Selenium-driven Chrome browser.

    Returns:
        dict[str, pandas.DataFrame]: mapping of configuration name
        (e.g. 'Batting_vLHP_Standard') to the scraped leaderboard table.
        A configuration whose page yields no tables is omitted.
    """
    # Local imports keep this edit self-contained within the function.
    from io import StringIO
    from urllib.parse import urlencode

    # Initialize Chrome driver (requires chromedriver on PATH).
    driver = webdriver.Chrome()

    # Query parameters shared by all 8 leaderboard requests.
    base_params = {
        'autoPt': 'false',
        'splitTeams': 'false',
        'statType': 'player',
        'startDate': '2025-03-01',
        'endDate': '2025-11-01',
        'players': '',
        'filter': '',
        'groupBy': 'season',
        'sort': '22,1',  # FanGraphs-encoded sort column/direction
    }

    # The 8 configurations: batter/pitcher x handedness split x stat group.
    configs = [
        {'name': 'Batting_vLHP_Standard', 'position': 'B', 'splitArr': '1', 'statgroup': '1'},
        {'name': 'Batting_vLHP_BattedBalls', 'position': 'B', 'splitArr': '1', 'statgroup': '3'},
        {'name': 'Batting_vRHP_Standard', 'position': 'B', 'splitArr': '2', 'statgroup': '1'},
        {'name': 'Batting_vRHP_BattedBalls', 'position': 'B', 'splitArr': '2', 'statgroup': '3'},
        {'name': 'Pitching_vLHH_Standard', 'position': 'P', 'splitArr': '5', 'statgroup': '1'},
        {'name': 'Pitching_vLHH_BattedBalls', 'position': 'P', 'splitArr': '5', 'statgroup': '3'},
        {'name': 'Pitching_vRHH_Standard', 'position': 'P', 'splitArr': '6', 'statgroup': '1'},
        {'name': 'Pitching_vRHH_BattedBalls', 'position': 'P', 'splitArr': '6', 'statgroup': '3'},
    ]

    all_data = {}

    try:
        for config in configs:
            print(f"Fetching {config['name']}...")

            # Build URL. urlencode percent-escapes values (e.g. '22,1')
            # rather than naively joining raw strings.
            url_params = {**base_params,
                          'position': config['position'],
                          'splitArr': config['splitArr'],
                          'statgroup': config['statgroup']}

            url = f"https://www.fangraphs.com/leaders/splits-leaderboards?{urlencode(url_params)}"

            # Navigate to URL
            driver.get(url)

            # Wait up to 10s for a table element to appear in the DOM.
            wait = WebDriverWait(driver, 10)
            wait.until(EC.presence_of_element_located((By.TAG_NAME, "table")))

            # Additional wait for the JS-rendered data to populate.
            time.sleep(2)

            # Extract tables with pandas. Wrap the HTML in StringIO:
            # passing a literal HTML string to read_html is deprecated
            # as of pandas 2.1.
            tables = pd.read_html(StringIO(driver.page_source))

            if tables:
                # The main data table is typically the largest one.
                df = max(tables, key=len)
                all_data[config['name']] = df
                print(f" ✓ Got {len(df)} rows")
            else:
                print(" ✗ No tables found")

        return all_data

    finally:
        # Always release the browser, even if a scrape raised.
        driver.quit()
|
|
|
|
|
|
# Usage
if __name__ == "__main__":
    data = get_fangraphs_splits_data()

    # Save each split's dataframe to its own CSV file.
    for name, df in data.items():
        df.to_csv(f"{name}.csv", index=False)
        print(f"Saved {name}.csv")

    # Example: preview the batting-vs-LHP standard split.
    # Guarded so a failed scrape of that config doesn't raise KeyError.
    # Bug fix: original used "\\n", printing a literal backslash-n
    # instead of a newline.
    if 'Batting_vLHP_Standard' in data:
        print("\nBatting vs LHP (Standard) - First 5 rows:")
        print(data['Batting_vLHP_Standard'].head())
|