#!/usr/bin/env python3
"""
Baseball Reference Pitching Stats Scraper

This script scrapes the Player Standard Pitching table from Baseball Reference
and saves it as a CSV file in the specified cardset directory.

Usage:
    python pull_pitching_stats.py --year 2025 --cardset-name "2025 Live Cardset"
"""

import argparse
import logging
import sys
from pathlib import Path
from typing import Dict, List

import polars as pl
import requests
from bs4 import BeautifulSoup

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


class PitchingStatsScraper:
    """Scraper for Baseball Reference pitching statistics."""

    def __init__(self, year: int):
        """Initialize the scraper with the target year."""
        self.year = year
        self.base_url = "https://www.baseball-reference.com"
        self.pitching_url = f"{self.base_url}/leagues/majors/{year}-standard-pitching.shtml"

        # HTTP session for requests
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })

    def fetch_page(self) -> BeautifulSoup:
        """Fetch the Baseball Reference pitching stats page."""
        logger.info(f"Fetching pitching stats from: {self.pitching_url}")

        try:
            response = self.session.get(self.pitching_url, timeout=30)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')
            logger.info("Successfully fetched and parsed the page")
            return soup

        except requests.RequestException as e:
            logger.error(f"Failed to fetch page: {e}")
            raise

    def extract_pitching_table(self, soup: BeautifulSoup) -> List[Dict[str, str]]:
        """Extract the players_standard_pitching table from the page."""
        logger.info("Extracting players_standard_pitching table")

        # Find the table by ID
        table = soup.find('table', {'id': 'players_standard_pitching'})
        if not table:
            raise ValueError("Could not find table with ID 'players_standard_pitching'")

        # Extract headers
        thead = table.find('thead')
        if not thead:
            raise ValueError("Could not find table header section")
        header_row = thead.find('tr')
        if not header_row:
            raise ValueError("Could not find table headers")

        headers = []
        for th in header_row.find_all(['th', 'td']):
            header_text = th.get_text(strip=True)
            if header_text:
                headers.append(header_text)

        logger.info(f"Found {len(headers)} columns: {headers}")

        # Extract data rows
        tbody = table.find('tbody')
        if not tbody:
            raise ValueError("Could not find table body")

        data_rows = []
        for row in tbody.find_all('tr'):
            # Skip repeated header rows that appear inside tbody (marked with
            # class "thead" on the row or its first header cell)
            first_th = row.find('th')
            if 'thead' in (row.get('class') or []) or (
                first_th and 'thead' in (first_th.get('class') or [])
            ):
                continue

            row_data = {}
            player_id = ""
            cells = row.find_all(['td', 'th'])

            for i, cell in enumerate(cells):
                if i < len(headers):
                    cell_text = cell.get_text(strip=True)
                    row_data[headers[i]] = cell_text

                    # Extract player ID from the Player column (usually index 1)
                    if headers[i] == 'Player':
                        # Look for a link in this cell
                        link = cell.find('a')
                        if link and link.get('href'):
                            href = link.get('href')
                            # Hrefs look like "/players/c/crocega01.shtml";
                            # the player ID is the filename without the .shtml suffix
                            if '/players/' in href:
                                filename = href.split('/')[-1]
                                if filename.endswith('.shtml'):
                                    player_id = filename[:-6]  # Remove .shtml

            # Add player ID to row data
            row_data['PlayerID'] = player_id

            # Only add rows that have player data (name in first column)
            if row_data and row_data.get(headers[0], '').strip():
                data_rows.append(row_data)

        logger.info(f"Extracted {len(data_rows)} player records")
        return data_rows

    def map_to_expected_format(self, raw_data: List[Dict[str, str]]) -> pl.DataFrame:
        """Map the scraped data to the expected CSV format."""
        logger.info("Mapping data to expected format")

        # Define the expected column mapping from Baseball Reference to our format
        # Based on 2024 Season Cardset structure and actual Baseball Reference columns found:
        # ['Rk', 'Player', 'Age', 'Team', 'Lg', 'WAR', 'W', 'L', 'W-L%', 'ERA', 'G', 'GS', 'GF', 'CG', 'SHO', 'SV',
        #  'IP', 'H', 'R', 'ER', 'HR', 'BB', 'IBB', 'SO', 'HBP', 'BK', 'WP', 'BF', 'ERA+', 'FIP', 'WHIP', 'H9', 'HR9', 'BB9', 'SO9', 'SO/BB', 'Awards']
        column_mapping = {
            'Rk': 'Rk',
            'Player': 'Name',
            'Age': 'Age',
            'Team': 'Team',
            'Lg': 'Lg',
            'WAR': 'WAR',
            'W': 'W',
            'L': 'L',
            'W-L%': 'W-L%',
            'ERA': 'ERA',
            'G': 'G',
            'GS': 'GS',
            'GF': 'GF',
            'CG': 'CG',
            'SHO': 'SHO',
            'SV': 'SV',
            'IP': 'IP',
            'H': 'H',
            'R': 'R',
            'ER': 'ER',
            'HR': 'HR',
            'BB': 'BB',
            'IBB': 'IBB',
            'SO': 'SO',
            'HBP': 'HBP',
            'BK': 'BK',
            'WP': 'WP',
            'BF': 'BF',
            'ERA+': 'ERA+',
            'FIP': 'FIP',
            'WHIP': 'WHIP',
            'H9': 'H9',
            'HR9': 'HR9',
            'BB9': 'BB9',
            'SO9': 'SO9',
            'SO/BB': 'SO/BB',
            'Awards': 'Awards',
            'PlayerID': 'Name-additional'  # Map extracted player ID to Name-additional
        }

        # Expected columns in output (based on 2024 Season Cardset/pitching.csv) - 38 total columns
        expected_columns = [
            'Rk', 'Name', 'Age', 'Team', 'Lg', 'WAR', 'W', 'L', 'W-L%', 'ERA',
            'G', 'GS', 'GF', 'CG', 'SHO', 'SV', 'IP', 'H', 'R', 'ER',
            'HR', 'BB', 'IBB', 'SO', 'HBP', 'BK', 'WP', 'BF', 'ERA+', 'FIP',
            'WHIP', 'H9', 'HR9', 'BB9', 'SO9', 'SO/BB', 'Awards', 'Name-additional'
        ]

        # Convert raw data to DataFrame
        if not raw_data:
            logger.warning("No data to process")
            return pl.DataFrame(schema={col: pl.Utf8 for col in expected_columns})

        df = pl.DataFrame(raw_data)
        logger.info(f"Created DataFrame with {df.height} rows and {df.width} columns")

        # Create output DataFrame with expected structure. Invert the mapping once
        # so each expected output column can be looked up directly.
        reverse_mapping = {our_col: br_col for br_col, our_col in column_mapping.items()}

        output_data = {}
        for col in expected_columns:
            br_col = reverse_mapping.get(col)
            if br_col and br_col in df.columns:
                # Map from scraped data using the Baseball Reference column name
                output_data[col] = df[br_col].to_list()
            else:
                # Column not available in scraped data, fill with empty strings
                logger.warning(f"Column '{col}' not found in scraped data, filling with empty values")
                output_data[col] = [''] * len(raw_data)

        result_df = pl.DataFrame(output_data)
        logger.info(f"Mapped to expected format: {result_df.height} rows, {result_df.width} columns")

        return result_df

    def save_to_csv(self, df: pl.DataFrame, cardset_name: str) -> str:
        """Save the DataFrame to CSV in the appropriate directory."""
        # Create output directory path - use absolute path to ensure it goes to the right place
        # Find the project root (where data-input directory should be)
        current_path = Path.cwd()
        if current_path.name == "pull-pitching-stats":
            # We're running from the plans subdirectory, go up to the project root
            project_root = current_path.parent.parent.parent
        else:
            # We're already in the project root
            project_root = current_path

        output_dir = project_root / "data-input" / cardset_name
        output_dir.mkdir(parents=True, exist_ok=True)

        output_path = output_dir / "pitching.csv"

        logger.info(f"Saving to: {output_path}")

        # Save to CSV
        df.write_csv(output_path)

        logger.info(f"Successfully saved {df.height} records to {output_path}")
        return str(output_path)

    def scrape_and_save(self, cardset_name: str) -> str:
        """Complete workflow: scrape, process, and save pitching stats."""
        logger.info(f"Starting pitching stats scrape for year {self.year}")

        # Fetch the page
        soup = self.fetch_page()

        # Extract the table data
        raw_data = self.extract_pitching_table(soup)

        # Map to expected format
        df = self.map_to_expected_format(raw_data)

        # Save to CSV
        output_path = self.save_to_csv(df, cardset_name)

        logger.info("Pitching stats scraping completed successfully")
        return output_path


def main():
    """Main entry point for the script."""
    parser = argparse.ArgumentParser(
        description="Scrape Baseball Reference pitching statistics"
    )
    parser.add_argument(
        '--year',
        type=int,
        required=True,
        help='Year to scrape (e.g., 2025)'
    )
    parser.add_argument(
        '--cardset-name',
        type=str,
        required=True,
        help='Name of the cardset directory (e.g., "2025 Live Cardset")'
    )
    parser.add_argument(
        '--verbose',
        action='store_true',
        help='Enable verbose logging'
    )

    args = parser.parse_args()

    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    try:
        scraper = PitchingStatsScraper(args.year)
        output_path = scraper.scrape_and_save(args.cardset_name)
        print(f"Successfully saved pitching stats to: {output_path}")

    except Exception as e:
        logger.error(f"Script failed: {e}")
        sys.exit(1)


if __name__ == '__main__':
    main()
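
# Example invocation and a quick sanity check of the output (a sketch only;
# the cardset name and resulting file path below are illustrative):
#
#   python pull_pitching_stats.py --year 2025 --cardset-name "2025 Live Cardset" --verbose
#
# The written CSV can then be spot-checked with polars, e.g.:
#
#   import polars as pl
#   df = pl.read_csv("data-input/2025 Live Cardset/pitching.csv")
#   print(df.select(["Name", "ERA", "WHIP", "Name-additional"]).head())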