#!/usr/bin/env python3
"""
Baseball Reference Pitching Stats Scraper

This script scrapes the Player Standard Pitching table from Baseball Reference
and saves it as a CSV file in the specified cardset directory.

Usage:
    python pull_pitching_stats.py --year 2025 --cardset-name "2025 Live Cardset"
"""

import argparse
import logging
import sys
from pathlib import Path
from typing import Dict, List

import polars as pl
import requests
from bs4 import BeautifulSoup

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


class PitchingStatsScraper:
    """Scraper for Baseball Reference pitching statistics."""

    def __init__(self, year: int):
        """Initialize the scraper with the target year."""
        self.year = year
        self.base_url = "https://www.baseball-reference.com"
        self.pitching_url = f"{self.base_url}/leagues/majors/{year}-standard-pitching.shtml"
        # HTTP session for requests
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })

    def fetch_page(self) -> BeautifulSoup:
        """Fetch the Baseball Reference pitching stats page."""
        logger.info(f"Fetching pitching stats from: {self.pitching_url}")
        try:
            response = self.session.get(self.pitching_url, timeout=30)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            logger.info("Successfully fetched and parsed the page")
            return soup
        except requests.RequestException as e:
            logger.error(f"Failed to fetch page: {e}")
            raise
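
    # Note: Baseball Reference throttles heavy automated traffic, so a single
    # failed request is not unusual. If fetches start failing intermittently,
    # a simple retry with backoff around session.get is one option. A sketch
    # (the attempt count and delays are arbitrary choices, not values from
    # this project):
    #
    #   import time
    #   for attempt in range(3):
    #       try:
    #           response = self.session.get(self.pitching_url, timeout=30)
    #           response.raise_for_status()
    #           break
    #       except requests.RequestException:
    #           if attempt == 2:
    #               raise
    #           time.sleep(5 * (attempt + 1))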

    def extract_pitching_table(self, soup: BeautifulSoup) -> List[Dict[str, str]]:
        """Extract the players_standard_pitching table from the page."""
        logger.info("Extracting players_standard_pitching table")
        # Find the table by ID
        table = soup.find('table', {'id': 'players_standard_pitching'})
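        # Note: sports-reference pages sometimes embed tables inside HTML
        # comments, which a plain find() will not see. If the lookup above
        # ever starts returning None, a fallback along these lines (a sketch,
        # assuming the table keeps the same id inside the comment) can
        # recover it before giving up:
        #
        #   from bs4 import Comment
        #   for c in soup.find_all(string=lambda t: isinstance(t, Comment)):
        #       if 'players_standard_pitching' in c:
        #           table = BeautifulSoup(c, 'html.parser').find(
        #               'table', {'id': 'players_standard_pitching'})
        #           break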
        if not table:
            raise ValueError("Could not find table with ID 'players_standard_pitching'")

        # Extract headers
        header_row = table.find('thead').find('tr')
        if not header_row:
            raise ValueError("Could not find table headers")
        headers = []
        for th in header_row.find_all(['th', 'td']):
            header_text = th.get_text(strip=True)
            if header_text:
                headers.append(header_text)
        logger.info(f"Found {len(headers)} columns: {headers}")

        # Extract data rows
        tbody = table.find('tbody')
        if not tbody:
            raise ValueError("Could not find table body")
        data_rows = []
        for row in tbody.find_all('tr'):
            # Skip repeated header rows that appear inside tbody
            first_th = row.find('th')
            if first_th and 'thead' in (first_th.get('class') or []):
                continue
            row_data = {}
            player_id = ""
            cells = row.find_all(['td', 'th'])
            for i, cell in enumerate(cells):
                if i < len(headers):
                    cell_text = cell.get_text(strip=True)
                    row_data[headers[i]] = cell_text
                    # Extract the player ID from the link in the Player column
                    if headers[i] == 'Player':
                        link = cell.find('a')
                        if link and link.get('href'):
                            href = link.get('href')
                            # hrefs look like "/players/c/crocega01.shtml";
                            # the player ID is the filename with .shtml stripped
                            if '/players/' in href:
                                filename = href.split('/')[-1]
                                if filename.endswith('.shtml'):
                                    player_id = filename[:-6]
            # Add the extracted player ID to the row
            row_data['PlayerID'] = player_id
            # Keep only rows with data in the first column ('Rk'); this drops
            # blank separator rows
            if row_data and row_data.get(headers[0], '').strip():
                data_rows.append(row_data)
        logger.info(f"Extracted {len(data_rows)} player records")
        return data_rows
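
    # Each row returned above is a flat dict keyed by the scraped headers plus
    # 'PlayerID'. Illustrative shape (the player values here are made up):
    #
    #   {'Rk': '1', 'Player': 'Jane Doe', 'Age': '27', ..., 'PlayerID': 'doeja01'}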

    def map_to_expected_format(self, raw_data: List[Dict[str, str]]) -> pl.DataFrame:
        """Map the scraped data to the expected CSV format."""
        logger.info("Mapping data to expected format")
        # Column mapping from Baseball Reference names to our format, based on
        # the 2024 Season Cardset structure and the actual Baseball Reference
        # columns found:
        # ['Rk', 'Player', 'Age', 'Team', 'Lg', 'WAR', 'W', 'L', 'W-L%', 'ERA',
        #  'G', 'GS', 'GF', 'CG', 'SHO', 'SV', 'IP', 'H', 'R', 'ER', 'HR', 'BB',
        #  'IBB', 'SO', 'HBP', 'BK', 'WP', 'BF', 'ERA+', 'FIP', 'WHIP', 'H9',
        #  'HR9', 'BB9', 'SO9', 'SO/BB', 'Awards']
        # Most columns pass through unchanged; only 'Player' and the extracted
        # 'PlayerID' are renamed.
        column_mapping = {
            'Rk': 'Rk',
            'Player': 'Name',
            'Age': 'Age',
            'Team': 'Team',
            'Lg': 'Lg',
            'WAR': 'WAR',
            'W': 'W',
            'L': 'L',
            'W-L%': 'W-L%',
            'ERA': 'ERA',
            'G': 'G',
            'GS': 'GS',
            'GF': 'GF',
            'CG': 'CG',
            'SHO': 'SHO',
            'SV': 'SV',
            'IP': 'IP',
            'H': 'H',
            'R': 'R',
            'ER': 'ER',
            'HR': 'HR',
            'BB': 'BB',
            'IBB': 'IBB',
            'SO': 'SO',
            'HBP': 'HBP',
            'BK': 'BK',
            'WP': 'WP',
            'BF': 'BF',
            'ERA+': 'ERA+',
            'FIP': 'FIP',
            'WHIP': 'WHIP',
            'H9': 'H9',
            'HR9': 'HR9',
            'BB9': 'BB9',
            'SO9': 'SO9',
            'SO/BB': 'SO/BB',
            'Awards': 'Awards',
            'PlayerID': 'Name-additional',  # extracted player ID
        }

        # Expected columns in the output (based on
        # "2024 Season Cardset/pitching.csv"), 38 columns total
        expected_columns = [
            'Rk', 'Name', 'Age', 'Team', 'Lg', 'WAR', 'W', 'L', 'W-L%', 'ERA',
            'G', 'GS', 'GF', 'CG', 'SHO', 'SV', 'IP', 'H', 'R', 'ER',
            'HR', 'BB', 'IBB', 'SO', 'HBP', 'BK', 'WP', 'BF', 'ERA+', 'FIP',
            'WHIP', 'H9', 'HR9', 'BB9', 'SO9', 'SO/BB', 'Awards', 'Name-additional'
        ]

        # Convert raw data to a DataFrame
        if not raw_data:
            logger.warning("No data to process")
            return pl.DataFrame(schema={col: pl.Utf8 for col in expected_columns})
        df = pl.DataFrame(raw_data)
        logger.info(f"Created DataFrame with {df.height} rows and {df.width} columns")

        # Build the output column by column; invert column_mapping once to
        # look up the Baseball Reference name for each expected column
        reverse_mapping = {ours: br for br, ours in column_mapping.items()}
        output_data = {}
        for col in expected_columns:
            br_col = reverse_mapping.get(col)
            if br_col and br_col in df.columns:
                # Copy the column from the scraped data
                output_data[col] = df[br_col].to_list()
            else:
                # Column not available in the scraped data; fill with empty strings
                logger.warning(f"Column '{col}' not found in scraped data, filling with empty values")
                output_data[col] = [''] * len(raw_data)
        result_df = pl.DataFrame(output_data)
        logger.info(f"Mapped to expected format: {result_df.height} rows, {result_df.width} columns")
        return result_df
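
    # Every output column is kept as Utf8 so the CSV matches the reference
    # file byte-for-byte. If a downstream step needs real numeric types, a
    # polars cast is the usual approach; a sketch (strict=False turns
    # unparseable strings into nulls, and the chosen columns are just
    # examples):
    #
    #   typed = result_df.with_columns(
    #       pl.col('ERA').cast(pl.Float64, strict=False),
    #       pl.col('SO').cast(pl.Int64, strict=False),
    #   )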

    def save_to_csv(self, df: pl.DataFrame, cardset_name: str) -> str:
        """Save the DataFrame to CSV in the appropriate directory."""
        # Resolve an absolute output path under the project root (the
        # directory that contains data-input)
        current_path = Path.cwd()
        if current_path.name == "pull-pitching-stats":
            # Running from the plans subdirectory; go up to the project root
            project_root = current_path.parent.parent.parent
        else:
            # Already in the project root
            project_root = current_path
        output_dir = project_root / "data-input" / cardset_name
        output_dir.mkdir(parents=True, exist_ok=True)
        output_path = output_dir / "pitching.csv"
        logger.info(f"Saving to: {output_path}")
        df.write_csv(output_path)
        logger.info(f"Successfully saved {df.height} records to {output_path}")
        return str(output_path)

    def scrape_and_save(self, cardset_name: str) -> str:
        """Complete workflow: scrape, process, and save pitching stats."""
        logger.info(f"Starting pitching stats scrape for year {self.year}")
        # Fetch the page
        soup = self.fetch_page()
        # Extract the table data
        raw_data = self.extract_pitching_table(soup)
        # Map to the expected format
        df = self.map_to_expected_format(raw_data)
        # Save to CSV
        output_path = self.save_to_csv(df, cardset_name)
        logger.info("Pitching stats scraping completed successfully")
        return output_path


def main():
    """Main entry point for the script."""
    parser = argparse.ArgumentParser(
        description="Scrape Baseball Reference pitching statistics"
    )
    parser.add_argument(
        '--year',
        type=int,
        required=True,
        help='Year to scrape (e.g., 2025)'
    )
    parser.add_argument(
        '--cardset-name',
        type=str,
        required=True,
        help='Name of the cardset directory (e.g., "2025 Live Cardset")'
    )
    parser.add_argument(
        '--verbose',
        action='store_true',
        help='Enable verbose logging'
    )
    args = parser.parse_args()

    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    try:
        scraper = PitchingStatsScraper(args.year)
        output_path = scraper.scrape_and_save(args.cardset_name)
        print(f"Successfully saved pitching stats to: {output_path}")
    except Exception as e:
        logger.error(f"Script failed: {e}")
        sys.exit(1)


if __name__ == '__main__':
    main()
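
# The scraper can also be driven programmatically; a sketch (the cardset name
# is the one from the usage example above, not a requirement):
#
#   from pull_pitching_stats import PitchingStatsScraper
#
#   scraper = PitchingStatsScraper(2025)
#   csv_path = scraper.scrape_and_save("2025 Live Cardset")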