paper-dynasty-card-creation/test_data_fetcher_demo.py

#!/usr/bin/env python3
"""
Demo script to test the automated data fetcher without requiring pybaseball installation
"""
import asyncio
import pandas as pd
from pathlib import Path
from unittest.mock import Mock, patch
import tempfile
# Mock pybaseball for demo
mock_pb = Mock()
mock_pb.cache = Mock()
mock_pb.cache.enable = Mock()
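# Because mock_pb is a Mock, `cache` and `cache.enable` would be auto-created
# on first access anyway; setting them explicitly just documents the
# assumption that the fetcher calls pybaseball.cache.enable() at import time.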
# Create sample data that the fetcher would normally get from pybaseball
sample_pitching_data = pd.DataFrame({
    'Name': ['Jacob deGrom', 'Gerrit Cole', 'Shane Bieber'],
    'Team': ['NYM', 'NYY', 'CLE'],
    'W': [15, 16, 13],
    'L': [4, 8, 7],
    'ERA': [2.38, 3.23, 3.28],
    'IP': [201.1, 200.1, 214.1],
    'SO': [255, 243, 259]
})

sample_batting_data = pd.DataFrame({
    'Name': ['Ronald Acuña Jr.', 'Mookie Betts', 'Juan Soto'],
    'Team': ['ATL', 'LAD', 'WSN'],
    'G': [119, 142, 151],
    'PA': [556, 614, 654],
    'H': [148, 160, 145],
    'HR': [41, 35, 29],
    'SB': [73, 12, 9],
    'CS': [11, 3, 4],
    'SB%': [0.869, 0.8, 0.692],
    'GDP': [5, 15, 20],
    'R': [149, 122, 111],
    'BB': [78, 65, 145],
    'SO': [144, 111, 93],
    'IDfg': ['2203', '13611', '19251']
})

sample_splits_data = pd.DataFrame({
    'Split': ['vs LHP', 'vs RHP', 'Home', 'Away'],
    'G': [40, 79, 60, 59],
    'PA': [150, 406, 278, 278],
    'H': [42, 106, 74, 74],
    'AVG': [.295, .275, .280, .280],
    'OBP': [.380, .350, .365, .365],
    'SLG': [.520, .480, .500, .500]
})
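# Note: these frames only sketch the shape of real pybaseball output; the
# actual batting_stats()/pitching_stats() results carry far more columns, and
# the subset above is just what this demo assumes the fetcher reads.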

async def demo_data_fetcher():
    """Demonstrate the data fetcher functionality"""
    print("🚀 Automated Data Fetcher Demo")
    print("=" * 50)

    # Mock pybaseball functions to return our sample data
    mock_pb.pitching_stats_bref = Mock(return_value=sample_pitching_data)
    mock_pb.batting_stats_bref = Mock(return_value=sample_batting_data)
    mock_pb.batting_stats = Mock(return_value=sample_batting_data)
    mock_pb.pitching_stats = Mock(return_value=sample_pitching_data)
    mock_pb.get_splits = Mock(return_value=sample_splits_data)

    # Patch the imports
    with patch.dict('sys.modules', {
        'pybaseball': mock_pb,
        'creation_helpers': Mock(),
        'exceptions': Mock()
    }):
        # Import after patching
        from automated_data_fetcher import DataFetcher, LiveSeriesDataFetcher

        # Mock the logger
        with patch('automated_data_fetcher.logger') as mock_logger:
            mock_logger.info = print  # Redirect log output to print
            mock_logger.warning = print
            mock_logger.error = print

            # Create temporary directory for output
            with tempfile.TemporaryDirectory() as tmp_dir:
                print(f"📁 Using temporary directory: {tmp_dir}")

                # Test 1: Season Data Fetcher
                print("\n🔄 Testing Season Data Fetcher...")
                fetcher = DataFetcher(2023, "Season")
                fetcher.output_dir = Path(tmp_dir) / "season_test"

                # Mock the helper functions
                with patch.object(fetcher, '_get_active_players', return_value=['2203', '13611']):
                    with patch.object(fetcher, '_fetch_player_splits', return_value={
                        'batting': sample_splits_data.copy(),
                        'pitching': pd.DataFrame()
                    }):
                        # Fetch data
                        bref_data = await fetcher.fetch_baseball_reference_data()
                        fg_data = await fetcher.fetch_fangraphs_data()

                        # Combine and save
                        all_data = {**bref_data, **fg_data}
                        fetcher.save_data_to_csv(all_data)

                        print(f"✅ Season data saved to: {fetcher.output_dir}")
                        print(f"📊 Generated {len(all_data)} data files:")
                        for name, df in all_data.items():
                            if hasattr(df, '__len__'):
                                print(f" - {name}: {len(df)} records")
                            else:
                                print(f" - {name}: (mock data)")

                # Test 2: Live Series Data Fetcher
                print("\n🔄 Testing Live Series Data Fetcher...")
                live_fetcher = LiveSeriesDataFetcher(2023, 81)
                live_fetcher.output_dir = Path(tmp_dir) / "live_test"

                with patch.object(live_fetcher, '_get_active_players', return_value=['2203']):
                    with patch.object(live_fetcher, '_fetch_player_splits', return_value={
                        'batting': sample_splits_data.head(2),  # Smaller dataset for live
                        'pitching': pd.DataFrame()
                    }):
                        live_data = await live_fetcher.fetch_live_data()
                        live_fetcher.save_data_to_csv(live_data)

                        print(f"✅ Live data saved to: {live_fetcher.output_dir}")
                        print(f"📊 Generated {len(live_data)} data files:")
                        for name, df in live_data.items():
                            if hasattr(df, '__len__'):
                                print(f" - {name}: {len(df)} records")
                            else:
                                print(f" - {name}: (mock data)")
                        print(f"📅 Date range: {live_fetcher.start_date} to {live_fetcher.end_date}")

                # Test 3: File outputs
                print("\n📄 Generated CSV Files:")
                for output_dir in [fetcher.output_dir, live_fetcher.output_dir]:
                    if output_dir.exists():
                        csv_files = list(output_dir.glob("*.csv"))
                        print(f"\n📁 {output_dir.name}:")
                        for csv_file in csv_files:
                            size = csv_file.stat().st_size
                            print(f" - {csv_file.name} ({size} bytes)")

                # Test 4: Show what still needs manual download
                print("\n⚠️ MANUAL DOWNLOAD STILL REQUIRED:")
                manual_files = [
                    "vlhp-basic.csv (FanGraphs vs LHP batting)",
                    "vlhp-rate.csv (FanGraphs vs LHP rate stats)",
                    "vrhp-basic.csv (FanGraphs vs RHP batting)",
                    "vrhp-rate.csv (FanGraphs vs RHP rate stats)",
                    "vlhh-basic.csv (FanGraphs vs LHH pitching)",
                    "vlhh-rate.csv (FanGraphs vs LHH rate stats)",
                    "vrhh-basic.csv (FanGraphs vs RHH pitching)",
                    "vrhh-rate.csv (FanGraphs vs RHH rate stats)"
                ]
                for file in manual_files:
                    print(f" - {file}")

    print("\n✨ Demo Complete!")
    print("=" * 50)
    print("The automated data fetcher successfully:")
    print("✅ Created output directories")
    print("✅ Fetched Baseball Reference data (mocked)")
    print("✅ Fetched FanGraphs basic data (mocked)")
    print("✅ Saved data to properly named CSV files")
    print("✅ Handled date ranges for live series")
    print("⚠️ FanGraphs split data still requires manual download")


if __name__ == '__main__':
    asyncio.run(demo_data_fetcher())
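
# For comparison, an unmocked run would look roughly like the sketch below
# (kept as a comment so the demo stays import-safe). It assumes pybaseball is
# actually installed and that DataFetcher exposes the same methods exercised
# above:
#
#   from automated_data_fetcher import DataFetcher
#
#   async def fetch_real_season():
#       fetcher = DataFetcher(2023, "Season")
#       data = {**await fetcher.fetch_baseball_reference_data(),
#               **await fetcher.fetch_fangraphs_data()}
#       fetcher.save_data_to_csv(data)
#
#   asyncio.run(fetch_real_season())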