paper-dynasty-card-creation/tests/test_automated_data_fetcher.py
2025-07-22 09:24:34 -05:00

445 lines
18 KiB
Python

import pytest
import asyncio
import pandas as pd
import tempfile
import shutil
from pathlib import Path
from unittest.mock import Mock, patch, AsyncMock
import sys
# Stand-in for the pybaseball package so that importing the module under test
# never needs the real library. Each entry point the tests patch later is
# given an explicit Mock attribute.
mock_pb = Mock()
mock_pb.cache = Mock()
mock_pb.cache.enable = Mock()
mock_pb.batting_stats_bref = Mock()
mock_pb.pitching_stats_bref = Mock()
mock_pb.batting_stats = Mock()
mock_pb.pitching_stats = Mock()
mock_pb.batting_stats_range = Mock()
mock_pb.pitching_stats_range = Mock()
mock_pb.get_splits = Mock()

# Import the module under test while its dependencies are replaced by mocks.
with patch.dict('sys.modules', {
    'pybaseball': mock_pb,
    'creation_helpers': Mock(),
    'exceptions': Mock(),
}):
    import automated_data_fetcher
    from automated_data_fetcher import (
        DataFetcher,
        LiveSeriesDataFetcher,
        fetch_season_data,
        fetch_live_series_data,
    )

# patch.dict restores sys.modules to its pre-block contents on exit, which
# also evicts the module imported inside the block. Re-register it so the
# string targets used by @patch('automated_data_fetcher....') resolve to the
# very module object the imported classes belong to, instead of triggering a
# second import against the un-mocked dependencies.
sys.modules['automated_data_fetcher'] = automated_data_fetcher
class TestDataFetcher:
    """Test cases for the DataFetcher class.

    Coroutine tests carry ``@pytest.mark.asyncio`` so that the pytest-asyncio
    plugin awaits them; without a marker (or ``asyncio_mode = auto``) pytest
    does not run ``async def`` tests.
    """

    @pytest.fixture
    def fetcher(self):
        """Yield a DataFetcher whose output_dir points inside a temp directory."""
        with tempfile.TemporaryDirectory() as tmp_dir:
            # Override output directory to use temp directory
            fetcher = DataFetcher(2023, "Season")
            fetcher.output_dir = Path(tmp_dir) / "test_output"
            # Yield inside the context so the directory outlives the test body.
            yield fetcher

    @pytest.fixture
    def sample_batting_data(self):
        """Return a three-row batting DataFrame used as fake fetch output."""
        return pd.DataFrame({
            'Name': ['Player A', 'Player B', 'Player C'],
            'Team': ['NYY', 'LAD', 'BOS'],
            'G': [162, 140, 120],
            'PA': [650, 580, 450],
            'H': [180, 160, 120],
            'HR': [30, 25, 15],
            'RBI': [100, 85, 65],
            'SB': [20, 5, 8],
            'CS': [5, 2, 3],
            'SB%': [0.8, 0.714, 0.727],
            'GDP': [15, 12, 8],
            'R': [95, 80, 55],
            'BB': [65, 55, 40],
            'SO': [150, 120, 90],
            'IDfg': ['12345', '67890', '11111'],
        })

    @pytest.fixture
    def sample_pitching_data(self):
        """Return a two-row pitching DataFrame used as fake fetch output."""
        return pd.DataFrame({
            'Name': ['Pitcher A', 'Pitcher B'],
            'Team': ['NYY', 'LAD'],
            'W': [15, 12],
            'L': [8, 10],
            'ERA': [3.25, 4.15],
            'G': [32, 30],
            'GS': [32, 30],
            'IP': [200.1, 180.2],
            'H': [180, 190],
            'HR': [25, 30],
            'BB': [60, 70],
            'SO': [220, 180],
        })

    @pytest.fixture
    def sample_splits_data(self):
        """Return a splits DataFrame with two handedness and two venue rows."""
        return pd.DataFrame({
            'Split': ['vs LHP', 'vs RHP', 'Home', 'Away'],
            'G': [80, 82, 81, 81],
            'PA': [320, 330, 325, 325],
            'H': [85, 95, 90, 90],
            'AVG': [.280, .295, .285, .285],
            'OBP': [.350, .365, .360, .355],
            'SLG': [.450, .480, .465, .465],
        })

    def test_init(self, fetcher):
        """Test DataFetcher initialization"""
        assert fetcher.season == 2023
        assert fetcher.cardset_type == "Season"
        assert fetcher.cache_enabled == True
        # Note: fetcher.output_dir is overridden in the fixture to use temp directory

    def test_ensure_output_dir(self, fetcher):
        """Test output directory creation"""
        assert not fetcher.output_dir.exists()
        fetcher.ensure_output_dir()
        assert fetcher.output_dir.exists()

    def test_get_csv_filename(self, fetcher):
        """Test CSV filename mapping, including the fallback for unknown types"""
        assert fetcher._get_csv_filename('pitching') == 'pitching.csv'
        assert fetcher._get_csv_filename('running') == 'running.csv'
        assert fetcher._get_csv_filename('batting_basic') == 'batter-stats.csv'
        assert fetcher._get_csv_filename('pitching_basic') == 'pitcher-stats.csv'
        assert fetcher._get_csv_filename('unknown_type') == 'unknown_type.csv'

    def test_transform_for_card_creation_batting_splits(self, fetcher, sample_splits_data):
        """Test batting splits transformation"""
        result = fetcher._transform_for_card_creation(sample_splits_data, 'batting_splits')
        # Should filter to only handedness splits
        expected_splits = ['vs LHP', 'vs RHP']
        assert all(split in expected_splits for split in result['Split'].values)
        assert len(result) == 2

    def test_transform_for_card_creation_running(self, fetcher, sample_batting_data):
        """Test running stats transformation"""
        result = fetcher._transform_for_card_creation(sample_batting_data, 'running')
        # Should include only running-related columns
        expected_cols = ['Name', 'SB', 'CS', 'SB%', 'GDP']
        assert all(col in expected_cols for col in result.columns)

    def test_save_data_to_csv(self, fetcher, sample_batting_data):
        """Test saving data to CSV"""
        fetcher.ensure_output_dir()
        data = {'batting_basic': sample_batting_data}
        fetcher.save_data_to_csv(data)
        # Check file was created
        expected_file = fetcher.output_dir / 'batter-stats.csv'
        assert expected_file.exists()
        # Verify content
        saved_data = pd.read_csv(expected_file)
        assert len(saved_data) == len(sample_batting_data)
        assert 'Name' in saved_data.columns

    def test_save_data_to_csv_empty_dataframe(self, fetcher):
        """Test saving empty dataframe"""
        fetcher.ensure_output_dir()
        empty_data = {'empty_set': pd.DataFrame()}
        fetcher.save_data_to_csv(empty_data)
        # Should not create file for empty data
        expected_file = fetcher.output_dir / 'empty_set.csv'
        assert not expected_file.exists()

    @pytest.mark.asyncio
    @patch('automated_data_fetcher.pb.batting_stats_bref')
    @patch('automated_data_fetcher.pb.pitching_stats_bref')
    async def test_fetch_baseball_reference_data(self, mock_pitching, mock_batting, fetcher,
                                                 sample_batting_data, sample_pitching_data):
        """Test fetching Baseball Reference data"""
        # Mock pybaseball functions (bottom-most @patch supplies the first mock)
        mock_batting.return_value = sample_batting_data
        mock_pitching.return_value = sample_pitching_data
        # Mock player ID and splits functions
        with patch.object(fetcher, '_get_active_players', return_value=['12345', '67890']):
            with patch.object(fetcher, '_fetch_player_splits', return_value={
                'batting': pd.DataFrame(), 'pitching': pd.DataFrame()
            }):
                result = await fetcher.fetch_baseball_reference_data()
        # Verify data structure
        assert 'pitching' in result
        assert 'running' in result
        assert 'batting_splits' in result
        assert 'pitching_splits' in result
        # Verify data content
        assert len(result['pitching']) == 2
        assert len(result['running']) == 3

    @pytest.mark.asyncio
    @patch('automated_data_fetcher.pb.batting_stats')
    @patch('automated_data_fetcher.pb.pitching_stats')
    async def test_fetch_fangraphs_data(self, mock_pitching, mock_batting, fetcher,
                                        sample_batting_data, sample_pitching_data):
        """Test fetching FanGraphs data"""
        # Mock pybaseball functions
        mock_batting.return_value = sample_batting_data
        mock_pitching.return_value = sample_pitching_data
        result = await fetcher.fetch_fangraphs_data()
        # Verify data structure
        assert 'batting_basic' in result
        assert 'pitching_basic' in result
        # Verify function calls
        mock_batting.assert_called_once_with(2023, 2023)
        mock_pitching.assert_called_once_with(2023, 2023)

    @pytest.mark.asyncio
    @patch('automated_data_fetcher.pb.batting_stats_range')
    @patch('automated_data_fetcher.pb.pitching_stats_range')
    async def test_fetch_fangraphs_data_with_dates(self, mock_pitching, mock_batting, fetcher,
                                                   sample_batting_data, sample_pitching_data):
        """Test fetching FanGraphs data with date range"""
        # Mock pybaseball functions
        mock_batting.return_value = sample_batting_data
        mock_pitching.return_value = sample_pitching_data
        start_date = "2023-03-01"
        end_date = "2023-09-01"
        # Only the calls are checked here, so the return value is not kept.
        await fetcher.fetch_fangraphs_data(start_date, end_date)
        # Verify function calls with date parameters
        mock_batting.assert_called_once_with(start_date, end_date)
        mock_pitching.assert_called_once_with(start_date, end_date)

    @pytest.mark.asyncio
    @patch('automated_data_fetcher.get_all_pybaseball_ids')
    async def test_get_active_players_existing_function(self, mock_get_ids, fetcher):
        """Test getting player IDs using existing function"""
        mock_get_ids.return_value = ['12345', '67890', '11111']
        result = await fetcher._get_active_players()
        assert result == ['12345', '67890', '11111']
        mock_get_ids.assert_called_once_with(2023)

    @pytest.mark.asyncio
    @patch('automated_data_fetcher.get_all_pybaseball_ids')
    @patch('automated_data_fetcher.pb.batting_stats')
    async def test_get_active_players_fallback(self, mock_batting, mock_get_ids, fetcher,
                                               sample_batting_data):
        """Test getting player IDs with fallback to FanGraphs"""
        # Mock existing function to fail
        mock_get_ids.side_effect = Exception("Function not available")
        mock_batting.return_value = sample_batting_data
        result = await fetcher._get_active_players()
        # Should fallback to FanGraphs data (the IDfg column of the sample frame)
        expected_ids = ['12345', '67890', '11111']
        assert result == expected_ids

    @pytest.mark.asyncio
    @patch('automated_data_fetcher.pb.get_splits')
    async def test_fetch_player_splits(self, mock_get_splits, fetcher, sample_splits_data):
        """Test fetching player splits"""
        # Mock get_splits to return sample data
        mock_get_splits.return_value = sample_splits_data
        player_ids = ['12345', '67890']
        result = await fetcher._fetch_player_splits(player_ids)
        # Verify structure
        assert 'batting' in result
        assert 'pitching' in result
        # Verify splits were called for each player
        assert mock_get_splits.call_count == 4  # 2 players * 2 split types
class TestLiveSeriesDataFetcher:
    """Test cases for the LiveSeriesDataFetcher class"""

    @pytest.fixture
    def live_fetcher(self):
        """Yield a LiveSeriesDataFetcher (2023, 81 games) writing to a temp directory."""
        with tempfile.TemporaryDirectory() as tmp_dir:
            fetcher = LiveSeriesDataFetcher(2023, 81)  # Half season
            fetcher.output_dir = Path(tmp_dir) / "test_output"
            # Yield inside the context so the directory outlives the test body.
            yield fetcher

    def test_init(self, live_fetcher):
        """Test LiveSeriesDataFetcher initialization"""
        assert live_fetcher.season == 2023
        assert live_fetcher.cardset_type == "Live"
        assert live_fetcher.games_played == 81
        assert live_fetcher.start_date == "2023-03-01"

    def test_calculate_end_date(self, live_fetcher):
        """Test end date calculation"""
        # 81 games should be roughly half season (90 days)
        end_date = live_fetcher._calculate_end_date(81)
        # Should be a valid date string
        assert len(end_date) == 10  # YYYY-MM-DD format
        assert end_date.startswith("2023")
        # Should be after start date (ISO dates order correctly as strings)
        assert end_date > "2023-03-01"
        # Test full season
        full_season_end = live_fetcher._calculate_end_date(162)
        assert full_season_end > end_date

    # The asyncio marker makes pytest-asyncio await this coroutine; an
    # unmarked async test is not executed by plain pytest.
    @pytest.mark.asyncio
    @patch.object(DataFetcher, 'fetch_baseball_reference_data')
    @patch.object(DataFetcher, 'fetch_fangraphs_data')
    async def test_fetch_live_data(self, mock_fg_data, mock_bref_data, live_fetcher):
        """Test fetching live series data"""
        # Mock return values
        mock_bref_data.return_value = {'pitching': pd.DataFrame(), 'running': pd.DataFrame()}
        mock_fg_data.return_value = {'batting_basic': pd.DataFrame()}
        result = await live_fetcher.fetch_live_data()
        # Verify both data sources were called
        mock_bref_data.assert_called_once()
        mock_fg_data.assert_called_once_with(live_fetcher.start_date, live_fetcher.end_date)
        # Verify combined result
        assert 'pitching' in result
        assert 'running' in result
        assert 'batting_basic' in result
class TestUtilityFunctions:
    """Test cases for the module-level fetch_* helper coroutines"""

    @pytest.mark.asyncio
    @patch('automated_data_fetcher.DataFetcher')
    async def test_fetch_season_data(self, mock_fetcher_class):
        """Test fetch_season_data function"""
        # Create mock fetcher instance
        mock_fetcher = Mock()
        mock_fetcher.fetch_baseball_reference_data = AsyncMock(return_value={'pitching': pd.DataFrame()})
        mock_fetcher.fetch_fangraphs_data = AsyncMock(return_value={'batting_basic': pd.DataFrame()})
        mock_fetcher.save_data_to_csv = Mock()
        mock_fetcher.output_dir = Path("test/output")
        mock_fetcher_class.return_value = mock_fetcher
        # Capture print output
        with patch('builtins.print') as mock_print:
            await fetch_season_data(2023)
        # Verify fetcher was created and methods called
        mock_fetcher_class.assert_called_once_with(2023, "Season")
        mock_fetcher.fetch_baseball_reference_data.assert_called_once()
        mock_fetcher.fetch_fangraphs_data.assert_called_once()
        mock_fetcher.save_data_to_csv.assert_called_once()
        # Verify print output includes completion message. Calls without
        # positional arguments (a bare print()) are skipped and the first
        # argument is stringified, so neither an empty call nor a non-string
        # argument can break the check itself.
        printed = [str(call.args[0]) for call in mock_print.call_args_list if call.args]
        assert any("AUTOMATED DOWNLOAD COMPLETE" in text for text in printed)

    @pytest.mark.asyncio
    @patch('automated_data_fetcher.LiveSeriesDataFetcher')
    async def test_fetch_live_series_data(self, mock_fetcher_class):
        """Test fetch_live_series_data function"""
        # Create mock fetcher instance
        mock_fetcher = Mock()
        mock_fetcher.fetch_live_data = AsyncMock(return_value={'live_data': pd.DataFrame()})
        mock_fetcher.save_data_to_csv = Mock()
        mock_fetcher_class.return_value = mock_fetcher
        await fetch_live_series_data(2023, 81)
        # Verify fetcher was created and methods called
        mock_fetcher_class.assert_called_once_with(2023, 81)
        mock_fetcher.fetch_live_data.assert_called_once()
        mock_fetcher.save_data_to_csv.assert_called_once()
class TestErrorHandling:
    """Test error handling scenarios"""

    @pytest.fixture
    def fetcher(self):
        """Create a DataFetcher instance for error testing"""
        return DataFetcher(2023, "Season")

    @pytest.mark.asyncio
    @patch('automated_data_fetcher.pb.pitching_stats_bref')
    async def test_fetch_baseball_reference_data_error(self, mock_pitching, fetcher):
        """Test error handling in Baseball Reference data fetch"""
        # Mock function to raise an exception
        mock_pitching.side_effect = Exception("Network error")
        with pytest.raises(Exception, match="Error fetching Baseball Reference data"):
            await fetcher.fetch_baseball_reference_data()

    @pytest.mark.asyncio
    @patch('automated_data_fetcher.pb.batting_stats')
    async def test_fetch_fangraphs_data_error(self, mock_batting, fetcher):
        """Test error handling in FanGraphs data fetch"""
        # Mock function to raise an exception
        mock_batting.side_effect = Exception("API error")
        with pytest.raises(Exception, match="Error fetching FanGraphs data"):
            await fetcher.fetch_fangraphs_data()

    @pytest.mark.asyncio
    @patch('automated_data_fetcher.get_all_pybaseball_ids')
    @patch('automated_data_fetcher.pb.batting_stats')
    async def test_get_active_players_complete_failure(self, mock_batting, mock_get_ids, fetcher):
        """Test complete failure in getting player IDs"""
        # Mock both functions to fail
        mock_get_ids.side_effect = Exception("Function error")
        mock_batting.side_effect = Exception("API error")
        result = await fetcher._get_active_players()
        # Should return empty list when all methods fail
        assert result == []

    @pytest.mark.asyncio
    @patch('automated_data_fetcher.pb.get_splits')
    async def test_fetch_player_splits_individual_errors(self, mock_get_splits, fetcher):
        """Test handling individual player split fetch errors"""
        # Mock get_splits to fail for some players. Extra positional arguments
        # are accepted so that only the simulated "Player not found" error can
        # come out of this stub, never a signature-mismatch TypeError.
        def side_effect(player_id, *args, **kwargs):
            if player_id == 'bad_player':
                raise Exception("Player not found")
            return pd.DataFrame({'Split': ['vs LHP'], 'AVG': [.250]})

        mock_get_splits.side_effect = side_effect
        player_ids = ['good_player', 'bad_player', 'another_good_player']
        result = await fetcher._fetch_player_splits(player_ids)
        # Should handle errors gracefully and return data for successful players
        assert 'batting' in result
        assert 'pitching' in result
        # Should have been called for all players despite errors
        assert mock_get_splits.call_count == 6  # 3 players * 2 split types
# Integration test markers
@pytest.mark.integration
class TestIntegration:
    """Integration tests that require network access"""

    @pytest.mark.skip(reason="Requires network access and may be slow")
    @pytest.mark.asyncio
    async def test_real_data_fetch(self):
        """Test fetching real data from pybaseball (skip by default)"""
        fetcher = DataFetcher(2022, "Season")  # Use a complete season
        # This would actually call pybaseball APIs
        # Only run when specifically testing integration
        try:
            fg_data = await fetcher.fetch_fangraphs_data()
        except Exception as e:
            pytest.skip(f"Network error during integration test: {e}")
        # Assertions live outside the try block: AssertionError is an
        # Exception, so asserting inside it would turn a genuine failure
        # into a skip.
        assert 'batting_basic' in fg_data
        assert 'pitching_basic' in fg_data
if __name__ == '__main__':
    # Run tests and hand pytest's exit code to the shell, so running this
    # file directly reports failure with a non-zero status.
    raise SystemExit(pytest.main([__file__, '-v']))