# Plan 003: Idle Game Eviction

**Priority**: CRITICAL
**Effort**: 1-2 hours
**Status**: NOT STARTED
**Risk Level**: HIGH - Memory leak / OOM

---

## Problem Statement

The `StateManager` tracks `_last_access[game_id]` for each game but **never uses it for eviction**. Games remain in memory indefinitely, causing unbounded memory growth.

```python
# backend/app/core/state_manager.py
self._last_access[game_id] = pendulum.now("UTC")  # Tracked but never used!
```

## Impact

- **Memory**: ~50KB per game × 1000 games = 50MB+ after a month
- **Stability**: OOM crash after days/weeks of operation
- **Performance**: Degraded performance as dictionaries grow

## Files to Modify

| File | Changes |
|------|---------|
| `backend/app/core/state_manager.py` | Add eviction logic |
| `backend/app/main.py` | Start background eviction task |
| `backend/app/config.py` | Add eviction configuration |

## Implementation Steps

### Step 1: Add Configuration (10 min)

Update `backend/app/config.py`:

```python
class Settings(BaseSettings):
    # ... existing settings ...

    # Game eviction settings
    game_idle_timeout_hours: int = 24  # Evict games idle > 24 hours
    game_eviction_interval_minutes: int = 60  # Check every hour
    game_max_in_memory: int = 500  # Hard limit on in-memory games
```

### Step 2: Implement Eviction Logic (30 min)

Update `backend/app/core/state_manager.py`:

```python
import pendulum
from app.config import settings


class StateManager:
    # ... existing code ...

    async def evict_idle_games(self) -> list[UUID]:
        """
        Remove games that have been idle beyond the timeout threshold.

        Returns list of evicted game IDs.
        """
        now = pendulum.now("UTC")
        timeout_seconds = settings.game_idle_timeout_hours * 3600

        # Find idle games, remembering how long each one has been idle so the
        # per-game log line below reports that game's own idle time.
        idle_games: dict[UUID, float] = {}
        for game_id, last_access in list(self._last_access.items()):
            idle_seconds = (now - last_access).total_seconds()
            if idle_seconds > timeout_seconds:
                idle_games[game_id] = idle_seconds

        # Evict them
        evicted = []
        for game_id, idle_seconds in idle_games.items():
            await self._evict_game(game_id)
            evicted.append(game_id)
            logger.info(f"Evicted idle game {game_id} (idle {idle_seconds/3600:.1f} hours)")

        if evicted:
            logger.info(f"Evicted {len(evicted)} idle games. Active: {len(self._games)}")

        return evicted

    async def _evict_game(self, game_id: UUID) -> None:
        """
        Remove a single game from memory.

        Persists final state to database before removal.
        """
        # Persist final state
        if game_id in self._games:
            game_state = self._games[game_id]
            try:
                await db_ops.save_game_state(game_id, game_state)
                logger.debug(f"Persisted game {game_id} before eviction")
            except Exception as e:
                # NOTE: a failed save is only logged; the game is still
                # removed below, so its unsaved state is lost.
                logger.error(f"Failed to persist game {game_id}: {e}")

        # Remove from all tracking dictionaries
        self._games.pop(game_id, None)
        self._lineups.pop(game_id, None)
        self._last_access.pop(game_id, None)
        self._game_locks.pop(game_id, None)

    async def enforce_memory_limit(self) -> list[UUID]:
        """
        Enforce hard limit on in-memory games.

        Evicts oldest games if limit exceeded.
        """
        if len(self._games) <= settings.game_max_in_memory:
            return []

        # Sort by last access time (oldest first)
        sorted_games = sorted(
            self._last_access.items(),
            key=lambda x: x[1]
        )

        # Evict oldest until under limit
        to_evict = len(self._games) - settings.game_max_in_memory
        evicted = []

        for game_id, _ in sorted_games[:to_evict]:
            await self._evict_game(game_id)
            evicted.append(game_id)
            logger.warning(f"Force-evicted game {game_id} (memory limit)")

        return evicted

    def get_memory_stats(self) -> dict:
        """Return memory usage statistics."""
        return {
            "active_games": len(self._games),
            "max_games": settings.game_max_in_memory,
            "oldest_game_hours": self._get_oldest_game_age_hours(),
            "total_lineups_cached": sum(len(lineups) for lineups in self._lineups.values())
        }

    def _get_oldest_game_age_hours(self) -> float:
        """Return hours since the least recently accessed game was touched (0.0 if none)."""
        if not self._last_access:
            return 0.0
        oldest = min(self._last_access.values())
        return (pendulum.now("UTC") - oldest).total_seconds() / 3600
```

### Step 3: Create Background Task (30 min)

Update `backend/app/main.py`:

```python
import asyncio
from contextlib import asynccontextmanager

from app.core.state_manager import state_manager
from app.config import settings

# Background task handle
eviction_task: asyncio.Task | None = None


async def periodic_eviction():
    """Background task to periodically evict idle games."""
    interval = settings.game_eviction_interval_minutes * 60

    while True:
        try:
            await asyncio.sleep(interval)

            # Run eviction (each call logs what it evicted)
            await state_manager.evict_idle_games()

            # Enforce memory limit
            await state_manager.enforce_memory_limit()

            # Log stats
            stats = state_manager.get_memory_stats()
            logger.info(f"Memory stats: {stats}")

        except asyncio.CancelledError:
            logger.info("Eviction task cancelled")
            break
        except Exception as e:
            logger.error(f"Eviction task error: {e}")
            # Continue running despite errors


@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan handler."""
    global eviction_task

    # Startup
    logger.info("Starting background eviction task")
    eviction_task = asyncio.create_task(periodic_eviction())

    yield

    # Shutdown
    logger.info("Stopping background eviction task")
    if eviction_task:
        eviction_task.cancel()
        try:
            await eviction_task
        except asyncio.CancelledError:
            pass


app = FastAPI(lifespan=lifespan)
```

### Step 4: Add Health Endpoint (15 min)

Add to `backend/app/api/routes.py`:

```python
@router.get("/health/memory")
async def memory_health():
    """Return memory usage statistics."""
    stats = state_manager.get_memory_stats()

    # Determine health status
    usage_pct = stats["active_games"] / stats["max_games"] * 100

    if usage_pct > 90:
        status = "critical"
    elif usage_pct > 75:
        status = "warning"
    else:
        status = "healthy"

    return {
        "status": status,
        "usage_percent": round(usage_pct, 1),
        **stats
    }
```

### Step 5: Write Tests (30 min)

Create `backend/tests/unit/core/test_game_eviction.py`:

```python
import pytest
import pendulum
from uuid import uuid4
from unittest.mock import patch, AsyncMock

from app.config import settings


class MockGameState:
    """Minimal stand-in for a game state; these tests only store it and pass it through."""


class TestGameEviction:
    """Tests for idle game eviction."""

    @pytest.fixture
    def state_manager(self):
        from app.core.state_manager import StateManager
        return StateManager()

    @pytest.mark.asyncio
    async def test_evict_idle_games_removes_old_games(self, state_manager):
        """Games idle beyond threshold are evicted."""
        game_id = uuid4()

        # Create game with old timestamp
        state_manager._games[game_id] = MockGameState()
        state_manager._last_access[game_id] = pendulum.now("UTC").subtract(hours=25)

        with patch.object(state_manager, '_evict_game', new_callable=AsyncMock) as mock:
            evicted = await state_manager.evict_idle_games()

        assert game_id in evicted
        mock.assert_called_once_with(game_id)

    @pytest.mark.asyncio
    async def test_evict_idle_games_keeps_active_games(self, state_manager):
        """Recently accessed games are not evicted."""
        game_id = uuid4()

        state_manager._games[game_id] = MockGameState()
        state_manager._last_access[game_id] = pendulum.now("UTC").subtract(hours=1)

        evicted = await state_manager.evict_idle_games()

        assert game_id not in evicted
        assert game_id in state_manager._games

    @pytest.mark.asyncio
    async def test_enforce_memory_limit_evicts_oldest(self, state_manager):
        """Oldest games evicted when memory limit exceeded."""
        # Create games at different times
        for i in range(10):
            game_id = uuid4()
            state_manager._games[game_id] = MockGameState()
            state_manager._last_access[game_id] = pendulum.now("UTC").subtract(hours=i)

        # Patch persistence so this unit test never touches the real database.
        with patch.object(settings, 'game_max_in_memory', 5), \
                patch('app.database.operations.db_ops.save_game_state', new_callable=AsyncMock):
            evicted = await state_manager.enforce_memory_limit()

        assert len(evicted) == 5
        assert len(state_manager._games) == 5

    @pytest.mark.asyncio
    async def test_evict_game_persists_state(self, state_manager):
        """Game state is persisted before eviction."""
        game_id = uuid4()
        game_state = MockGameState()
        state_manager._games[game_id] = game_state

        with patch('app.database.operations.db_ops.save_game_state', new_callable=AsyncMock) as mock:
            await state_manager._evict_game(game_id)

        mock.assert_called_once_with(game_id, game_state)
```

## Verification Checklist

- [ ] Idle games are evicted after 24 hours
- [ ] Memory limit is enforced
- [ ] Game state is persisted before eviction
- [ ] Background task runs without errors
- [ ] Health endpoint shows accurate stats
- [ ] Tests pass

## Monitoring

After deployment, monitor:

- `/health/memory` endpoint
- Log messages for eviction events
- Memory usage of the process

## Rollback Plan

If issues arise:

1. Increase `game_idle_timeout_hours` to reduce evictions
2. Increase `game_max_in_memory` limit
3. Disable eviction task (comment out in lifespan)

## Dependencies

- None (can be implemented independently)

## Notes

- Consider adding WebSocket notification before eviction
- May want to add "extend session" API for active users
- Future: Add Redis-backed state for horizontal scaling