# Plan 007: Session Expiration

**Priority**: HIGH
**Effort**: 1-2 hours
**Status**: NOT STARTED
**Risk Level**: MEDIUM - Zombie connections

---

## Problem Statement

WebSocket sessions persist indefinitely after network failures. There's no:
- Ping timeout configuration for Socket.io
- Session expiration tracking
- Cleanup of zombie connections

Zombie connections accumulate, causing:
- Memory leaks
- Stale user presence in games
- Inaccurate connection counts

## Impact

- **Memory**: Unbounded connection tracking growth
- **UX**: Stale players shown as "connected"
- **Performance**: Broadcasting to dead connections

## Files to Modify

| File | Action |
|------|--------|
| `backend/app/main.py` | Configure Socket.io timeouts |
| `backend/app/websocket/connection_manager.py` | Add session expiration |
| `backend/app/websocket/handlers.py` | Handle heartbeat events |
| `backend/app/api/routes.py` | Add `/health/connections` endpoint |
| `backend/tests/unit/websocket/test_session_expiration.py` | Create session expiration tests |

## Implementation Steps

### Step 1: Configure Socket.io Timeouts (15 min)

Update `backend/app/main.py`:

```python
import socketio

sio = socketio.AsyncServer(
    async_mode="asgi",
    cors_allowed_origins="*",
    # Timeout configuration
    ping_timeout=30,  # Wait 30s for pong before disconnect
    ping_interval=25,  # Send ping every 25s
    max_http_buffer_size=1_000_000,  # 1MB max message size
    logger=True,
    engineio_logger=True
)
```

**Explanation**:
- `ping_interval=25`: Server sends ping every 25 seconds
- `ping_timeout=30`: Client must respond within 30 seconds
- Total: An unresponsive connection is closed after at most 55 seconds (25s ping interval + 30s pong timeout)

### Step 2: Add Session Tracking (30 min)

Update `backend/app/websocket/connection_manager.py`:

```python
import pendulum
from dataclasses import dataclass
from uuid import UUID
import asyncio

@dataclass
class SessionInfo:
    """Tracks WebSocket session metadata."""
    user_id: int | None
    connected_at: pendulum.DateTime
    last_activity: pendulum.DateTime
    games: set[UUID]
    ip_address: str | None = None

class ConnectionManager:
    def __init__(self):
        self._sessions: dict[str, SessionInfo] = {}  # sid -> SessionInfo
        self._user_sessions: dict[int, set[str]] = {}  # user_id -> sids
        self._game_sessions: dict[UUID, set[str]] = {}  # game_id -> sids
        self._expiration_task: asyncio.Task | None = None

    async def connect(self, sid: str, user_id: int | None = None, ip_address: str | None = None):
        """Register new connection."""
        now = pendulum.now("UTC")
        self._sessions[sid] = SessionInfo(
            user_id=user_id,
            connected_at=now,
            last_activity=now,
            games=set(),
            ip_address=ip_address
        )

        if user_id:
            if user_id not in self._user_sessions:
                self._user_sessions[user_id] = set()
            self._user_sessions[user_id].add(sid)

        logger.info(f"Session connected: {sid} (user={user_id})")

    async def disconnect(self, sid: str):
        """Clean up disconnected session."""
        session = self._sessions.pop(sid, None)
        if session:
            # Remove from user tracking
            if session.user_id and session.user_id in self._user_sessions:
                self._user_sessions[session.user_id].discard(sid)
                if not self._user_sessions[session.user_id]:
                    del self._user_sessions[session.user_id]

            # Remove from game rooms
            for game_id in session.games:
                if game_id in self._game_sessions:
                    self._game_sessions[game_id].discard(sid)

            logger.info(f"Session disconnected: {sid} (was connected {session.connected_at})")

    async def update_activity(self, sid: str):
        """Update last activity timestamp for session."""
        if sid in self._sessions:
            self._sessions[sid].last_activity = pendulum.now("UTC")

    async def get_session(self, sid: str) -> SessionInfo | None:
        """Get session info."""
        return self._sessions.get(sid)

    async def get_user_id(self, sid: str) -> int | None:
        """Get user ID for session."""
        session = self._sessions.get(sid)
        return session.user_id if session else None

    async def join_game(self, sid: str, game_id: UUID):
        """Add session to game room."""
        if sid in self._sessions:
            self._sessions[sid].games.add(game_id)

        if game_id not in self._game_sessions:
            self._game_sessions[game_id] = set()
        self._game_sessions[game_id].add(sid)

        await self.update_activity(sid)

    async def leave_game(self, sid: str,
                         game_id: UUID):
        """Remove session from game room."""
        if sid in self._sessions:
            self._sessions[sid].games.discard(game_id)

        if game_id in self._game_sessions:
            self._game_sessions[game_id].discard(sid)

    async def expire_inactive_sessions(self, timeout_seconds: int = 300):
        """
        Expire sessions with no activity for timeout period.

        Called periodically by background task.
        """
        now = pendulum.now("UTC")
        expired = []

        for sid, session in list(self._sessions.items()):
            inactive_seconds = (now - session.last_activity).total_seconds()
            if inactive_seconds > timeout_seconds:
                expired.append(sid)
                logger.warning(f"Expiring inactive session: {sid} (inactive {inactive_seconds}s)")

        for sid in expired:
            await self.disconnect(sid)
            # Notify Socket.io to close the connection
            try:
                await sio.disconnect(sid)
            except Exception as e:
                logger.debug(f"Error disconnecting expired session {sid}: {e}")

        if expired:
            logger.info(f"Expired {len(expired)} inactive sessions")

        return expired

    def get_stats(self) -> dict:
        """Return connection statistics."""
        return {
            "total_sessions": len(self._sessions),
            "unique_users": len(self._user_sessions),
            "active_games": len(self._game_sessions),
            "sessions_per_game": {
                str(gid): len(sids)
                for gid, sids in self._game_sessions.items()
            }
        }


# Global instance
manager = ConnectionManager()
```

### Step 3: Start Expiration Background Task (15 min)

Update `backend/app/main.py`:

```python
from app.websocket.connection_manager import manager

async def session_expiration_task():
    """Background task to expire inactive sessions."""
    while True:
        try:
            await asyncio.sleep(60)  # Check every minute
            await manager.expire_inactive_sessions(timeout_seconds=300)  # 5 min timeout
        except asyncio.CancelledError:
            break
        except Exception as e:
            logger.error(f"Session expiration error: {e}")

@asynccontextmanager
async def lifespan(app: FastAPI):
    # Start session expiration task
    expiration_task = asyncio.create_task(session_expiration_task())

    yield

    # Stop task
    expiration_task.cancel()
    try:
        await expiration_task
    except asyncio.CancelledError:
        pass
```

### Step 4: Update Handlers to Track Activity (20 min)

Update `backend/app/websocket/handlers.py`:

```python
from app.websocket.connection_manager import manager

@sio.event
async def connect(sid, environ, auth):
    """Handle new connection."""
    # Extract user info from auth
    user_id = None
    if auth and "token" in auth:
        user_id = await extract_user_id_from_token(auth["token"])

    # Extract IP address
    ip_address = environ.get("REMOTE_ADDR")

    await manager.connect(sid, user_id=user_id, ip_address=ip_address)
    logger.info(f"Client connected: {sid}")

@sio.event
async def disconnect(sid):
    """Handle disconnection."""
    await manager.disconnect(sid)
    logger.info(f"Client disconnected: {sid}")

# Update activity on any action
@sio.event
async def submit_defensive_decision(sid, data):
    await manager.update_activity(sid)
    # ... existing logic ...

@sio.event
async def submit_offensive_decision(sid, data):
    await manager.update_activity(sid)
    # ... existing logic ...

@sio.event
async def roll_dice(sid, data):
    await manager.update_activity(sid)
    # ... existing logic ...

# Add explicit heartbeat handler (optional, for client-initiated keepalive)
@sio.event
async def heartbeat(sid, data):
    """Client-initiated heartbeat to keep session alive."""
    await manager.update_activity(sid)
    await sio.emit("heartbeat_ack", {"timestamp": pendulum.now("UTC").isoformat()}, to=sid)
```

### Step 5: Add Health Endpoint (10 min)

Update `backend/app/api/routes.py`:

```python
from app.websocket.connection_manager import manager

@router.get("/health/connections")
async def connection_health():
    """Return WebSocket connection statistics."""
    stats = manager.get_stats()
    return {
        "status": "healthy",
        **stats
    }
```

### Step 6: Write Tests (30 min)

Create `backend/tests/unit/websocket/test_session_expiration.py`:

```python
import pytest
import pendulum
from uuid import uuid4
from app.websocket.connection_manager import ConnectionManager, SessionInfo

class TestSessionExpiration:
    """Tests for session expiration."""

    @pytest.fixture
    def manager(self):
        return ConnectionManager()

    @pytest.mark.asyncio
    async def test_connect_creates_session(self, manager):
        """Connect creates session with correct info."""
        await manager.connect("sid1", user_id=123)

        session = await manager.get_session("sid1")
        assert session is not None
        assert session.user_id == 123

    @pytest.mark.asyncio
    async def test_disconnect_removes_session(self, manager):
        """Disconnect removes session."""
        await manager.connect("sid1", user_id=123)
        await manager.disconnect("sid1")

        session = await manager.get_session("sid1")
        assert session is None

    @pytest.mark.asyncio
    async def test_activity_updates_timestamp(self, manager):
        """Activity updates last_activity timestamp."""
        await manager.connect("sid1")
        original = manager._sessions["sid1"].last_activity

        await asyncio.sleep(0.01)
        await manager.update_activity("sid1")

        updated = manager._sessions["sid1"].last_activity
        assert updated > original

    @pytest.mark.asyncio
    async def test_expire_removes_inactive_sessions(self, manager):
        """Inactive sessions are expired."""
        await manager.connect("sid1")

        # Make session old
        manager._sessions["sid1"].last_activity = pendulum.now("UTC").subtract(minutes=10)

        expired = await manager.expire_inactive_sessions(timeout_seconds=300)

        assert "sid1" in expired
        assert "sid1" not in manager._sessions

    @pytest.mark.asyncio
    async def test_active_sessions_not_expired(self, manager):
        """Active sessions are not expired."""
        await manager.connect("sid1")
        await manager.update_activity("sid1")

        expired = await manager.expire_inactive_sessions(timeout_seconds=300)

        assert "sid1" not in expired
        assert "sid1" in manager._sessions

    @pytest.mark.asyncio
    async def test_join_game_tracked(self, manager):
        """Joining game updates session and game tracking."""
        await manager.connect("sid1")
        game_id = uuid4()

        await manager.join_game("sid1", game_id)

        assert game_id in manager._sessions["sid1"].games
        assert "sid1" in manager._game_sessions[game_id]
```

## Verification Checklist

- [ ] Socket.io ping/pong configured
- [ ] Sessions track last activity
- [ ] Inactive sessions are expired (5 min default)
- [ ] Background task runs without errors
- [ ] Health endpoint shows connection stats
- [ ] Tests pass

## Configuration Options

| Setting | Default | Description |
|---------|---------|-------------|
| `ping_interval` | 25s | How often to send ping |
| `ping_timeout` | 30s | Max wait for pong |
| Expiration timeout | 300s | Inactivity before expiration |
| Check interval | 60s | How often to check for expired |

## Rollback Plan

If issues arise:
1. Increase expiration timeout
2. Disable expiration task
3. Revert Socket.io timeout config

## Dependencies

- None (can be implemented independently)

## Notes

- Consider sending "about to expire" warning to clients
- May want different timeouts for different game states
- Future: Add reconnection handling with session recovery
- The Step 6 test module calls `asyncio.sleep` but does not import `asyncio`; add `import asyncio` to its imports.
- `expire_inactive_sessions` (Step 2) uses `sio` and `logger`, which the Step 2 snippet does not import. `sio` is created in `main.py` (Step 1) and `main.py` imports `manager` (Step 3), so importing `sio` back from `main.py` would create an import cycle; pass `sio` into the manager or define it in a shared module instead.
- `disconnect` and `leave_game` never remove empty sets from `_game_sessions`, so `active_games` in `get_stats` can count games with no connected sessions; delete a game's entry once its set becomes empty.