strat-gameplay-webapp/backend/app/monitoring/pool_monitor.py
"""
Database connection pool monitoring.
Monitors SQLAlchemy async connection pool health and provides
statistics for observability and alerting.
Key features:
- Real-time pool statistics (checked in/out, overflow)
- Health status classification (healthy/warning/critical)
- Historical stats tracking
- Background monitoring with configurable interval
- Warning logs when pool usage exceeds threshold
Author: Claude
Date: 2025-11-27
"""
import asyncio
import logging
from dataclasses import dataclass, field
from typing import Optional
import pendulum
from sqlalchemy.ext.asyncio import AsyncEngine
from app.config import get_settings
logger = logging.getLogger(f"{__name__}.PoolMonitor")


@dataclass
class PoolStats:
    """
    Connection pool statistics snapshot.

    Captures the current state of the database connection pool
    for monitoring and alerting purposes.
    """

    pool_size: int
    max_overflow: int
    checkedin: int  # Available connections
    checkedout: int  # In-use connections
    overflow: int  # Overflow connections in use
    total_capacity: int
    usage_percent: float  # Fraction of total capacity in use (0.0-1.0)
    timestamp: pendulum.DateTime = field(default_factory=lambda: pendulum.now("UTC"))


class PoolMonitor:
    """
    Monitor database connection pool health.

    Provides real-time statistics and health status for the SQLAlchemy
    connection pool. Useful for detecting pool exhaustion before it
    causes request failures.

    Usage:
        monitor = PoolMonitor(engine)
        stats = monitor.get_stats()
        health = monitor.get_health_status()
    """

    def __init__(
        self,
        engine: AsyncEngine,
        alert_threshold: float = 0.8,
        max_history: int = 100,
    ):
        """
        Initialize pool monitor.

        Args:
            engine: SQLAlchemy async engine to monitor
            alert_threshold: Pool usage fraction that triggers a warning log (0.8 = 80%)
            max_history: Maximum stats snapshots to keep in history
        """
        self._engine = engine
        self._stats_history: list[PoolStats] = []
        self._max_history = max_history
        self._alert_threshold = alert_threshold
        self._settings = get_settings()

    def get_stats(self) -> PoolStats:
        """
        Get current pool statistics.

        Returns:
            PoolStats with current pool state
        """
        pool = self._engine.pool
        checkedin = pool.checkedin()
        checkedout = pool.checkedout()
        overflow = pool.overflow()
        total_capacity = self._settings.db_pool_size + self._settings.db_max_overflow
        usage_percent = checkedout / total_capacity if total_capacity > 0 else 0

        stats = PoolStats(
            pool_size=self._settings.db_pool_size,
            max_overflow=self._settings.db_max_overflow,
            checkedin=checkedin,
            checkedout=checkedout,
            overflow=overflow,
            total_capacity=total_capacity,
            usage_percent=usage_percent,
        )

        # Record history
        self._stats_history.append(stats)
        if len(self._stats_history) > self._max_history:
            self._stats_history.pop(0)

        # Check for alerts
        if usage_percent >= self._alert_threshold:
            logger.warning(
                f"Connection pool usage high: {usage_percent:.1%} "
                f"({checkedout}/{total_capacity})"
            )
        if overflow > 0:
            logger.info(f"Pool overflow active: {overflow} overflow connections")

        return stats
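
    # Worked example (hypothetical numbers): with db_pool_size=5 and
    # db_max_overflow=10, total_capacity is 15. If 12 connections are checked
    # out, usage_percent is 12 / 15 = 0.8, which meets the default
    # alert_threshold of 0.8 and emits the warning log above.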

    def get_health_status(self) -> dict:
        """
        Get pool health status for monitoring endpoint.

        Returns:
            Dict with status, statistics, and timestamp
        """
        stats = self.get_stats()

        if stats.usage_percent >= 0.9:
            status = "critical"
        elif stats.usage_percent >= 0.75:
            status = "warning"
        else:
            status = "healthy"

        return {
            "status": status,
            "pool_size": stats.pool_size,
            "max_overflow": stats.max_overflow,
            "available": stats.checkedin,
            "in_use": stats.checkedout,
            "overflow_active": stats.overflow,
            "total_capacity": stats.total_capacity,
            "usage_percent": round(stats.usage_percent * 100, 1),
            "timestamp": stats.timestamp.isoformat(),
        }
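
    # Sketch: one way the health payload could be exposed over HTTP, assuming
    # init_pool_monitor() has already run. The route path and FastAPI app
    # object are illustrative assumptions, not part of this module:
    #
    #     @app.get("/health/db")
    #     async def db_pool_health() -> dict:
    #         return pool_monitor.get_health_status()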

    def get_history(self, limit: int = 10) -> list[dict]:
        """
        Get recent stats history.

        Args:
            limit: Maximum number of history entries to return

        Returns:
            List of stats snapshots
        """
        return [
            {
                "checkedout": s.checkedout,
                "usage_percent": round(s.usage_percent * 100, 1),
                "timestamp": s.timestamp.isoformat(),
            }
            for s in self._stats_history[-limit:]
        ]

    async def start_monitoring(self, interval_seconds: int = 60):
        """
        Background task to periodically collect stats.

        Useful for continuous logging and alerting. Runs until cancelled.

        Args:
            interval_seconds: Seconds between stat collections
        """
        logger.info(f"Starting pool monitoring (interval: {interval_seconds}s)")
        while True:
            try:
                stats = self.get_stats()
                logger.debug(
                    f"Pool stats: {stats.checkedout}/{stats.total_capacity} "
                    f"({stats.usage_percent:.1%})"
                )
                await asyncio.sleep(interval_seconds)
            except asyncio.CancelledError:
                logger.info("Pool monitoring stopped")
                break
            except Exception as e:
                logger.error(f"Pool monitoring error: {e}")
                await asyncio.sleep(interval_seconds)
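

# Sketch: start_monitoring() is intended to run as an asyncio background task
# and exits cleanly when cancelled. The task variable and monitor instance
# below are illustrative assumptions about the caller's code:
#
#     monitor_task = asyncio.create_task(monitor.start_monitoring(interval_seconds=60))
#     # ... later, on application shutdown:
#     monitor_task.cancel()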


# Global instance (initialized in main.py)
pool_monitor: Optional[PoolMonitor] = None


def init_pool_monitor(engine: AsyncEngine) -> PoolMonitor:
    """
    Initialize global pool monitor.

    Should be called during application startup.

    Args:
        engine: SQLAlchemy async engine to monitor

    Returns:
        Initialized PoolMonitor instance
    """
    global pool_monitor
    pool_monitor = PoolMonitor(engine)
    logger.info("Pool monitor initialized")
    return pool_monitor
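

# Sketch: init_pool_monitor() is meant to be called once at application
# startup (in main.py, per the comment above) before request handlers read
# the module-level pool_monitor. The engine import below is an illustrative
# assumption about where the async engine lives:
#
#     from app.database import engine  # hypothetical module path
#     from app.monitoring.pool_monitor import init_pool_monitor
#
#     monitor = init_pool_monitor(engine)
#     # then launch monitor.start_monitoring() as a background task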