strat-gameplay-webapp/backend/app/monitoring/pool_monitor.py
"""
Database connection pool monitoring.
Monitors SQLAlchemy async connection pool health and provides
statistics for observability and alerting.
Key features:
- Real-time pool statistics (checked in/out, overflow)
- Health status classification (healthy/warning/critical)
- Historical stats tracking
- Background monitoring with configurable interval
- Warning logs when pool usage exceeds threshold
Author: Claude
Date: 2025-11-27
"""
import asyncio
import logging
from dataclasses import dataclass, field
from typing import Optional
import pendulum
from sqlalchemy.ext.asyncio import AsyncEngine
from app.config import get_settings
logger = logging.getLogger(f"{__name__}.PoolMonitor")


@dataclass
class PoolStats:
    """
    Connection pool statistics snapshot.

    Captures the current state of the database connection pool
    for monitoring and alerting purposes.
    """

    pool_size: int
    max_overflow: int
    checkedin: int  # Available connections
    checkedout: int  # In-use connections
    overflow: int  # Overflow connections in use
    total_capacity: int
    usage_percent: float  # Fraction of total capacity in use (0.0-1.0)
    timestamp: pendulum.DateTime = field(default_factory=lambda: pendulum.now("UTC"))


class PoolMonitor:
    """
    Monitor database connection pool health.

    Provides real-time statistics and health status for the SQLAlchemy
    connection pool. Useful for detecting pool exhaustion before it
    causes request failures.

    Usage:
        monitor = PoolMonitor(engine)
        stats = monitor.get_stats()
        health = monitor.get_health_status()
    """

    def __init__(
        self,
        engine: AsyncEngine,
        alert_threshold: float = 0.8,
        max_history: int = 100,
    ):
        """
        Initialize pool monitor.

        Args:
            engine: SQLAlchemy async engine to monitor
            alert_threshold: Pool usage fraction that triggers a warning log (0.8 = 80%)
            max_history: Maximum stats snapshots to keep in history
        """
        self._engine = engine
        self._stats_history: list[PoolStats] = []
        self._max_history = max_history
        self._alert_threshold = alert_threshold
        self._settings = get_settings()

    def get_stats(self) -> PoolStats:
        """
        Get current pool statistics.

        Returns:
            PoolStats with current pool state
        """
        pool = self._engine.pool
        checkedin = pool.checkedin()
        checkedout = pool.checkedout()
        overflow = pool.overflow()
        total_capacity = self._settings.db_pool_size + self._settings.db_max_overflow
        usage_percent = checkedout / total_capacity if total_capacity > 0 else 0

        stats = PoolStats(
            pool_size=self._settings.db_pool_size,
            max_overflow=self._settings.db_max_overflow,
            checkedin=checkedin,
            checkedout=checkedout,
            overflow=overflow,
            total_capacity=total_capacity,
            usage_percent=usage_percent,
        )

        # Record history
        self._stats_history.append(stats)
        if len(self._stats_history) > self._max_history:
            self._stats_history.pop(0)

        # Check for alerts
        if usage_percent >= self._alert_threshold:
            logger.warning(
                f"Connection pool usage high: {usage_percent:.1%} "
                f"({checkedout}/{total_capacity})"
            )
        if overflow > 0:
            logger.info(f"Pool overflow active: {overflow} overflow connections")

        return stats
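
    # Worked example (hypothetical numbers): with db_pool_size=5 and
    # db_max_overflow=10, total_capacity is 15. If 12 connections are checked
    # out, usage_percent is 12 / 15 = 0.8, which meets the default
    # alert_threshold of 0.8 and emits the warning log above.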

    def get_health_status(self) -> dict:
        """
        Get pool health status for monitoring endpoint.

        Returns:
            Dict with status, statistics, and timestamp
        """
        stats = self.get_stats()

        if stats.usage_percent >= 0.9:
            status = "critical"
        elif stats.usage_percent >= 0.75:
            status = "warning"
        else:
            status = "healthy"

        return {
            "status": status,
            "pool_size": stats.pool_size,
            "max_overflow": stats.max_overflow,
            "available": stats.checkedin,
            "in_use": stats.checkedout,
            "overflow_active": stats.overflow,
            "total_capacity": stats.total_capacity,
            "usage_percent": round(stats.usage_percent * 100, 1),
            "timestamp": stats.timestamp.isoformat(),
        }
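
    # Sketch: one way the health payload could be exposed over HTTP, assuming
    # init_pool_monitor() has already run. The route path and FastAPI app
    # object are illustrative assumptions, not part of this module:
    #
    #     @app.get("/health/db")
    #     async def db_pool_health() -> dict:
    #         return pool_monitor.get_health_status()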

    def get_history(self, limit: int = 10) -> list[dict]:
        """
        Get recent stats history.

        Args:
            limit: Maximum number of history entries to return

        Returns:
            List of stats snapshots
        """
        return [
            {
                "checkedout": s.checkedout,
                "usage_percent": round(s.usage_percent * 100, 1),
                "timestamp": s.timestamp.isoformat(),
            }
            for s in self._stats_history[-limit:]
        ]

    async def start_monitoring(self, interval_seconds: int = 60):
        """
        Background task to periodically collect stats.

        Useful for continuous logging and alerting. Runs until cancelled.

        Args:
            interval_seconds: Seconds between stat collections
        """
        logger.info(f"Starting pool monitoring (interval: {interval_seconds}s)")
        while True:
            try:
                stats = self.get_stats()
                logger.debug(
                    f"Pool stats: {stats.checkedout}/{stats.total_capacity} "
                    f"({stats.usage_percent:.1%})"
                )
                await asyncio.sleep(interval_seconds)
            except asyncio.CancelledError:
                logger.info("Pool monitoring stopped")
                break
            except Exception as e:
                logger.error(f"Pool monitoring error: {e}")
                await asyncio.sleep(interval_seconds)
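

# Sketch: start_monitoring() is intended to run as an asyncio background task
# and exits cleanly when cancelled. The task variable and monitor instance
# below are illustrative assumptions about the caller's code:
#
#     monitor_task = asyncio.create_task(monitor.start_monitoring(interval_seconds=60))
#     # ... later, on application shutdown:
#     monitor_task.cancel()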


# Global instance (initialized in main.py)
pool_monitor: Optional[PoolMonitor] = None


def init_pool_monitor(engine: AsyncEngine) -> PoolMonitor:
    """
    Initialize global pool monitor.

    Should be called during application startup.

    Args:
        engine: SQLAlchemy async engine to monitor

    Returns:
        Initialized PoolMonitor instance
    """
    global pool_monitor
    pool_monitor = PoolMonitor(engine)
    logger.info("Pool monitor initialized")
    return pool_monitor
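

# Sketch: init_pool_monitor() is meant to be called once at application
# startup (in main.py, per the comment above) before request handlers read
# the module-level pool_monitor. The engine import below is an illustrative
# assumption about where the async engine lives:
#
#     from app.database import engine  # hypothetical module path
#     from app.monitoring.pool_monitor import init_pool_monitor
#
#     monitor = init_pool_monitor(engine)
#     # then launch monitor.start_monitoring() as a background task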