Frontend UX improvements: - Single-click Discord OAuth from home page (no intermediate /auth page) - Auto-redirect authenticated users from home to /games - Fixed Nuxt layout system - app.vue now wraps NuxtPage with NuxtLayout - Games page now has proper card container with shadow/border styling - Layout header includes working logout with API cookie clearing Games list enhancements: - Display team names (lname) instead of just team IDs - Show current score for each team - Show inning indicator (Top/Bot X) for active games - Responsive header with wrapped buttons on mobile Backend improvements: - Added team caching to SbaApiClient (1-hour TTL) - Enhanced GameListItem with team names, scores, inning data - Games endpoint now enriches response with SBA API team data Docker optimizations: - Optimized Dockerfile using --chown flag on COPY (faster than chown -R) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
12 KiB
12 KiB
Plan 012: Connection Pool Monitoring
Priority: MEDIUM Effort: 2 hours Status: NOT STARTED Risk Level: LOW - Observability
Problem Statement
No monitoring exists for database connection pool usage:
- Can't detect pool exhaustion before it causes failures
- No visibility into connection health
- No alerting for connection issues
Impact
- Reliability: Pool exhaustion causes request failures
- Debugging: Hard to diagnose connection issues
- Capacity: Can't plan for scaling needs
Current Configuration
# config.py
db_pool_size: int = 20
db_max_overflow: int = 10
# Total capacity: 30 connections
Implementation Steps
Step 1: Create Pool Monitor (30 min)
Create backend/app/monitoring/pool_monitor.py:
"""Database connection pool monitoring."""
import asyncio
import logging
from collections import deque
from dataclasses import dataclass
from datetime import datetime, timezone
from typing import Optional

from sqlalchemy.ext.asyncio import AsyncEngine

from app.config import settings
# Module-level logger, namespaced under this module for log filtering.
logger = logging.getLogger(f"{__name__}.PoolMonitor")
@dataclass
class PoolStats:
    """Point-in-time snapshot of connection pool statistics."""

    pool_size: int        # Configured base pool size
    max_overflow: int     # Configured extra connections beyond the base pool
    checkedin: int        # Available connections
    checkedout: int       # In-use connections
    overflow: int         # Overflow connections in use
    total_capacity: int   # pool_size + max_overflow
    usage_percent: float  # checkedout / total_capacity, in [0, 1]
    timestamp: datetime   # When this sample was taken
class PoolMonitor:
    """
    Monitor database connection pool health.

    Collects point-in-time pool statistics, keeps a bounded history of
    recent samples, and logs alerts when usage crosses the threshold.
    """

    def __init__(self, engine: AsyncEngine):
        """
        Args:
            engine: Async engine whose connection pool is observed.
        """
        self._engine = engine
        self._max_history = 100
        # deque(maxlen=...) drops the oldest sample automatically,
        # replacing the O(n) list.pop(0) trimming of a plain list.
        self._stats_history: deque[PoolStats] = deque(maxlen=self._max_history)
        self._alert_threshold = 0.8  # 80% usage triggers a warning log

    def get_stats(self) -> PoolStats:
        """Collect, record, and return current pool statistics.

        Side effects: appends the sample to the bounded history, logs a
        warning on high usage and an info line while overflow is active.
        """
        pool = self._engine.pool
        checkedin = pool.checkedin()
        checkedout = pool.checkedout()
        overflow = pool.overflow()
        total_capacity = settings.db_pool_size + settings.db_max_overflow
        # Guard against a misconfigured zero-capacity pool.
        usage_percent = checkedout / total_capacity if total_capacity > 0 else 0
        stats = PoolStats(
            pool_size=settings.db_pool_size,
            max_overflow=settings.db_max_overflow,
            checkedin=checkedin,
            checkedout=checkedout,
            overflow=overflow,
            total_capacity=total_capacity,
            usage_percent=usage_percent,
            # Timezone-aware UTC: datetime.utcnow() is deprecated since
            # Python 3.12 and produced naive timestamps.
            timestamp=datetime.now(timezone.utc)
        )
        # Record history; the deque enforces _max_history automatically.
        self._stats_history.append(stats)
        # Check for alerts
        if usage_percent >= self._alert_threshold:
            logger.warning(
                f"Connection pool usage high: {usage_percent:.1%} "
                f"({checkedout}/{total_capacity})"
            )
        if overflow > 0:
            logger.info(f"Pool overflow active: {overflow} overflow connections")
        return stats

    def get_health_status(self) -> dict:
        """Get pool health status for a monitoring endpoint.

        Returns:
            JSON-serializable dict with a coarse "status" plus raw pool
            numbers and an ISO-8601 timestamp.
        """
        stats = self.get_stats()
        # NOTE(review): the 0.75 warning cutoff here differs from the 0.8
        # _alert_threshold used for log alerts — confirm this is intended.
        if stats.usage_percent >= 0.9:
            status = "critical"
        elif stats.usage_percent >= 0.75:
            status = "warning"
        else:
            status = "healthy"
        return {
            "status": status,
            "pool_size": stats.pool_size,
            "max_overflow": stats.max_overflow,
            "available": stats.checkedin,
            "in_use": stats.checkedout,
            "overflow_active": stats.overflow,
            "total_capacity": stats.total_capacity,
            "usage_percent": round(stats.usage_percent * 100, 1),
            "timestamp": stats.timestamp.isoformat()
        }

    def get_history(self, limit: int = 10) -> list[dict]:
        """Get up to ``limit`` of the most recent stats samples."""
        recent = list(self._stats_history)[-limit:]
        return [
            {
                "checkedout": s.checkedout,
                "usage_percent": round(s.usage_percent * 100, 1),
                "timestamp": s.timestamp.isoformat()
            }
            for s in recent
        ]

    async def start_monitoring(self, interval_seconds: int = 60):
        """
        Background task to periodically collect stats.

        Runs until cancelled; collection errors are logged and do not
        stop the loop.

        Args:
            interval_seconds: Delay between samples.
        """
        while True:
            try:
                stats = self.get_stats()
                logger.debug(
                    f"Pool stats: {stats.checkedout}/{stats.total_capacity} "
                    f"({stats.usage_percent:.1%})"
                )
                await asyncio.sleep(interval_seconds)
            except asyncio.CancelledError:
                logger.info("Pool monitoring stopped")
                break
            except Exception as e:
                # Keep the monitor alive across transient failures.
                logger.error(f"Pool monitoring error: {e}")
                await asyncio.sleep(interval_seconds)
# Global instance (initialized in main.py via init_pool_monitor()).
pool_monitor: Optional[PoolMonitor] = None


def init_pool_monitor(engine: AsyncEngine) -> PoolMonitor:
    """Initialize the module-global pool monitor and return it.

    Args:
        engine: The async engine whose pool will be monitored.
    """
    global pool_monitor
    pool_monitor = PoolMonitor(engine)
    return pool_monitor
Step 2: Add Health Endpoint (20 min)
Update backend/app/api/routes.py:
from app.monitoring.pool_monitor import pool_monitor
@router.get("/health/database")
async def database_health():
    """
    Database connection pool health.

    Returns:
        Pool statistics and health status, plus the five most recent
        history samples. Reports "unknown" if monitoring never started.
    """
    if not pool_monitor:
        return {"status": "unknown", "message": "Pool monitor not initialized"}
    return {
        **pool_monitor.get_health_status(),
        "recent_history": pool_monitor.get_history(limit=5),
    }
@router.get("/health")
async def overall_health():
    """
    Overall application health including database.

    Aggregates per-component statuses into a single top-level status,
    worst state winning.
    """
    if pool_monitor:
        db_health = pool_monitor.get_health_status()
    else:
        db_health = {"status": "unknown"}

    # Fold component statuses into one verdict (list kept so more
    # components can be appended later).
    statuses = [db_health.get("status", "unknown")]
    if "critical" in statuses:
        overall = "critical"
    elif "warning" in statuses:
        overall = "warning"
    elif "unknown" in statuses:
        overall = "degraded"
    else:
        overall = "healthy"

    return {
        "status": overall,
        "components": {"database": db_health},
    }
Step 3: Initialize in Application (15 min)
Update backend/app/main.py:
from app.database.session import engine
from app.monitoring.pool_monitor import init_pool_monitor, pool_monitor
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Start pool monitoring at startup; cancel it cleanly at shutdown."""
    # Set up the monitor before the app begins serving requests.
    monitor = init_pool_monitor(engine)
    background_task = asyncio.create_task(
        monitor.start_monitoring(interval_seconds=60)
    )
    logger.info("Pool monitoring started")

    yield

    # Cancel the background loop and wait for the cancellation to land.
    background_task.cancel()
    try:
        await background_task
    except asyncio.CancelledError:
        pass
    logger.info("Pool monitoring stopped")
Step 4: Add Connection Recycling (15 min)
Update backend/app/database/session.py:
from sqlalchemy.ext.asyncio import create_async_engine
from app.config import settings
# Shared async engine; pool sizing and health settings come from config.
engine = create_async_engine(
    settings.database_url,
    echo=settings.debug,                    # SQL echo only in debug mode
    pool_size=settings.db_pool_size,
    max_overflow=settings.db_max_overflow,
    # Connection health settings
    pool_pre_ping=True,   # Test connection liveness before each checkout
    pool_recycle=3600,    # Recycle connections after 1 hour
    pool_timeout=30,      # Wait at most 30s for a free connection
)
Step 5: Add Prometheus Metrics (Optional) (30 min)
Create backend/app/monitoring/metrics.py:
"""Prometheus metrics for monitoring."""
from prometheus_client import Gauge, Counter, Histogram
# Connection pool metrics (updated from PoolMonitor samples).
db_pool_size = Gauge(
    'db_pool_size',
    'Database connection pool size'
)
db_pool_checkedout = Gauge(
    'db_pool_checkedout',
    'Database connections currently in use'
)
db_pool_overflow = Gauge(
    'db_pool_overflow',
    'Database overflow connections in use'
)
db_pool_usage = Gauge(
    'db_pool_usage_percent',
    'Database connection pool usage percentage'
)

# Query metrics
db_query_duration = Histogram(
    'db_query_duration_seconds',
    'Database query duration',
    # Buckets tuned for sub-second OLTP-style queries.
    buckets=[0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0]
)
db_query_errors = Counter(
    'db_query_errors_total',
    'Total database query errors'
)
def update_pool_metrics(stats):
    """Publish a PoolStats sample to the Prometheus pool gauges."""
    for gauge, value in (
        (db_pool_size, stats.pool_size),
        (db_pool_checkedout, stats.checkedout),
        (db_pool_overflow, stats.overflow),
        (db_pool_usage, stats.usage_percent * 100),
    ):
        gauge.set(value)
Step 6: Write Tests (15 min)
Create backend/tests/unit/monitoring/test_pool_monitor.py:
"""Tests for connection pool monitoring."""
import pytest
from unittest.mock import MagicMock
from app.monitoring.pool_monitor import PoolMonitor, PoolStats
class TestPoolMonitor:
    """Unit tests for PoolMonitor."""

    @pytest.fixture
    def mock_engine(self):
        """Engine stub whose pool reports 5 of 30 connections in use."""
        fake_pool = MagicMock()
        fake_pool.checkedin.return_value = 15
        fake_pool.checkedout.return_value = 5
        fake_pool.overflow.return_value = 0
        fake_engine = MagicMock()
        fake_engine.pool = fake_pool
        return fake_engine

    def test_get_stats_returns_pool_stats(self, mock_engine):
        """get_stats returns PoolStats with correct values."""
        stats = PoolMonitor(mock_engine).get_stats()
        assert isinstance(stats, PoolStats)
        assert stats.checkedout == 5
        assert stats.checkedin == 15

    def test_health_status_healthy(self, mock_engine):
        """Health status is healthy when usage < 75%."""
        health = PoolMonitor(mock_engine).get_health_status()
        assert health["status"] == "healthy"

    def test_health_status_warning(self, mock_engine):
        """Health status is warning when usage 75-90%."""
        mock_engine.pool.checkedout.return_value = 24  # 80% of capacity
        mock_engine.pool.checkedin.return_value = 6
        health = PoolMonitor(mock_engine).get_health_status()
        assert health["status"] == "warning"

    def test_health_status_critical(self, mock_engine):
        """Health status is critical when usage >= 90%."""
        mock_engine.pool.checkedout.return_value = 28  # ~93% of capacity
        mock_engine.pool.checkedin.return_value = 2
        health = PoolMonitor(mock_engine).get_health_status()
        assert health["status"] == "critical"

    def test_history_tracked(self, mock_engine):
        """Stats history is tracked."""
        monitor = PoolMonitor(mock_engine)
        for _ in range(5):
            monitor.get_stats()
        assert len(monitor.get_history(limit=5)) == 5
Verification Checklist
- Pool monitor initialized on startup
- /health/database endpoint returns stats
- Background monitoring logs stats periodically
- High usage triggers warning log
- Pool recycling configured
- Tests pass
Monitoring Dashboard (Future)
Consider adding Grafana dashboard with:
- Pool usage over time
- Connection wait times
- Error rates
- Query latencies
Rollback Plan
Pool monitoring is additive - simply remove the health endpoint if issues arise.
Dependencies
- None (can be implemented independently)
Notes
- Consider alerting integration (PagerDuty, Slack)
- May want to add connection timeout tracking
- Future: Add slow query logging