Add HTTP health check endpoint for container monitoring
Implements a comprehensive health check system using aiohttp to support container orchestration and external monitoring systems.

Features:
- /health endpoint: Basic liveness check (is process running?)
- /ready endpoint: Readiness check (is bot connected to Discord?)
- /metrics endpoint: Detailed bot metrics (guilds, users, cogs, latency)

Changes:
- Add aiohttp to requirements.txt
- Create health_server.py module with HTTP server
- Update paperdynasty.py to run health server alongside bot
- Update docker-compose.yml with HTTP-based healthcheck
- Fix deploy.sh Docker image name

Benefits:
- Auto-restart on bot hangs/deadlocks
- Foundation for external monitoring (Prometheus, Grafana, etc.)
- Detailed diagnostics for troubleshooting
- Industry-standard health check pattern

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
parent
af49704272
commit
440f017c92
@ -2,7 +2,7 @@
|
||||
set -e # Exit on error
|
||||
|
||||
# Configuration
|
||||
DOCKER_IMAGE="manticorum67/paper-dynasty"
|
||||
DOCKER_IMAGE="manticorum67/paper-dynasty-discordapp"
|
||||
REMOTE_HOST="sba-bots"
|
||||
REMOTE_PATH="/home/cal/container-data/paper-dynasty"
|
||||
DOCKERFILE_PATH="."
|
||||
|
||||
132
health_server.py
Normal file
132
health_server.py
Normal file
@ -0,0 +1,132 @@
|
||||
"""
|
||||
HTTP health check server for Paper Dynasty Discord bot.
|
||||
|
||||
Provides health and readiness endpoints for container monitoring and orchestration.
|
||||
"""
|
||||
import asyncio
|
||||
import logging
|
||||
from typing import Optional
|
||||
from aiohttp import web
|
||||
import discord
|
||||
from discord.ext import commands
|
||||
|
||||
logger = logging.getLogger('discord_app.health')
|
||||
|
||||
|
||||
class HealthServer:
    """HTTP server for health checks and metrics.

    Routes registered on construction:
        GET /health  - liveness probe; always answers 200 while the server runs
        GET /ready   - readiness probe; 200 when ``bot.is_ready()``, else 503
        GET /metrics - JSON snapshot of bot, guild, user, and cog state
    """

    def __init__(self, bot: commands.Bot, host: str = '0.0.0.0', port: int = 8080):
        """
        Initialize health server.

        Args:
            bot: Discord bot instance to monitor
            host: Host to bind to (default: 0.0.0.0 for container access)
            port: Port to listen on (default: 8080)
        """
        self.bot = bot
        self.host = host
        self.port = port
        self.app = web.Application()
        # Both handles stay None until start() succeeds and are reset by stop().
        self.runner: Optional[web.AppRunner] = None
        self.site: Optional[web.TCPSite] = None

        # Setup routes
        self.app.router.add_get('/health', self.health_check)
        self.app.router.add_get('/ready', self.readiness_check)
        self.app.router.add_get('/metrics', self.metrics)

    @staticmethod
    def _latency_ms(latency_seconds):
        """
        Convert a latency in seconds to milliseconds rounded to 2 decimals.

        Returns None when the value is falsy (None or 0, as before) or is not
        a finite number. Non-finite values must not reach the JSON encoder:
        by default it writes them as bare ``NaN`` / ``Infinity`` tokens, which
        strict JSON parsers reject.
        """
        import math  # local import so the block does not depend on file-level imports

        if not latency_seconds or not math.isfinite(latency_seconds):
            return None
        return round(latency_seconds * 1000, 2)

    async def health_check(self, request: web.Request) -> web.Response:
        """
        Basic liveness check - is the process running?

        Returns 200 if the server is responsive.
        """
        return web.json_response({
            'status': 'healthy',
            'service': 'paper-dynasty-discord-bot'
        })

    async def readiness_check(self, request: web.Request) -> web.Response:
        """
        Readiness check - is the bot ready to serve requests?

        Returns:
            200 if bot is connected to Discord
            503 if bot is not ready
        """
        if self.bot.is_ready():
            return web.json_response({
                'status': 'ready',
                'discord_connected': True,
                'latency_ms': self._latency_ms(self.bot.latency)
            })
        else:
            return web.json_response({
                'status': 'not_ready',
                'discord_connected': False
            }, status=503)

    async def metrics(self, request: web.Request) -> web.Response:
        """
        Return bot metrics for monitoring.

        Provides detailed information about bot state for external monitoring systems.
        """
        metrics_data = {
            'bot': {
                'is_ready': self.bot.is_ready(),
                'is_closed': self.bot.is_closed(),
                # This endpoint is served even when the bot is not ready, so the
                # latency may not be a usable number yet; the helper maps that to None.
                'latency_ms': self._latency_ms(self.bot.latency),
            },
            'guilds': {
                'count': len(self.bot.guilds),
                'guild_ids': [g.id for g in self.bot.guilds]
            },
            'users': {
                'count': len(self.bot.users)
            },
            'cogs': {
                'loaded': list(self.bot.cogs.keys()),
                'count': len(self.bot.cogs)
            }
        }

        return web.json_response(metrics_data)

    async def start(self):
        """
        Start the health check server.

        Raises:
            Any exception raised while setting up the runner or binding the
            socket (for example OSError when the port is already in use). The
            runner is cleaned up before the exception propagates.
        """
        runner = web.AppRunner(self.app)
        await runner.setup()
        site = web.TCPSite(runner, self.host, self.port)
        try:
            await site.start()
        except Exception:
            # Do not leave a set-up runner behind when the bind fails.
            await runner.cleanup()
            raise
        # Publish the handles only once the server is actually listening.
        self.runner = runner
        self.site = site
        logger.info(f'Health check server started on {self.host}:{self.port}')

    async def stop(self):
        """Stop the health check server. Safe to call more than once."""
        if self.site:
            await self.site.stop()
            self.site = None
        if self.runner:
            await self.runner.cleanup()
            self.runner = None
        logger.info('Health check server stopped')
|
||||
|
||||
|
||||
async def run_health_server(bot: commands.Bot, host: str = '0.0.0.0', port: int = 8080):
    """
    Run health server as a background task.

    Starts a HealthServer for ``bot`` and keeps it up until ``bot.is_closed()``
    returns True or the surrounding task is cancelled. The server is always
    stopped on the way out.

    Args:
        bot: Discord bot instance
        host: Host to bind to
        port: Port to listen on

    Raises:
        Whatever ``HealthServer.start()`` raises (for example when the port
        cannot be bound); ``server.stop()`` still runs first.
    """
    server = HealthServer(bot, host, port)

    try:
        # start() is inside the try so a failed or cancelled start-up still
        # reaches server.stop() and releases anything already set up.
        await server.start()

        # Keep the server running until bot is closed
        while not bot.is_closed():
            await asyncio.sleep(1)
    finally:
        await server.stop()
|
||||
@ -8,6 +8,7 @@ import os
|
||||
from discord.ext import commands
|
||||
|
||||
from in_game.gameplay_models import create_db_and_tables
|
||||
from health_server import run_health_server
|
||||
|
||||
raw_log_level = os.getenv('LOG_LEVEL')
|
||||
if raw_log_level == 'DEBUG':
|
||||
@ -100,7 +101,21 @@ async def main():
|
||||
except Exception as e:
|
||||
logger.error(f'Failed to load cog: {c}')
|
||||
logger.error(f'{e}')
|
||||
|
||||
# Start health server and bot concurrently
|
||||
async with bot:
|
||||
await bot.start(os.environ.get('BOT_TOKEN', 'NONE'))
|
||||
# Create health server task
|
||||
health_task = asyncio.create_task(run_health_server(bot))
|
||||
|
||||
try:
|
||||
# Start bot (this blocks until bot stops)
|
||||
await bot.start(os.environ.get('BOT_TOKEN', 'NONE'))
|
||||
finally:
|
||||
# Cleanup: cancel health server when bot stops
|
||||
health_task.cancel()
|
||||
try:
|
||||
await health_task
|
||||
except asyncio.CancelledError:
|
||||
pass
|
||||
|
||||
asyncio.run(main())
|
||||
|
||||
@ -10,4 +10,5 @@ pytest
|
||||
pytest-asyncio
|
||||
pandas
|
||||
psycopg2-binary
|
||||
aiohttp
|
||||
# psycopg[binary]
|
||||
|
||||
Loading…
Reference in New Issue
Block a user