Add HTTP health check endpoint for container monitoring
Implements a comprehensive health check system using aiohttp to support container orchestration and external monitoring systems.

Features:
- /health endpoint: Basic liveness check (is the process running?)
- /ready endpoint: Readiness check (is the bot connected to Discord?)
- /metrics endpoint: Detailed bot metrics (guilds, users, cogs, latency)

Changes:
- Add aiohttp to requirements.txt
- Create health_server.py module with HTTP server
- Update paperdynasty.py to run health server alongside bot
- Update docker-compose.yml with HTTP-based healthcheck
- Fix deploy.sh Docker image name

Benefits:
- Auto-restart on bot hangs/deadlocks
- Foundation for external monitoring (Prometheus, Grafana, etc.)
- Detailed diagnostics for troubleshooting
- Industry-standard health check pattern

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
parent
af49704272
commit
440f017c92
@ -2,7 +2,7 @@
|
|||||||
set -e # Exit on error
|
set -e # Exit on error
|
||||||
|
|
||||||
# Configuration
|
# Configuration
|
||||||
DOCKER_IMAGE="manticorum67/paper-dynasty"
|
DOCKER_IMAGE="manticorum67/paper-dynasty-discordapp"
|
||||||
REMOTE_HOST="sba-bots"
|
REMOTE_HOST="sba-bots"
|
||||||
REMOTE_PATH="/home/cal/container-data/paper-dynasty"
|
REMOTE_PATH="/home/cal/container-data/paper-dynasty"
|
||||||
DOCKERFILE_PATH="."
|
DOCKERFILE_PATH="."
|
||||||
|
|||||||
132
health_server.py
Normal file
132
health_server.py
Normal file
@ -0,0 +1,132 @@
|
|||||||
|
"""
|
||||||
|
HTTP health check server for Paper Dynasty Discord bot.
|
||||||
|
|
||||||
|
Provides health and readiness endpoints for container monitoring and orchestration.
|
||||||
|
"""
|
||||||
|
import asyncio
|
||||||
|
import logging
|
||||||
|
from typing import Optional
|
||||||
|
from aiohttp import web
|
||||||
|
import discord
|
||||||
|
from discord.ext import commands
|
||||||
|
|
||||||
|
logger = logging.getLogger('discord_app.health')
|
||||||
|
|
||||||
|
|
||||||
|
class HealthServer:
    """HTTP server for health checks and metrics.

    Routes registered on construction:
        GET /health   - liveness: 200 whenever the server can answer
        GET /ready    - readiness: 200 if ``bot.is_ready()``, otherwise 503
        GET /metrics  - JSON snapshot of bot state (guilds, users, cogs, latency)
    """

    def __init__(self, bot: commands.Bot, host: str = '0.0.0.0', port: int = 8080):
        """
        Initialize health server.

        Args:
            bot: Discord bot instance to monitor
            host: Host to bind to (default: 0.0.0.0 for container access)
            port: Port to listen on (default: 8080)
        """
        self.bot = bot
        self.host = host
        self.port = port
        self.app = web.Application()
        # Both stay None until start() succeeds and are reset to None by stop().
        self.runner: Optional[web.AppRunner] = None
        self.site: Optional[web.TCPSite] = None

        # Setup routes
        self.app.router.add_get('/health', self.health_check)
        self.app.router.add_get('/ready', self.readiness_check)
        self.app.router.add_get('/metrics', self.metrics)

    def _latency_ms(self) -> Optional[float]:
        """
        Return the bot's latency in milliseconds, rounded to 2 decimals.

        Returns None when no usable latency is available. discord.py can
        report a non-finite latency (nan / inf) while the gateway connection
        is not established; those values are truthy and would otherwise be
        serialized as ``NaN`` / ``Infinity``, which is not valid JSON.
        """
        # Local import keeps this class self-contained without touching the
        # module's import block.
        import math

        latency = self.bot.latency
        if latency is None or not math.isfinite(latency):
            return None
        return round(latency * 1000, 2)

    async def health_check(self, request: web.Request) -> web.Response:
        """
        Basic liveness check - is the process running?

        Returns 200 if the server is responsive.
        """
        return web.json_response({
            'status': 'healthy',
            'service': 'paper-dynasty-discord-bot'
        })

    async def readiness_check(self, request: web.Request) -> web.Response:
        """
        Readiness check - is the bot ready to serve requests?

        Returns:
            200 if bot is connected to Discord
            503 if bot is not ready
        """
        if not self.bot.is_ready():
            return web.json_response({
                'status': 'not_ready',
                'discord_connected': False
            }, status=503)

        return web.json_response({
            'status': 'ready',
            'discord_connected': True,
            'latency_ms': self._latency_ms()
        })

    async def metrics(self, request: web.Request) -> web.Response:
        """
        Return bot metrics for monitoring.

        Provides detailed information about bot state for external monitoring systems.
        """
        metrics_data = {
            'bot': {
                'is_ready': self.bot.is_ready(),
                'is_closed': self.bot.is_closed(),
                'latency_ms': self._latency_ms(),
            },
            'guilds': {
                'count': len(self.bot.guilds),
                'guild_ids': [g.id for g in self.bot.guilds]
            },
            'users': {
                'count': len(self.bot.users)
            },
            'cogs': {
                'loaded': list(self.bot.cogs.keys()),
                'count': len(self.bot.cogs)
            }
        }

        return web.json_response(metrics_data)

    async def start(self):
        """
        Start the health check server.

        Raises:
            Whatever aiohttp raises while setting up the runner or binding
            the socket (e.g. OSError when the port is already in use). The
            runner is cleaned up before the error propagates.
        """
        runner = web.AppRunner(self.app)
        await runner.setup()
        site = web.TCPSite(runner, self.host, self.port)
        try:
            await site.start()
        except Exception:
            # Do not leave a half-initialised runner behind if the bind fails.
            await runner.cleanup()
            raise

        # Publish the handles only once the server is actually listening.
        self.runner = runner
        self.site = site
        logger.info('Health check server started on %s:%s', self.host, self.port)

    async def stop(self):
        """Stop the health check server. Safe to call more than once."""
        if self.site:
            await self.site.stop()
            self.site = None
        if self.runner:
            await self.runner.cleanup()
            self.runner = None
        logger.info('Health check server stopped')
|
||||||
|
|
||||||
|
|
||||||
|
async def run_health_server(bot: commands.Bot, host: str = '0.0.0.0', port: int = 8080):
    """
    Run health server as a background task.

    Starts a HealthServer, then polls once per second until the bot reports
    it is closed. The server is stopped on the way out, including when this
    coroutine is cancelled.

    Args:
        bot: Discord bot instance
        host: Host to bind to
        port: Port to listen on

    Raises:
        Any exception raised while starting the server. It is logged first
        and then re-raised unchanged.
    """
    server = HealthServer(bot, host, port)
    try:
        await server.start()
    except Exception:
        # When this runs as a background task, the exception is otherwise
        # only seen when the task is awaited. Log it now so a failed
        # startup (e.g. port already in use) is visible immediately.
        logger.exception('Health check server failed to start on %s:%s', host, port)
        raise

    # Keep the server running until bot is closed
    try:
        while not bot.is_closed():
            await asyncio.sleep(1)
    finally:
        await server.stop()
|
||||||
@ -8,6 +8,7 @@ import os
|
|||||||
from discord.ext import commands
|
from discord.ext import commands
|
||||||
|
|
||||||
from in_game.gameplay_models import create_db_and_tables
|
from in_game.gameplay_models import create_db_and_tables
|
||||||
|
from health_server import run_health_server
|
||||||
|
|
||||||
raw_log_level = os.getenv('LOG_LEVEL')
|
raw_log_level = os.getenv('LOG_LEVEL')
|
||||||
if raw_log_level == 'DEBUG':
|
if raw_log_level == 'DEBUG':
|
||||||
@ -100,7 +101,21 @@ async def main():
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f'Failed to load cog: {c}')
|
logger.error(f'Failed to load cog: {c}')
|
||||||
logger.error(f'{e}')
|
logger.error(f'{e}')
|
||||||
|
|
||||||
|
# Start health server and bot concurrently
|
||||||
async with bot:
|
async with bot:
|
||||||
await bot.start(os.environ.get('BOT_TOKEN', 'NONE'))
|
# Create health server task
|
||||||
|
health_task = asyncio.create_task(run_health_server(bot))
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Start bot (this blocks until bot stops)
|
||||||
|
await bot.start(os.environ.get('BOT_TOKEN', 'NONE'))
|
||||||
|
finally:
|
||||||
|
# Cleanup: cancel health server when bot stops
|
||||||
|
health_task.cancel()
|
||||||
|
try:
|
||||||
|
await health_task
|
||||||
|
except asyncio.CancelledError:
|
||||||
|
pass
|
||||||
|
|
||||||
asyncio.run(main())
|
asyncio.run(main())
|
||||||
|
|||||||
@ -10,4 +10,5 @@ pytest
|
|||||||
pytest-asyncio
|
pytest-asyncio
|
||||||
pandas
|
pandas
|
||||||
psycopg2-binary
|
psycopg2-binary
|
||||||
|
aiohttp
|
||||||
# psycopg[binary]
|
# psycopg[binary]
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user