claude-home/monitoring/scripts/jellyfin_gpu_monitor.py
Cal Corum 3112b3d6fe CLAUDE: Add Jellyfin GPU health monitor with auto-restart
- Created jellyfin_gpu_monitor.py for detecting lost GPU access
- Sends Discord alerts when GPU access fails
- Auto-restarts container to restore GPU binding
- Runs every 5 minutes via cron on ubuntu-manticore
- Documents FFmpeg exit code 187 (NVENC failure) in troubleshooting

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-28 22:57:04 -06:00
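The commit message notes a 5-minute cron cadence on ubuntu-manticore. A crontab entry along these lines would match that schedule (interpreter path, install prefix, and log destination are assumptions, not confirmed by this page):

    */5 * * * * /usr/bin/python3 /path/to/claude-home/monitoring/scripts/jellyfin_gpu_monitor.py --check --discord-alerts --auto-restart >> /var/log/jellyfin_gpu_monitor.log 2>&1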

#!/usr/bin/env python3
"""
Jellyfin GPU Health Monitor with Discord Alerts

Monitors the Jellyfin container's GPU access and sends Discord notifications
when GPU access is lost. Optionally auto-restarts the container.

The GTX 1070 in ubuntu-manticore is shared between Jellyfin and Tdarr.
GPU access can be lost after driver updates, Docker restarts, or other
runtime issues - this monitor detects that condition before users report
playback failures.

Usage:
    # Basic health check
    python3 jellyfin_gpu_monitor.py --check

    # Check with Discord alerts
    python3 jellyfin_gpu_monitor.py --check --discord-alerts

    # Check and auto-restart if GPU lost
    python3 jellyfin_gpu_monitor.py --check --auto-restart

    # Full monitoring with alerts and auto-restart
    python3 jellyfin_gpu_monitor.py --check --discord-alerts --auto-restart

    # Test Discord integration
    python3 jellyfin_gpu_monitor.py --discord-test
"""
import argparse
import json
import logging
import subprocess
import sys
import time
from dataclasses import dataclass, asdict
from datetime import datetime
from typing import Optional

import requests

@dataclass
class GPUStatus:
    """GPU visibility as seen from inside the container."""
    timestamp: str
    container_name: str
    gpu_accessible: bool
    gpu_name: Optional[str] = None
    gpu_temp: Optional[int] = None
    driver_version: Optional[str] = None
    cuda_version: Optional[str] = None
    error: Optional[str] = None


@dataclass
class ContainerStatus:
    """Docker container run state."""
    running: bool
    status: str
    uptime: Optional[str] = None
    error: Optional[str] = None


@dataclass
class HealthStatus:
    """Combined result of a single health check."""
    timestamp: str
    overall_healthy: bool
    gpu_status: GPUStatus
    container_status: ContainerStatus
    action_taken: Optional[str] = None

class DiscordNotifier:
    """Sends embed-style alerts to a Discord webhook."""

    def __init__(self, webhook_url: str, timeout: int = 10):
        self.webhook_url = webhook_url
        self.timeout = timeout
        self.logger = logging.getLogger(f"{__name__}.DiscordNotifier")

    def send_alert(self, title: str, description: str, color: int = 0xff6b6b,
                   fields: Optional[list] = None) -> bool:
        """Send an embed alert to Discord."""
        embed = {
            "title": title,
            "description": description,
            "color": color,
            "timestamp": datetime.now().isoformat(),
            "fields": fields or []
        }
        payload = {
            "username": "Jellyfin GPU Monitor",
            "embeds": [embed]
        }
        try:
            response = requests.post(
                self.webhook_url,
                json=payload,
                timeout=self.timeout
            )
            response.raise_for_status()
            self.logger.info("Discord notification sent successfully")
            return True
        except Exception as e:
            self.logger.error(f"Failed to send Discord notification: {e}")
            return False

    def send_gpu_lost_alert(self, gpu_status: GPUStatus, auto_restart: bool) -> bool:
        """Send an alert when GPU access is lost."""
        action = ("Container will be automatically restarted"
                  if auto_restart else "Manual intervention required")
        fields = [
            {"name": "Container", "value": gpu_status.container_name, "inline": True},
            {"name": "Error", "value": gpu_status.error or "Unknown", "inline": True},
            {"name": "Action", "value": action, "inline": False}
        ]
        return self.send_alert(
            title="Jellyfin GPU Access Lost",
            description=("The Jellyfin container has lost access to the NVIDIA GPU. "
                         "Transcoding will fail until resolved."),
            color=0xff6b6b,  # Red
            fields=fields
        )

    def send_gpu_restored_alert(self, gpu_status: GPUStatus) -> bool:
        """Send an alert when GPU access is restored."""
        fields = [
            {"name": "Container", "value": gpu_status.container_name, "inline": True},
            {"name": "GPU", "value": gpu_status.gpu_name or "Unknown", "inline": True},
            {"name": "Driver", "value": gpu_status.driver_version or "Unknown", "inline": True}
        ]
        return self.send_alert(
            title="Jellyfin GPU Access Restored",
            description="GPU access has been restored. Hardware transcoding should now work.",
            color=0x28a745,  # Green
            fields=fields
        )

class JellyfinGPUMonitor:
    """Checks container and GPU health, optionally restarting Jellyfin."""

    def __init__(self, container_name: str = "jellyfin",
                 discord_webhook: Optional[str] = None,
                 enable_discord: bool = False,
                 auto_restart: bool = False,
                 ssh_host: Optional[str] = None):
        self.container_name = container_name
        self.auto_restart = auto_restart
        self.ssh_host = ssh_host
        self.logger = logging.getLogger(__name__)
        self.discord = None
        if enable_discord and discord_webhook:
            self.discord = DiscordNotifier(discord_webhook)

    def _run_command(self, cmd: list, timeout: int = 30) -> tuple:
        """Run a command locally, or on the remote host when ssh_host is set."""
        if self.ssh_host:
            # Collapse the argv into a single remote command string for ssh;
            # subprocess still receives an argv list, so shell=True is not needed.
            cmd = ["ssh", self.ssh_host, " ".join(cmd)]
        try:
            result = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                timeout=timeout
            )
            return result.returncode, result.stdout.strip(), result.stderr.strip()
        except subprocess.TimeoutExpired:
            return -1, "", "Command timed out"
        except Exception as e:
            return -1, "", str(e)
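
    # Example of the SSH wrapping above: with ssh_host="ubuntu-manticore"
    # (the host named in the commit message), ["docker", "ps"] becomes
    # ["ssh", "ubuntu-manticore", "docker ps"] and executes on the remote host.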

    def check_container_status(self) -> ContainerStatus:
        """Check if the Jellyfin container is running."""
        cmd = ["docker", "inspect", "--format",
               "{{.State.Running}}|{{.State.Status}}|{{.State.StartedAt}}",
               self.container_name]
        code, stdout, stderr = self._run_command(cmd)
        if code != 0:
            return ContainerStatus(
                running=False,
                status="not_found",
                error=stderr or "Container not found"
            )
        parts = stdout.split("|")
        running = parts[0].lower() == "true"
        status = parts[1] if len(parts) > 1 else "unknown"
        started_at = parts[2] if len(parts) > 2 else None
        return ContainerStatus(
            running=running,
            status=status,
            uptime=started_at
        )

    def check_gpu_access(self) -> GPUStatus:
        """Check if the container has GPU access via nvidia-smi."""
        timestamp = datetime.now().isoformat()
        # Run nvidia-smi inside the container
        cmd = ["docker", "exec", self.container_name, "nvidia-smi",
               "--query-gpu=name,temperature.gpu,driver_version",
               "--format=csv,noheader,nounits"]
        code, stdout, stderr = self._run_command(cmd)
        if code != 0:
            # Try basic nvidia-smi to get more error info
            cmd_basic = ["docker", "exec", self.container_name, "nvidia-smi"]
            _, _, stderr_basic = self._run_command(cmd_basic)
            return GPUStatus(
                timestamp=timestamp,
                container_name=self.container_name,
                gpu_accessible=False,
                error=stderr_basic or stderr or "nvidia-smi failed"
            )
        # Parse nvidia-smi output
        try:
            parts = [p.strip() for p in stdout.split(",")]
            gpu_name = parts[0] if len(parts) > 0 else None
            gpu_temp = int(parts[1]) if len(parts) > 1 and parts[1].isdigit() else None
            driver_version = parts[2] if len(parts) > 2 else None
            return GPUStatus(
                timestamp=timestamp,
                container_name=self.container_name,
                gpu_accessible=True,
                gpu_name=gpu_name,
                gpu_temp=gpu_temp,
                driver_version=driver_version
            )
        except Exception as e:
            return GPUStatus(
                timestamp=timestamp,
                container_name=self.container_name,
                gpu_accessible=True,  # nvidia-smi worked, just parsing failed
                error=f"Parse error: {e}"
            )

    def restart_container(self) -> bool:
        """Restart the Jellyfin container."""
        self.logger.info(f"Restarting container: {self.container_name}")
        cmd = ["docker", "restart", self.container_name]
        code, stdout, stderr = self._run_command(cmd, timeout=120)
        if code == 0:
            self.logger.info("Container restarted successfully")
            return True
        else:
            self.logger.error(f"Failed to restart container: {stderr}")
            return False

    def health_check(self) -> HealthStatus:
        """Perform a full health check: container state first, then GPU access."""
        timestamp = datetime.now().isoformat()
        action_taken = None
        # Check the container first - no point probing the GPU if it is down
        container_status = self.check_container_status()
        if not container_status.running:
            gpu_status = GPUStatus(
                timestamp=timestamp,
                container_name=self.container_name,
                gpu_accessible=False,
                error="Container not running"
            )
            return HealthStatus(
                timestamp=timestamp,
                overall_healthy=False,
                gpu_status=gpu_status,
                container_status=container_status
            )
        # Check GPU access
        gpu_status = self.check_gpu_access()
        # Handle lost GPU access: alert, then optionally restart and re-check
        if not gpu_status.gpu_accessible:
            self.logger.warning(f"GPU access lost: {gpu_status.error}")
            if self.discord:
                self.discord.send_gpu_lost_alert(gpu_status, self.auto_restart)
            if self.auto_restart:
                if self.restart_container():
                    action_taken = "Container restarted"
                    # Re-check GPU after the container has had time to initialize
                    time.sleep(5)
                    gpu_status = self.check_gpu_access()
                    container_status = self.check_container_status()
                    if gpu_status.gpu_accessible and self.discord:
                        self.discord.send_gpu_restored_alert(gpu_status)
                else:
                    action_taken = "Restart failed"
        overall_healthy = (
            container_status.running and
            gpu_status.gpu_accessible
        )
        return HealthStatus(
            timestamp=timestamp,
            overall_healthy=overall_healthy,
            gpu_status=gpu_status,
            container_status=container_status,
            action_taken=action_taken
        )
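

# Exit-code contract for the cron wrapper: main() exits 0 when the check passes
# and 1 when it fails (or when a --discord-test send fails), so cron-side
# tooling can alert on any non-zero status.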
def main():
    parser = argparse.ArgumentParser(description='Monitor Jellyfin GPU health')
    parser.add_argument('--container', default='jellyfin', help='Container name')
    parser.add_argument('--check', action='store_true', help='Perform health check')
    parser.add_argument('--discord-webhook',
                        default='https://discord.com/api/webhooks/1404105821549498398/y2Ud1RK9rzFjv58xbypUfQNe3jrL7ZUq1FkQHa4_dfOHm2ylp93z0f4tY0O8Z-vQgKhD',
                        help='Discord webhook URL')
    parser.add_argument('--discord-alerts', action='store_true', help='Enable Discord alerts')
    parser.add_argument('--discord-test', action='store_true', help='Test Discord integration')
    parser.add_argument('--auto-restart', action='store_true', help='Auto-restart on GPU loss')
    parser.add_argument('--ssh-host', default=None, help='SSH host for remote monitoring')
    parser.add_argument('--output', choices=['json', 'pretty'], default='pretty')
    parser.add_argument('--verbose', action='store_true', help='Verbose logging')
    args = parser.parse_args()

    # Configure logging
    level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(
        level=level,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    )

    # Discord test
    if args.discord_test:
        notifier = DiscordNotifier(args.discord_webhook)
        success = notifier.send_alert(
            title="Jellyfin GPU Monitor Test",
            description="Discord integration is working correctly.",
            color=0x00ff00,
            fields=[
                {"name": "Container", "value": args.container, "inline": True},
                {"name": "Status", "value": "Test successful", "inline": True}
            ]
        )
        sys.exit(0 if success else 1)

    # Health check
    if args.check:
        monitor = JellyfinGPUMonitor(
            container_name=args.container,
            discord_webhook=args.discord_webhook,
            enable_discord=args.discord_alerts,
            auto_restart=args.auto_restart,
            ssh_host=args.ssh_host
        )
        result = monitor.health_check()
        if args.output == 'json':
            print(json.dumps(asdict(result), indent=2))
        else:
            print(f"=== Jellyfin GPU Health Check - {result.timestamp} ===")
            status_icon = "✅" if result.overall_healthy else "❌"
            print(f"Overall: {status_icon} {'Healthy' if result.overall_healthy else 'UNHEALTHY'}")
            print(f"\nContainer: {result.container_status.status}")
            print(f"GPU Access: {'Yes' if result.gpu_status.gpu_accessible else 'NO'}")
            if result.gpu_status.gpu_accessible:
                print(f"GPU: {result.gpu_status.gpu_name}")
                print(f"Temperature: {result.gpu_status.gpu_temp}°C")
                print(f"Driver: {result.gpu_status.driver_version}")
            else:
                print(f"Error: {result.gpu_status.error}")
            if result.action_taken:
                print(f"\nAction: {result.action_taken}")
        sys.exit(0 if result.overall_healthy else 1)

    parser.print_help()


if __name__ == '__main__':
    main()
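
# Illustrative shape of --output json, per the dataclasses above (values are
# examples, not captured output):
# {
#   "timestamp": "2026-01-28T22:57:04",
#   "overall_healthy": true,
#   "gpu_status": {
#     "timestamp": "...", "container_name": "jellyfin", "gpu_accessible": true,
#     "gpu_name": "NVIDIA GeForce GTX 1070", "gpu_temp": 45,
#     "driver_version": "...", "cuda_version": null, "error": null
#   },
#   "container_status": {"running": true, "status": "running",
#                        "uptime": "...", "error": null},
#   "action_taken": null
# }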