- Created jellyfin_gpu_monitor.py for detecting lost GPU access
- Sends Discord alerts when GPU access fails
- Auto-restarts container to restore GPU binding
- Runs every 5 minutes via cron on ubuntu-manticore
- Documents FFmpeg exit code 187 (NVENC failure) in troubleshooting

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
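The 5-minute cron schedule mentioned above would look something like this on ubuntu-manticore (the script path and log file are illustrative assumptions, not taken from this commit):

*/5 * * * * /usr/bin/python3 /opt/scripts/jellyfin_gpu_monitor.py --check --discord-alerts --auto-restart >> /var/log/jellyfin_gpu_monitor.log 2>&1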
396 lines
14 KiB
Python
#!/usr/bin/env python3
"""
Jellyfin GPU Health Monitor with Discord Alerts

Monitors the Jellyfin container's GPU access and sends Discord notifications
when GPU access is lost. Optionally auto-restarts the container.

The GTX 1070 in ubuntu-manticore is shared between Jellyfin and Tdarr.
GPU access can be lost after driver updates, Docker restarts, or other
runtime issues - this monitor detects that condition before users report
playback failures.

Usage:
    # Basic health check
    python3 jellyfin_gpu_monitor.py --check

    # Check with Discord alerts
    python3 jellyfin_gpu_monitor.py --check --discord-alerts

    # Check and auto-restart if GPU lost
    python3 jellyfin_gpu_monitor.py --check --auto-restart

    # Full monitoring with alerts and auto-restart
    python3 jellyfin_gpu_monitor.py --check --discord-alerts --auto-restart

    # Test Discord integration
    python3 jellyfin_gpu_monitor.py --discord-test
"""
import argparse
import json
import logging
import os
import shlex
import subprocess
import sys
import time
from dataclasses import dataclass, asdict
from datetime import datetime
from typing import Optional

import requests


@dataclass
class GPUStatus:
    timestamp: str
    container_name: str
    gpu_accessible: bool
    gpu_name: Optional[str] = None
    gpu_temp: Optional[int] = None
    driver_version: Optional[str] = None
    cuda_version: Optional[str] = None
    error: Optional[str] = None


@dataclass
class ContainerStatus:
    running: bool
    status: str
    uptime: Optional[str] = None
    error: Optional[str] = None


@dataclass
class HealthStatus:
    timestamp: str
    overall_healthy: bool
    gpu_status: GPUStatus
    container_status: ContainerStatus
    action_taken: Optional[str] = None


class DiscordNotifier:
    def __init__(self, webhook_url: str, timeout: int = 10):
        self.webhook_url = webhook_url
        self.timeout = timeout
        self.logger = logging.getLogger(f"{__name__}.DiscordNotifier")

    def send_alert(self, title: str, description: str, color: int = 0xff6b6b,
                   fields: Optional[list] = None) -> bool:
        """Send an embed alert to Discord."""
        embed = {
            "title": title,
            "description": description,
            "color": color,
            "timestamp": datetime.now().isoformat(),
            "fields": fields or []
        }

        payload = {
            "username": "Jellyfin GPU Monitor",
            "embeds": [embed]
        }

        try:
            response = requests.post(
                self.webhook_url,
                json=payload,
                timeout=self.timeout
            )
            response.raise_for_status()
            self.logger.info("Discord notification sent successfully")
            return True
        except Exception as e:
            self.logger.error(f"Failed to send Discord notification: {e}")
            return False

    def send_gpu_lost_alert(self, gpu_status: GPUStatus, auto_restart: bool) -> bool:
        """Send alert when GPU access is lost."""
        action = "Container will be automatically restarted" if auto_restart else "Manual intervention required"

        fields = [
            {"name": "Container", "value": gpu_status.container_name, "inline": True},
            {"name": "Error", "value": gpu_status.error or "Unknown", "inline": True},
            {"name": "Action", "value": action, "inline": False}
        ]

        return self.send_alert(
            title="Jellyfin GPU Access Lost",
            description="The Jellyfin container has lost access to the NVIDIA GPU. Transcoding will fail until resolved.",
            color=0xff6b6b,  # Red
            fields=fields
        )

    def send_gpu_restored_alert(self, gpu_status: GPUStatus) -> bool:
        """Send alert when GPU access is restored."""
        fields = [
            {"name": "Container", "value": gpu_status.container_name, "inline": True},
            {"name": "GPU", "value": gpu_status.gpu_name or "Unknown", "inline": True},
            {"name": "Driver", "value": gpu_status.driver_version or "Unknown", "inline": True}
        ]

        return self.send_alert(
            title="Jellyfin GPU Access Restored",
            description="GPU access has been restored. Hardware transcoding should now work.",
            color=0x28a745,  # Green
            fields=fields
        )


class JellyfinGPUMonitor:
    def __init__(self, container_name: str = "jellyfin",
                 discord_webhook: Optional[str] = None,
                 enable_discord: bool = False,
                 auto_restart: bool = False,
                 ssh_host: Optional[str] = None):
        self.container_name = container_name
        self.auto_restart = auto_restart
        self.ssh_host = ssh_host
        self.logger = logging.getLogger(__name__)

        self.discord = None
        if enable_discord and discord_webhook:
            self.discord = DiscordNotifier(discord_webhook)

    def _run_command(self, cmd: list, timeout: int = 30) -> tuple:
        """Run a command locally, or on the remote host when ssh_host is set."""
        if self.ssh_host:
            # Quote the remote command so arguments survive the SSH hop, then
            # run ssh itself as a plain argv list. (The previous version set
            # shell=True on a list, which silently dropped arguments on POSIX.)
            cmd = ["ssh", self.ssh_host, shlex.join(cmd)]

        try:
            result = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                timeout=timeout
            )
            return result.returncode, result.stdout.strip(), result.stderr.strip()
        except subprocess.TimeoutExpired:
            return -1, "", "Command timed out"
        except Exception as e:
            return -1, "", str(e)

    def check_container_status(self) -> ContainerStatus:
        """Check if the Jellyfin container is running."""
        cmd = ["docker", "inspect", "--format",
               "{{.State.Running}}|{{.State.Status}}|{{.State.StartedAt}}",
               self.container_name]

        code, stdout, stderr = self._run_command(cmd)

        if code != 0:
            return ContainerStatus(
                running=False,
                status="not_found",
                error=stderr or "Container not found"
            )

        parts = stdout.split("|")
        running = parts[0].lower() == "true"
        status = parts[1] if len(parts) > 1 else "unknown"
        started_at = parts[2] if len(parts) > 2 else None

        return ContainerStatus(
            running=running,
            status=status,
            uptime=started_at
        )

    def check_gpu_access(self) -> GPUStatus:
        """Check if the container has GPU access via nvidia-smi."""
        timestamp = datetime.now().isoformat()

        # Run nvidia-smi inside the container
        cmd = ["docker", "exec", self.container_name, "nvidia-smi",
               "--query-gpu=name,temperature.gpu,driver_version",
               "--format=csv,noheader,nounits"]

        code, stdout, stderr = self._run_command(cmd)

        if code != 0:
            # Try bare nvidia-smi to get more error info
            cmd_basic = ["docker", "exec", self.container_name, "nvidia-smi"]
            _, _, stderr_basic = self._run_command(cmd_basic)

            return GPUStatus(
                timestamp=timestamp,
                container_name=self.container_name,
                gpu_accessible=False,
                error=stderr_basic or stderr or "nvidia-smi failed"
            )

        # Parse nvidia-smi output
        try:
            parts = [p.strip() for p in stdout.split(",")]
            gpu_name = parts[0] if len(parts) > 0 else None
            gpu_temp = int(parts[1]) if len(parts) > 1 and parts[1].isdigit() else None
            driver_version = parts[2] if len(parts) > 2 else None

            return GPUStatus(
                timestamp=timestamp,
                container_name=self.container_name,
                gpu_accessible=True,
                gpu_name=gpu_name,
                gpu_temp=gpu_temp,
                driver_version=driver_version
            )
        except Exception as e:
            return GPUStatus(
                timestamp=timestamp,
                container_name=self.container_name,
                gpu_accessible=True,  # nvidia-smi worked, just parsing failed
                error=f"Parse error: {e}"
            )
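    # Symptom note (from the commit message): when the container loses its GPU
    # binding, Jellyfin's FFmpeg transcodes typically fail with exit code 187
    # (NVENC failure). This check is intended to catch that state before users
    # hit playback errors.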
    def restart_container(self) -> bool:
        """Restart the Jellyfin container."""
        self.logger.info(f"Restarting container: {self.container_name}")

        cmd = ["docker", "restart", self.container_name]
        code, stdout, stderr = self._run_command(cmd, timeout=120)

        if code == 0:
            self.logger.info("Container restarted successfully")
            return True
        else:
            self.logger.error(f"Failed to restart container: {stderr}")
            return False

    def health_check(self) -> HealthStatus:
        """Perform a full health check."""
        timestamp = datetime.now().isoformat()
        action_taken = None

        # Check container first
        container_status = self.check_container_status()

        if not container_status.running:
            gpu_status = GPUStatus(
                timestamp=timestamp,
                container_name=self.container_name,
                gpu_accessible=False,
                error="Container not running"
            )
            return HealthStatus(
                timestamp=timestamp,
                overall_healthy=False,
                gpu_status=gpu_status,
                container_status=container_status
            )

        # Check GPU access
        gpu_status = self.check_gpu_access()

        # Handle lost GPU access
        if not gpu_status.gpu_accessible:
            self.logger.warning(f"GPU access lost: {gpu_status.error}")

            # Send Discord alert
            if self.discord:
                self.discord.send_gpu_lost_alert(gpu_status, self.auto_restart)

            # Auto-restart if enabled
            if self.auto_restart:
                if self.restart_container():
                    action_taken = "Container restarted"

                    # Re-check GPU after restart
                    time.sleep(5)  # Wait for the container to initialize
                    gpu_status = self.check_gpu_access()
                    container_status = self.check_container_status()

                    if gpu_status.gpu_accessible and self.discord:
                        self.discord.send_gpu_restored_alert(gpu_status)
                else:
                    action_taken = "Restart failed"

        overall_healthy = (
            container_status.running and
            gpu_status.gpu_accessible
        )

        return HealthStatus(
            timestamp=timestamp,
            overall_healthy=overall_healthy,
            gpu_status=gpu_status,
            container_status=container_status,
            action_taken=action_taken
        )


def main():
    parser = argparse.ArgumentParser(description='Monitor Jellyfin GPU health')
    parser.add_argument('--container', default='jellyfin', help='Container name')
    parser.add_argument('--check', action='store_true', help='Perform health check')
    parser.add_argument('--discord-webhook',
                        default=os.environ.get('DISCORD_WEBHOOK_URL'),
                        help='Discord webhook URL (defaults to $DISCORD_WEBHOOK_URL; '
                             'avoid hardcoding the webhook secret in source)')
    parser.add_argument('--discord-alerts', action='store_true', help='Enable Discord alerts')
    parser.add_argument('--discord-test', action='store_true', help='Test Discord integration')
    parser.add_argument('--auto-restart', action='store_true', help='Auto-restart on GPU loss')
    parser.add_argument('--ssh-host', default=None, help='SSH host for remote monitoring')
    parser.add_argument('--output', choices=['json', 'pretty'], default='pretty')
    parser.add_argument('--verbose', action='store_true', help='Verbose logging')

    args = parser.parse_args()

    # Configure logging
    level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(
        level=level,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    )

    # Discord test
    if args.discord_test:
        if not args.discord_webhook:
            print("No Discord webhook configured (set $DISCORD_WEBHOOK_URL or --discord-webhook)")
            sys.exit(1)
        notifier = DiscordNotifier(args.discord_webhook)
        success = notifier.send_alert(
            title="Jellyfin GPU Monitor Test",
            description="Discord integration is working correctly.",
            color=0x00ff00,
            fields=[
                {"name": "Container", "value": args.container, "inline": True},
                {"name": "Status", "value": "Test successful", "inline": True}
            ]
        )
        sys.exit(0 if success else 1)

    # Health check
    if args.check:
        monitor = JellyfinGPUMonitor(
            container_name=args.container,
            discord_webhook=args.discord_webhook,
            enable_discord=args.discord_alerts,
            auto_restart=args.auto_restart,
            ssh_host=args.ssh_host
        )

        result = monitor.health_check()

        if args.output == 'json':
            print(json.dumps(asdict(result), indent=2))
        else:
            print(f"=== Jellyfin GPU Health Check - {result.timestamp} ===")
            status_icon = "✅" if result.overall_healthy else "❌"
            print(f"Overall: {status_icon} {'Healthy' if result.overall_healthy else 'UNHEALTHY'}")
            print(f"\nContainer: {result.container_status.status}")
            print(f"GPU Access: {'Yes' if result.gpu_status.gpu_accessible else 'NO'}")

            if result.gpu_status.gpu_accessible:
                print(f"GPU: {result.gpu_status.gpu_name}")
                print(f"Temperature: {result.gpu_status.gpu_temp}°C")
                print(f"Driver: {result.gpu_status.driver_version}")
            else:
                print(f"Error: {result.gpu_status.error}")

            if result.action_taken:
                print(f"\nAction: {result.action_taken}")

        sys.exit(0 if result.overall_healthy else 1)

    parser.print_help()


if __name__ == '__main__':
    main()
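For reference, --output json prints the HealthStatus dataclass serialized with asdict(). A healthy check would look roughly like the following (temperature, driver version, and timestamps are illustrative values, not captured output):

{
  "timestamp": "2025-01-01T12:00:00",
  "overall_healthy": true,
  "gpu_status": {
    "timestamp": "2025-01-01T12:00:00",
    "container_name": "jellyfin",
    "gpu_accessible": true,
    "gpu_name": "NVIDIA GeForce GTX 1070",
    "gpu_temp": 45,
    "driver_version": "535.183.01",
    "cuda_version": null,
    "error": null
  },
  "container_status": {
    "running": true,
    "status": "running",
    "uptime": "2025-01-01T09:00:00.000000000Z",
    "error": null
  },
  "action_taken": null
}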