#!/usr/bin/env python3
"""
Jellyfin GPU Health Monitor with Discord Alerts

Monitors Jellyfin container's GPU access and sends Discord notifications
when GPU access is lost. Optionally auto-restarts the container.

The GTX 1070 in ubuntu-manticore is shared between Jellyfin and Tdarr.
GPU access can be lost after driver updates, Docker restarts, or other
runtime issues - this monitor detects that condition before users report
playback failures.

Usage:
    # Basic health check
    python3 jellyfin_gpu_monitor.py --check

    # Check with Discord alerts
    python3 jellyfin_gpu_monitor.py --check --discord-alerts

    # Check and auto-restart if GPU lost
    python3 jellyfin_gpu_monitor.py --check --auto-restart

    # Full monitoring with alerts and auto-restart
    python3 jellyfin_gpu_monitor.py --check --discord-alerts --auto-restart

    # Test Discord integration
    python3 jellyfin_gpu_monitor.py --discord-test

The Discord webhook can be supplied with --discord-webhook or through the
JELLYFIN_GPU_DISCORD_WEBHOOK environment variable.
"""

import argparse
import json
import logging
import os
import shlex
import subprocess
import sys
import time
from dataclasses import dataclass, asdict
from datetime import datetime, timezone
from typing import List, Optional, Tuple

try:
    import requests
except ImportError:  # Discord alerts are optional; the GPU check itself is stdlib-only.
    requests = None


# Environment variable that overrides the built-in webhook default.
DISCORD_WEBHOOK_ENV = "JELLYFIN_GPU_DISCORD_WEBHOOK"

# SECURITY: this webhook URL is a credential committed to source control.
# It is kept only so existing invocations that rely on the default keep
# working. TODO: rotate the webhook, provide it via the environment variable
# above (or --discord-webhook), and delete this literal.
_FALLBACK_DISCORD_WEBHOOK = (
    'https://discord.com/api/webhooks/1404105821549498398/'
    'y2Ud1RK9rzFjv58xbypUfQNe3jrL7ZUq1FkQHa4_dfOHm2ylp93z0f4tY0O8Z-vQgKhD'
)


@dataclass
class GPUStatus:
    """Result of probing the container's GPU with nvidia-smi."""

    timestamp: str
    container_name: str
    gpu_accessible: bool
    gpu_name: Optional[str] = None
    gpu_temp: Optional[int] = None
    driver_version: Optional[str] = None
    # Not populated by the checks in this file; kept for output compatibility.
    cuda_version: Optional[str] = None
    error: Optional[str] = None


@dataclass
class ContainerStatus:
    """Result of `docker inspect` on the monitored container."""

    running: bool
    status: str
    # Holds the raw `.State.StartedAt` value reported by Docker, not a duration.
    uptime: Optional[str] = None
    error: Optional[str] = None


@dataclass
class HealthStatus:
    """Combined container + GPU health, plus any remediation performed."""

    timestamp: str
    overall_healthy: bool
    gpu_status: GPUStatus
    container_status: ContainerStatus
    action_taken: Optional[str] = None


class DiscordNotifier:
    """Sends embed messages to a Discord webhook."""

    # Discord rejects embeds whose text exceeds these lengths, which would
    # drop the alert exactly when a long error message makes it most useful.
    MAX_TITLE = 256
    MAX_DESCRIPTION = 4096
    MAX_FIELD_NAME = 256
    MAX_FIELD_VALUE = 1024

    def __init__(self, webhook_url: str, timeout: int = 10):
        self.webhook_url = webhook_url
        self.timeout = timeout
        self.logger = logging.getLogger(f"{__name__}.DiscordNotifier")

    @staticmethod
    def _clip(text, limit: int) -> str:
        """Return *text* as a string no longer than *limit* characters."""
        text = str(text)
        if len(text) <= limit:
            return text
        return text[:limit - 3] + "..."

    def _build_payload(self, title: str, description: str, color: int,
                       fields: Optional[List[dict]]) -> dict:
        """Build the webhook JSON body, clipping text to Discord's limits."""
        clipped_fields = []
        for field in fields or []:
            clean = dict(field)  # copy so the caller's dict is not mutated
            if "name" in clean:
                clean["name"] = self._clip(clean["name"], self.MAX_FIELD_NAME)
            if "value" in clean:
                clean["value"] = self._clip(clean["value"], self.MAX_FIELD_VALUE)
            clipped_fields.append(clean)

        embed = {
            "title": self._clip(title, self.MAX_TITLE),
            "description": self._clip(description, self.MAX_DESCRIPTION),
            "color": color,
            # Timezone-aware UTC: a naive local timestamp carries no offset,
            # so the time shown in Discord would be wrong on non-UTC hosts.
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "fields": clipped_fields,
        }
        return {
            "username": "Jellyfin GPU Monitor",
            "embeds": [embed],
        }

    def send_alert(self, title: str, description: str,
                   color: int = 0xff6b6b,
                   fields: Optional[List[dict]] = None) -> bool:
        """Send embed alert to Discord.

        Returns True when the webhook accepted the message, False on any
        failure. Failures are logged and never raised, so a broken webhook
        cannot take down the health check.
        """
        if requests is None:
            self.logger.error(
                "Failed to send Discord notification: "
                "the 'requests' package is not installed"
            )
            return False

        payload = self._build_payload(title, description, color, fields)
        try:
            response = requests.post(
                self.webhook_url,
                json=payload,
                timeout=self.timeout,
            )
            response.raise_for_status()
        except Exception as e:  # deliberate best-effort boundary
            self.logger.error("Failed to send Discord notification: %s", e)
            return False

        self.logger.info("Discord notification sent successfully")
        return True

    def send_gpu_lost_alert(self, gpu_status: GPUStatus, auto_restart: bool) -> bool:
        """Send alert when GPU access is lost."""
        action = (
            "Container will be automatically restarted"
            if auto_restart
            else "Manual intervention required"
        )
        fields = [
            {"name": "Container", "value": gpu_status.container_name, "inline": True},
            {"name": "Error", "value": gpu_status.error or "Unknown", "inline": True},
            {"name": "Action", "value": action, "inline": False},
        ]
        return self.send_alert(
            title="Jellyfin GPU Access Lost",
            description="The Jellyfin container has lost access to the NVIDIA GPU. Transcoding will fail until resolved.",
            color=0xff6b6b,  # Red
            fields=fields,
        )

    def send_gpu_restored_alert(self, gpu_status: GPUStatus) -> bool:
        """Send alert when GPU access is restored."""
        fields = [
            {"name": "Container", "value": gpu_status.container_name, "inline": True},
            {"name": "GPU", "value": gpu_status.gpu_name or "Unknown", "inline": True},
            {"name": "Driver", "value": gpu_status.driver_version or "Unknown", "inline": True},
        ]
        return self.send_alert(
            title="Jellyfin GPU Access Restored",
            description="GPU access has been restored. Hardware transcoding should now work.",
            color=0x28a745,  # Green
            fields=fields,
        )


class JellyfinGPUMonitor:
    """Checks a Docker container's GPU access, locally or on a host via SSH."""

    def __init__(self, container_name: str = "jellyfin",
                 discord_webhook: Optional[str] = None,
                 enable_discord: bool = False,
                 auto_restart: bool = False,
                 ssh_host: Optional[str] = None,
                 restart_wait_seconds: float = 5.0):
        """
        Args:
            container_name: Docker container to inspect.
            discord_webhook: Webhook URL used when alerts are enabled.
            enable_discord: Send Discord alerts on GPU loss / recovery.
            auto_restart: Restart the container when GPU access is lost.
            ssh_host: If set, run every docker command on this host via ssh.
            restart_wait_seconds: Pause after a restart before re-checking.
        """
        self.container_name = container_name
        self.auto_restart = auto_restart
        self.ssh_host = ssh_host
        self.restart_wait_seconds = restart_wait_seconds
        self.logger = logging.getLogger(__name__)

        self.discord = None
        if enable_discord and discord_webhook:
            self.discord = DiscordNotifier(discord_webhook)
        elif enable_discord:
            self.logger.warning(
                "Discord alerts requested but no webhook URL was provided; alerts disabled"
            )

    def _run_command(self, cmd: list, timeout: int = 30) -> Tuple[int, str, str]:
        """Run command locally or via SSH.

        Returns:
            (returncode, stdout, stderr) with surrounding whitespace stripped.
            A returncode of -1 means the command could not be run or timed out.
        """
        if self.ssh_host:
            # The remote side re-parses the command with a shell, so every
            # argument must be quoted (the docker --format template contains
            # '|' and braces). ssh itself is run WITHOUT a local shell:
            # passing a list together with shell=True would execute only
            # the bare word "ssh".
            remote_command = " ".join(shlex.quote(part) for part in cmd)
            cmd = ["ssh", self.ssh_host, remote_command]

        try:
            result = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                timeout=timeout,
            )
        except subprocess.TimeoutExpired:
            return -1, "", "Command timed out"
        except (OSError, ValueError, subprocess.SubprocessError) as e:
            # e.g. docker/ssh binary missing or not executable
            return -1, "", str(e)

        return result.returncode, result.stdout.strip(), result.stderr.strip()

    def check_container_status(self) -> ContainerStatus:
        """Check if Jellyfin container is running."""
        cmd = [
            "docker", "inspect", "--format",
            "{{.State.Running}}|{{.State.Status}}|{{.State.StartedAt}}",
            self.container_name,
        ]
        code, stdout, stderr = self._run_command(cmd)

        if code != 0:
            return ContainerStatus(
                running=False,
                status="not_found",
                error=stderr or "Container not found",
            )

        parts = stdout.split("|")
        return ContainerStatus(
            running=parts[0].lower() == "true",
            status=parts[1] if len(parts) > 1 else "unknown",
            uptime=parts[2] if len(parts) > 2 else None,
        )

    def check_gpu_access(self) -> GPUStatus:
        """Check if container has GPU access via nvidia-smi.

        Only the first GPU reported by nvidia-smi is described in the result.
        """
        timestamp = datetime.now().isoformat()

        # Run nvidia-smi inside the container
        cmd = [
            "docker", "exec", self.container_name, "nvidia-smi",
            "--query-gpu=name,temperature.gpu,driver_version",
            "--format=csv,noheader,nounits",
        ]
        code, stdout, stderr = self._run_command(cmd)

        if code != 0:
            # Try basic nvidia-smi to get more error info
            cmd_basic = ["docker", "exec", self.container_name, "nvidia-smi"]
            _, _, stderr_basic = self._run_command(cmd_basic)
            return GPUStatus(
                timestamp=timestamp,
                container_name=self.container_name,
                gpu_accessible=False,
                error=stderr_basic or stderr or "nvidia-smi failed",
            )

        # nvidia-smi prints one CSV row per GPU; splitting the whole output on
        # commas would merge rows on multi-GPU hosts, so parse the first row.
        rows = [row for row in stdout.splitlines() if row.strip()]
        if not rows:
            return GPUStatus(
                timestamp=timestamp,
                container_name=self.container_name,
                gpu_accessible=False,
                error="nvidia-smi returned no GPU information",
            )

        parts = [p.strip() for p in rows[0].split(",")]
        gpu_name = parts[0] or None
        # Temperature may be reported as a non-numeric placeholder.
        gpu_temp = int(parts[1]) if len(parts) > 1 and parts[1].isdigit() else None
        driver_version = parts[2] if len(parts) > 2 else None

        return GPUStatus(
            timestamp=timestamp,
            container_name=self.container_name,
            gpu_accessible=True,
            gpu_name=gpu_name,
            gpu_temp=gpu_temp,
            driver_version=driver_version,
        )

    def restart_container(self) -> bool:
        """Restart the Jellyfin container. Returns True on success."""
        self.logger.info("Restarting container: %s", self.container_name)

        cmd = ["docker", "restart", self.container_name]
        code, _, stderr = self._run_command(cmd, timeout=120)

        if code == 0:
            self.logger.info("Container restarted successfully")
            return True

        self.logger.error("Failed to restart container: %s", stderr)
        return False

    def health_check(self) -> HealthStatus:
        """Perform full health check.

        If the container is not running, no GPU probe, alert, or restart is
        attempted. If the GPU is inaccessible, an alert is sent (when Discord
        is enabled) and the container is restarted (when auto-restart is
        enabled), after which both checks are repeated.
        """
        timestamp = datetime.now().isoformat()
        action_taken = None

        # Check container first
        container_status = self.check_container_status()
        if not container_status.running:
            self.logger.warning(
                "Container %s is not running (status: %s)",
                self.container_name, container_status.status,
            )
            gpu_status = GPUStatus(
                timestamp=timestamp,
                container_name=self.container_name,
                gpu_accessible=False,
                error="Container not running",
            )
            return HealthStatus(
                timestamp=timestamp,
                overall_healthy=False,
                gpu_status=gpu_status,
                container_status=container_status,
            )

        # Check GPU access
        gpu_status = self.check_gpu_access()

        # Handle GPU access lost
        if not gpu_status.gpu_accessible:
            self.logger.warning("GPU access lost: %s", gpu_status.error)

            # Send Discord alert
            if self.discord:
                self.discord.send_gpu_lost_alert(gpu_status, self.auto_restart)

            # Auto-restart if enabled
            if self.auto_restart:
                if self.restart_container():
                    action_taken = "Container restarted"

                    # Re-check GPU after giving the container time to initialize
                    time.sleep(self.restart_wait_seconds)
                    gpu_status = self.check_gpu_access()
                    container_status = self.check_container_status()

                    if gpu_status.gpu_accessible and self.discord:
                        self.discord.send_gpu_restored_alert(gpu_status)
                else:
                    action_taken = "Restart failed"

        overall_healthy = container_status.running and gpu_status.gpu_accessible

        return HealthStatus(
            timestamp=timestamp,
            overall_healthy=overall_healthy,
            gpu_status=gpu_status,
            container_status=container_status,
            action_taken=action_taken,
        )


def _print_report(result: HealthStatus) -> None:
    """Print the human-readable ('pretty') health report to stdout."""
    print(f"=== Jellyfin GPU Health Check - {result.timestamp} ===")
    # NOTE(review): both branches are empty strings; the status icons appear
    # to have been lost at some point. Output is kept as-is.
    status_icon = "" if result.overall_healthy else ""
    print(f"Overall: {status_icon} {'Healthy' if result.overall_healthy else 'UNHEALTHY'}")
    print(f"\nContainer: {result.container_status.status}")
    print(f"GPU Access: {'Yes' if result.gpu_status.gpu_accessible else 'NO'}")

    if result.gpu_status.gpu_accessible:
        print(f"GPU: {result.gpu_status.gpu_name}")
        print(f"Temperature: {result.gpu_status.gpu_temp}C")
        print(f"Driver: {result.gpu_status.driver_version}")
    else:
        print(f"Error: {result.gpu_status.error}")

    if result.action_taken:
        print(f"\nAction: {result.action_taken}")


def main(argv: Optional[List[str]] = None):
    """Command-line entry point.

    Args:
        argv: Argument list to parse; defaults to sys.argv[1:].

    Exits with status 0 when the requested action succeeded / the container
    is healthy, 1 otherwise. With no action flag, prints the help text.
    """
    parser = argparse.ArgumentParser(description='Monitor Jellyfin GPU health')
    parser.add_argument('--container', default='jellyfin', help='Container name')
    parser.add_argument('--check', action='store_true', help='Perform health check')
    parser.add_argument('--discord-webhook',
                        default=os.environ.get(DISCORD_WEBHOOK_ENV) or _FALLBACK_DISCORD_WEBHOOK,
                        help='Discord webhook URL')
    parser.add_argument('--discord-alerts', action='store_true', help='Enable Discord alerts')
    parser.add_argument('--discord-test', action='store_true', help='Test Discord integration')
    parser.add_argument('--auto-restart', action='store_true', help='Auto-restart on GPU loss')
    parser.add_argument('--ssh-host', default=None, help='SSH host for remote monitoring')
    parser.add_argument('--output', choices=['json', 'pretty'], default='pretty')
    parser.add_argument('--verbose', action='store_true', help='Verbose logging')

    args = parser.parse_args(argv)

    # Configure logging
    level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(
        level=level,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    )

    # Discord test
    if args.discord_test:
        notifier = DiscordNotifier(args.discord_webhook)
        success = notifier.send_alert(
            title="Jellyfin GPU Monitor Test",
            description="Discord integration is working correctly.",
            color=0x00ff00,
            fields=[
                {"name": "Container", "value": args.container, "inline": True},
                {"name": "Status", "value": "Test successful", "inline": True},
            ],
        )
        sys.exit(0 if success else 1)

    # Health check
    if args.check:
        monitor = JellyfinGPUMonitor(
            container_name=args.container,
            discord_webhook=args.discord_webhook,
            enable_discord=args.discord_alerts,
            auto_restart=args.auto_restart,
            ssh_host=args.ssh_host,
        )

        result = monitor.health_check()

        if args.output == 'json':
            print(json.dumps(asdict(result), indent=2))
        else:
            _print_report(result)

        sys.exit(0 if result.overall_healthy else 1)

    parser.print_help()


if __name__ == '__main__':
    main()