diff --git a/media-servers/CONTEXT.md b/media-servers/CONTEXT.md index e6de43a..de0c492 100644 --- a/media-servers/CONTEXT.md +++ b/media-servers/CONTEXT.md @@ -109,6 +109,27 @@ volumes: - NVDEC: H.264, HEVC, VP8, VP9, AV1 - Sessions: 3+ concurrent +## GPU Health Monitoring + +### Jellyfin GPU Monitor +**Location**: `ubuntu-manticore:~/scripts/jellyfin_gpu_monitor.py` +**Schedule**: Every 5 minutes via cron +**Logs**: `~/logs/jellyfin-gpu-monitor.log` + +The monitor detects when the Jellyfin container loses GPU access (common after +driver updates or Docker restarts) and automatically: +1. Sends Discord alert +2. Restarts the container to restore GPU access +3. Confirms GPU is restored + +**Manual check:** +```bash +ssh ubuntu-manticore "python3 ~/scripts/jellyfin_gpu_monitor.py --check" +``` + +**FFmpeg exit code 187**: Indicates NVENC failure due to lost GPU access. +The monitor catches this condition before users report playback failures. + ## Troubleshooting ### Common Issues @@ -116,6 +137,7 @@ volumes: 2. **Transcoding failures**: Verify codec support for your GPU generation 3. **Slow playback start**: Check network mount performance 4. **Cache filling up**: Monitor trickplay/thumbnail generation +5. **FFmpeg exit code 187**: GPU access lost - monitor should auto-restart ### Diagnostic Commands ```bash diff --git a/monitoring/scripts/jellyfin_gpu_monitor.py b/monitoring/scripts/jellyfin_gpu_monitor.py new file mode 100644 index 0000000..822eb45 --- /dev/null +++ b/monitoring/scripts/jellyfin_gpu_monitor.py @@ -0,0 +1,395 @@ +#!/usr/bin/env python3 +""" +Jellyfin GPU Health Monitor with Discord Alerts + +Monitors Jellyfin container's GPU access and sends Discord notifications +when GPU access is lost. Optionally auto-restarts the container. + +The GTX 1070 in ubuntu-manticore is shared between Jellyfin and Tdarr. +GPU access can be lost after driver updates, Docker restarts, or other +runtime issues - this monitor detects that condition before users report +playback failures. 
+ +Usage: + # Basic health check + python3 jellyfin_gpu_monitor.py --check + + # Check with Discord alerts + python3 jellyfin_gpu_monitor.py --check --discord-alerts + + # Check and auto-restart if GPU lost + python3 jellyfin_gpu_monitor.py --check --auto-restart + + # Full monitoring with alerts and auto-restart + python3 jellyfin_gpu_monitor.py --check --discord-alerts --auto-restart + + # Test Discord integration + python3 jellyfin_gpu_monitor.py --discord-test +""" + +import argparse +import json +import logging +import subprocess +import sys +from dataclasses import dataclass, asdict +from datetime import datetime +from typing import Optional +import requests + + +@dataclass +class GPUStatus: + timestamp: str + container_name: str + gpu_accessible: bool + gpu_name: Optional[str] = None + gpu_temp: Optional[int] = None + driver_version: Optional[str] = None + cuda_version: Optional[str] = None + error: Optional[str] = None + + +@dataclass +class ContainerStatus: + running: bool + status: str + uptime: Optional[str] = None + error: Optional[str] = None + + +@dataclass +class HealthStatus: + timestamp: str + overall_healthy: bool + gpu_status: GPUStatus + container_status: ContainerStatus + action_taken: Optional[str] = None + + +class DiscordNotifier: + def __init__(self, webhook_url: str, timeout: int = 10): + self.webhook_url = webhook_url + self.timeout = timeout + self.logger = logging.getLogger(f"{__name__}.DiscordNotifier") + + def send_alert(self, title: str, description: str, color: int = 0xff6b6b, + fields: list = None) -> bool: + """Send embed alert to Discord.""" + embed = { + "title": title, + "description": description, + "color": color, + "timestamp": datetime.now().isoformat(), + "fields": fields or [] + } + + payload = { + "username": "Jellyfin GPU Monitor", + "embeds": [embed] + } + + try: + response = requests.post( + self.webhook_url, + json=payload, + timeout=self.timeout + ) + response.raise_for_status() + self.logger.info("Discord notification sent successfully") + return True + except Exception as e: + self.logger.error(f"Failed to send Discord notification: {e}") + return False + + def send_gpu_lost_alert(self, gpu_status: GPUStatus, auto_restart: bool) -> bool: + """Send alert when GPU access is lost.""" + action = "Container will be automatically restarted" if auto_restart else "Manual intervention required" + + fields = [ + {"name": "Container", "value": gpu_status.container_name, "inline": True}, + {"name": "Error", "value": gpu_status.error or "Unknown", "inline": True}, + {"name": "Action", "value": action, "inline": False} + ] + + return self.send_alert( + title="Jellyfin GPU Access Lost", + description="The Jellyfin container has lost access to the NVIDIA GPU. Transcoding will fail until resolved.", + color=0xff6b6b, # Red + fields=fields + ) + + def send_gpu_restored_alert(self, gpu_status: GPUStatus) -> bool: + """Send alert when GPU access is restored.""" + fields = [ + {"name": "Container", "value": gpu_status.container_name, "inline": True}, + {"name": "GPU", "value": gpu_status.gpu_name or "Unknown", "inline": True}, + {"name": "Driver", "value": gpu_status.driver_version or "Unknown", "inline": True} + ] + + return self.send_alert( + title="Jellyfin GPU Access Restored", + description="GPU access has been restored. 
Hardware transcoding should now work.",
+            color=0x28a745,  # Green
+            fields=fields
+        )
+
+
+class JellyfinGPUMonitor:
+    def __init__(self, container_name: str = "jellyfin",
+                 discord_webhook: str = None,
+                 enable_discord: bool = False,
+                 auto_restart: bool = False,
+                 ssh_host: str = None):
+        self.container_name = container_name
+        self.auto_restart = auto_restart
+        self.ssh_host = ssh_host
+        self.logger = logging.getLogger(__name__)
+
+        self.discord = None
+        if enable_discord and discord_webhook:
+            self.discord = DiscordNotifier(discord_webhook)
+
+    def _run_command(self, cmd: list, timeout: int = 30) -> tuple:
+        """Run command locally or via SSH."""
+        if self.ssh_host:
+            # ssh takes the remote command as a single string argument
+            cmd = ["ssh", self.ssh_host, " ".join(cmd)]
+
+        try:
+            result = subprocess.run(
+                cmd,
+                capture_output=True,
+                text=True,
+                timeout=timeout
+            )
+            return result.returncode, result.stdout.strip(), result.stderr.strip()
+        except subprocess.TimeoutExpired:
+            return -1, "", "Command timed out"
+        except Exception as e:
+            return -1, "", str(e)
+
+    def check_container_status(self) -> ContainerStatus:
+        """Check if Jellyfin container is running."""
+        cmd = ["docker", "inspect", "--format",
+               "{{.State.Running}}|{{.State.Status}}|{{.State.StartedAt}}",
+               self.container_name]
+
+        code, stdout, stderr = self._run_command(cmd)
+
+        if code != 0:
+            return ContainerStatus(
+                running=False,
+                status="not_found",
+                error=stderr or "Container not found"
+            )
+
+        parts = stdout.split("|")
+        running = parts[0].lower() == "true"
+        status = parts[1] if len(parts) > 1 else "unknown"
+        started_at = parts[2] if len(parts) > 2 else None
+
+        return ContainerStatus(
+            running=running,
+            status=status,
+            uptime=started_at
+        )
+
+    def check_gpu_access(self) -> GPUStatus:
+        """Check if container has GPU access via nvidia-smi."""
+        timestamp = datetime.now().isoformat()
+
+        # Run nvidia-smi inside the container
+        cmd = ["docker", "exec", self.container_name, "nvidia-smi",
+               "--query-gpu=name,temperature.gpu,driver_version",
+               "--format=csv,noheader,nounits"]
+
+        code, stdout, stderr = self._run_command(cmd)
+
+        if code != 0:
+            # Try basic nvidia-smi to get more error info
+            cmd_basic = ["docker", "exec", self.container_name, "nvidia-smi"]
+            _, _, stderr_basic = self._run_command(cmd_basic)
+
+            return GPUStatus(
+                timestamp=timestamp,
+                container_name=self.container_name,
+                gpu_accessible=False,
+                error=stderr_basic or stderr or "nvidia-smi failed"
+            )
+
+        # Parse nvidia-smi output
+        try:
+            parts = [p.strip() for p in stdout.split(",")]
+            gpu_name = parts[0] if len(parts) > 0 else None
+            gpu_temp = int(parts[1]) if len(parts) > 1 and parts[1].isdigit() else None
+            driver_version = parts[2] if len(parts) > 2 else None
+
+            return GPUStatus(
+                timestamp=timestamp,
+                container_name=self.container_name,
+                gpu_accessible=True,
+                gpu_name=gpu_name,
+                gpu_temp=gpu_temp,
+                driver_version=driver_version
+            )
+        except Exception as e:
+            return GPUStatus(
+                timestamp=timestamp,
+                container_name=self.container_name,
+                gpu_accessible=True,  # nvidia-smi worked, just parsing failed
+                error=f"Parse error: {e}"
+            )
+
+    def restart_container(self) -> bool:
+        """Restart the Jellyfin container."""
+        self.logger.info(f"Restarting container: {self.container_name}")
+
+        cmd = ["docker", "restart", self.container_name]
+        code, stdout, stderr = self._run_command(cmd, timeout=120)
+
+        if code == 0:
+            self.logger.info("Container restarted successfully")
+            return True
+        else:
+            self.logger.error(f"Failed to restart 
container: {stderr}") + return False + + def health_check(self) -> HealthStatus: + """Perform full health check.""" + timestamp = datetime.now().isoformat() + action_taken = None + + # Check container first + container_status = self.check_container_status() + + if not container_status.running: + gpu_status = GPUStatus( + timestamp=timestamp, + container_name=self.container_name, + gpu_accessible=False, + error="Container not running" + ) + return HealthStatus( + timestamp=timestamp, + overall_healthy=False, + gpu_status=gpu_status, + container_status=container_status + ) + + # Check GPU access + gpu_status = self.check_gpu_access() + + # Handle GPU access lost + if not gpu_status.gpu_accessible: + self.logger.warning(f"GPU access lost: {gpu_status.error}") + + # Send Discord alert + if self.discord: + self.discord.send_gpu_lost_alert(gpu_status, self.auto_restart) + + # Auto-restart if enabled + if self.auto_restart: + if self.restart_container(): + action_taken = "Container restarted" + + # Re-check GPU after restart + import time + time.sleep(5) # Wait for container to initialize + gpu_status = self.check_gpu_access() + container_status = self.check_container_status() + + if gpu_status.gpu_accessible and self.discord: + self.discord.send_gpu_restored_alert(gpu_status) + else: + action_taken = "Restart failed" + + overall_healthy = ( + container_status.running and + gpu_status.gpu_accessible + ) + + return HealthStatus( + timestamp=timestamp, + overall_healthy=overall_healthy, + gpu_status=gpu_status, + container_status=container_status, + action_taken=action_taken + ) + + +def main(): + parser = argparse.ArgumentParser(description='Monitor Jellyfin GPU health') + parser.add_argument('--container', default='jellyfin', help='Container name') + parser.add_argument('--check', action='store_true', help='Perform health check') + parser.add_argument('--discord-webhook', + default='https://discord.com/api/webhooks/1404105821549498398/y2Ud1RK9rzFjv58xbypUfQNe3jrL7ZUq1FkQHa4_dfOHm2ylp93z0f4tY0O8Z-vQgKhD', + help='Discord webhook URL') + parser.add_argument('--discord-alerts', action='store_true', help='Enable Discord alerts') + parser.add_argument('--discord-test', action='store_true', help='Test Discord integration') + parser.add_argument('--auto-restart', action='store_true', help='Auto-restart on GPU loss') + parser.add_argument('--ssh-host', default=None, help='SSH host for remote monitoring') + parser.add_argument('--output', choices=['json', 'pretty'], default='pretty') + parser.add_argument('--verbose', action='store_true', help='Verbose logging') + + args = parser.parse_args() + + # Configure logging + level = logging.DEBUG if args.verbose else logging.INFO + logging.basicConfig( + level=level, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' + ) + + # Discord test + if args.discord_test: + notifier = DiscordNotifier(args.discord_webhook) + success = notifier.send_alert( + title="Jellyfin GPU Monitor Test", + description="Discord integration is working correctly.", + color=0x00ff00, + fields=[ + {"name": "Container", "value": args.container, "inline": True}, + {"name": "Status", "value": "Test successful", "inline": True} + ] + ) + sys.exit(0 if success else 1) + + # Health check + if args.check: + monitor = JellyfinGPUMonitor( + container_name=args.container, + discord_webhook=args.discord_webhook, + enable_discord=args.discord_alerts, + auto_restart=args.auto_restart, + ssh_host=args.ssh_host + ) + + result = monitor.health_check() + + if args.output == 'json': + 
print(json.dumps(asdict(result), indent=2))
+        else:
+            print(f"=== Jellyfin GPU Health Check - {result.timestamp} ===")
+            status_icon = "✅" if result.overall_healthy else "❌"
+            print(f"Overall: {status_icon} {'Healthy' if result.overall_healthy else 'UNHEALTHY'}")
+            print(f"\nContainer: {result.container_status.status}")
+            print(f"GPU Access: {'Yes' if result.gpu_status.gpu_accessible else 'NO'}")
+
+            if result.gpu_status.gpu_accessible:
+                print(f"GPU: {result.gpu_status.gpu_name}")
+                print(f"Temperature: {result.gpu_status.gpu_temp}°C")
+                print(f"Driver: {result.gpu_status.driver_version}")
+            else:
+                print(f"Error: {result.gpu_status.error}")
+
+            if result.action_taken:
+                print(f"\nAction: {result.action_taken}")
+
+        sys.exit(0 if result.overall_healthy else 1)
+
+    parser.print_help()
+
+
+if __name__ == '__main__':
+    main()
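For reference, a minimal usage sketch of driving the monitor from Python rather than the CLI, mirroring what `--check` does without Discord alerts or auto-restart. It assumes `jellyfin_gpu_monitor.py` is importable as a module from its directory; all class, argument, and field names are the ones defined in the script above.

```python
# Minimal usage sketch: calling the monitor from Python rather than the CLI.
# Assumes jellyfin_gpu_monitor.py is importable from the current directory.
import logging

from jellyfin_gpu_monitor import JellyfinGPUMonitor

logging.basicConfig(level=logging.INFO)

# Observe-only check: no Discord alerts, no automatic container restart.
monitor = JellyfinGPUMonitor(
    container_name="jellyfin",
    auto_restart=False,
    ssh_host=None,  # or e.g. "ubuntu-manticore" to run the docker commands over SSH
)

result = monitor.health_check()
if result.overall_healthy:
    print(f"GPU OK: {result.gpu_status.gpu_name} at {result.gpu_status.gpu_temp}°C")
else:
    print(f"UNHEALTHY: {result.gpu_status.error or result.container_status.error}")
```

For the scheduled check documented in CONTEXT.md, the cron job would instead invoke the CLI form (`--check --discord-alerts --auto-restart`).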