CLAUDE: Add Jellyfin GPU health monitor with auto-restart
- Created jellyfin_gpu_monitor.py for detecting lost GPU access
- Sends Discord alerts when GPU access fails
- Auto-restarts container to restore GPU binding
- Runs every 5 minutes via cron on ubuntu-manticore
- Documents FFmpeg exit code 187 (NVENC failure) in troubleshooting

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
parent a900f9c744
commit 3112b3d6fe
@@ -109,6 +109,27 @@ volumes:
- NVDEC: H.264, HEVC, VP8, VP9, AV1
- Sessions: 3+ concurrent

## GPU Health Monitoring

### Jellyfin GPU Monitor
**Location**: `ubuntu-manticore:~/scripts/jellyfin_gpu_monitor.py`
**Schedule**: Every 5 minutes via cron
**Logs**: `~/logs/jellyfin-gpu-monitor.log`

The monitor detects when the Jellyfin container loses GPU access (common after
driver updates or Docker restarts) and automatically:
1. Sends a Discord alert
2. Restarts the container to restore GPU access (the equivalent manual steps are sketched below)
3. Confirms GPU access is restored
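For reference, the recovery sequence the monitor automates boils down to a few docker commands (a sketch assuming the script's default container name `jellyfin` and a shell on ubuntu-manticore):

```bash
# Probe GPU visibility inside the container; a non-zero exit means access was lost
docker exec jellyfin nvidia-smi

# Restart the container to re-establish the GPU binding, then confirm it is back
docker restart jellyfin
docker exec jellyfin nvidia-smi --query-gpu=name,driver_version --format=csv,noheader
```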
**Manual check:**
```bash
ssh ubuntu-manticore "python3 ~/scripts/jellyfin_gpu_monitor.py --check"
```
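The crontab entry itself is not included in this commit; a plausible line matching the documented five-minute schedule, the script's flags, and the log path above might look like this (flags and paths are assumptions):

```bash
# Hypothetical crontab entry on ubuntu-manticore
*/5 * * * * python3 $HOME/scripts/jellyfin_gpu_monitor.py --check --discord-alerts --auto-restart >> $HOME/logs/jellyfin-gpu-monitor.log 2>&1
```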
**FFmpeg exit code 187**: Indicates NVENC failure due to lost GPU access.
The monitor catches this condition before users report playback failures.
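If the alert fires, a manual check shows the underlying nvidia-smi error in its structured output (the `--output json` flag comes from the script; the ssh form mirrors the manual check above):

```bash
# Inspect the failure details the monitor saw
ssh ubuntu-manticore "python3 ~/scripts/jellyfin_gpu_monitor.py --check --output json"
```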
## Troubleshooting

### Common Issues
@@ -116,6 +137,7 @@ volumes:
2. **Transcoding failures**: Verify codec support for your GPU generation
3. **Slow playback start**: Check network mount performance
4. **Cache filling up**: Monitor trickplay/thumbnail generation
5. **FFmpeg exit code 187**: GPU access lost - monitor should auto-restart

### Diagnostic Commands
```bash
monitoring/scripts/jellyfin_gpu_monitor.py (new file, 395 lines)
@@ -0,0 +1,395 @@
#!/usr/bin/env python3
"""
Jellyfin GPU Health Monitor with Discord Alerts

Monitors Jellyfin container's GPU access and sends Discord notifications
when GPU access is lost. Optionally auto-restarts the container.

The GTX 1070 in ubuntu-manticore is shared between Jellyfin and Tdarr.
GPU access can be lost after driver updates, Docker restarts, or other
runtime issues - this monitor detects that condition before users report
playback failures.

Usage:
    # Basic health check
    python3 jellyfin_gpu_monitor.py --check

    # Check with Discord alerts
    python3 jellyfin_gpu_monitor.py --check --discord-alerts

    # Check and auto-restart if GPU lost
    python3 jellyfin_gpu_monitor.py --check --auto-restart

    # Full monitoring with alerts and auto-restart
    python3 jellyfin_gpu_monitor.py --check --discord-alerts --auto-restart

    # Test Discord integration
    python3 jellyfin_gpu_monitor.py --discord-test
"""

import argparse
import json
import logging
import shlex
import subprocess
import sys
import time
from dataclasses import dataclass, asdict
from datetime import datetime
from typing import Optional

import requests


@dataclass
class GPUStatus:
    timestamp: str
    container_name: str
    gpu_accessible: bool
    gpu_name: Optional[str] = None
    gpu_temp: Optional[int] = None
    driver_version: Optional[str] = None
    cuda_version: Optional[str] = None
    error: Optional[str] = None


@dataclass
class ContainerStatus:
    running: bool
    status: str
    uptime: Optional[str] = None
    error: Optional[str] = None


@dataclass
class HealthStatus:
    timestamp: str
    overall_healthy: bool
    gpu_status: GPUStatus
    container_status: ContainerStatus
    action_taken: Optional[str] = None


class DiscordNotifier:
    def __init__(self, webhook_url: str, timeout: int = 10):
        self.webhook_url = webhook_url
        self.timeout = timeout
        self.logger = logging.getLogger(f"{__name__}.DiscordNotifier")

    def send_alert(self, title: str, description: str, color: int = 0xff6b6b,
                   fields: list = None) -> bool:
        """Send embed alert to Discord."""
        embed = {
            "title": title,
            "description": description,
            "color": color,
            "timestamp": datetime.now().isoformat(),
            "fields": fields or []
        }

        payload = {
            "username": "Jellyfin GPU Monitor",
            "embeds": [embed]
        }

        try:
            response = requests.post(
                self.webhook_url,
                json=payload,
                timeout=self.timeout
            )
            response.raise_for_status()
            self.logger.info("Discord notification sent successfully")
            return True
        except Exception as e:
            self.logger.error(f"Failed to send Discord notification: {e}")
            return False

    def send_gpu_lost_alert(self, gpu_status: GPUStatus, auto_restart: bool) -> bool:
        """Send alert when GPU access is lost."""
        action = "Container will be automatically restarted" if auto_restart else "Manual intervention required"

        fields = [
            {"name": "Container", "value": gpu_status.container_name, "inline": True},
            {"name": "Error", "value": gpu_status.error or "Unknown", "inline": True},
            {"name": "Action", "value": action, "inline": False}
        ]

        return self.send_alert(
            title="Jellyfin GPU Access Lost",
            description="The Jellyfin container has lost access to the NVIDIA GPU. Transcoding will fail until resolved.",
            color=0xff6b6b,  # Red
            fields=fields
        )

    def send_gpu_restored_alert(self, gpu_status: GPUStatus) -> bool:
        """Send alert when GPU access is restored."""
        fields = [
            {"name": "Container", "value": gpu_status.container_name, "inline": True},
            {"name": "GPU", "value": gpu_status.gpu_name or "Unknown", "inline": True},
            {"name": "Driver", "value": gpu_status.driver_version or "Unknown", "inline": True}
        ]

        return self.send_alert(
            title="Jellyfin GPU Access Restored",
            description="GPU access has been restored. Hardware transcoding should now work.",
            color=0x28a745,  # Green
            fields=fields
        )


class JellyfinGPUMonitor:
    def __init__(self, container_name: str = "jellyfin",
                 discord_webhook: str = None,
                 enable_discord: bool = False,
                 auto_restart: bool = False,
                 ssh_host: str = None):
        self.container_name = container_name
        self.auto_restart = auto_restart
        self.ssh_host = ssh_host
        self.logger = logging.getLogger(__name__)

        self.discord = None
        if enable_discord and discord_webhook:
            self.discord = DiscordNotifier(discord_webhook)

    def _run_command(self, cmd: list, timeout: int = 30) -> tuple:
        """Run command locally or via SSH."""
        if self.ssh_host:
            # Quote the remote command so pipes/braces in arguments survive the remote shell
            cmd = ["ssh", self.ssh_host, shlex.join(cmd)]

        try:
            result = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                timeout=timeout
            )
            return result.returncode, result.stdout.strip(), result.stderr.strip()
        except subprocess.TimeoutExpired:
            return -1, "", "Command timed out"
        except Exception as e:
            return -1, "", str(e)

    def check_container_status(self) -> ContainerStatus:
        """Check if Jellyfin container is running."""
        cmd = ["docker", "inspect", "--format",
               "{{.State.Running}}|{{.State.Status}}|{{.State.StartedAt}}",
               self.container_name]

        code, stdout, stderr = self._run_command(cmd)

        if code != 0:
            return ContainerStatus(
                running=False,
                status="not_found",
                error=stderr or "Container not found"
            )

        parts = stdout.split("|")
        running = parts[0].lower() == "true"
        status = parts[1] if len(parts) > 1 else "unknown"
        started_at = parts[2] if len(parts) > 2 else None

        return ContainerStatus(
            running=running,
            status=status,
            uptime=started_at
        )

    def check_gpu_access(self) -> GPUStatus:
        """Check if container has GPU access via nvidia-smi."""
        timestamp = datetime.now().isoformat()

        # Run nvidia-smi inside the container
        cmd = ["docker", "exec", self.container_name, "nvidia-smi",
               "--query-gpu=name,temperature.gpu,driver_version",
               "--format=csv,noheader,nounits"]

        code, stdout, stderr = self._run_command(cmd)

        if code != 0:
            # Try basic nvidia-smi to get more error info
            cmd_basic = ["docker", "exec", self.container_name, "nvidia-smi"]
            _, _, stderr_basic = self._run_command(cmd_basic)

            return GPUStatus(
                timestamp=timestamp,
                container_name=self.container_name,
                gpu_accessible=False,
                error=stderr_basic or stderr or "nvidia-smi failed"
            )

        # Parse nvidia-smi output
        try:
            parts = [p.strip() for p in stdout.split(",")]
            gpu_name = parts[0] if len(parts) > 0 else None
            gpu_temp = int(parts[1]) if len(parts) > 1 and parts[1].isdigit() else None
            driver_version = parts[2] if len(parts) > 2 else None

            return GPUStatus(
                timestamp=timestamp,
                container_name=self.container_name,
                gpu_accessible=True,
                gpu_name=gpu_name,
                gpu_temp=gpu_temp,
                driver_version=driver_version
            )
        except Exception as e:
            return GPUStatus(
                timestamp=timestamp,
                container_name=self.container_name,
                gpu_accessible=True,  # nvidia-smi worked, just parsing failed
                error=f"Parse error: {e}"
            )

    def restart_container(self) -> bool:
        """Restart the Jellyfin container."""
        self.logger.info(f"Restarting container: {self.container_name}")

        cmd = ["docker", "restart", self.container_name]
        code, stdout, stderr = self._run_command(cmd, timeout=120)

        if code == 0:
            self.logger.info("Container restarted successfully")
            return True
        else:
            self.logger.error(f"Failed to restart container: {stderr}")
            return False

    def health_check(self) -> HealthStatus:
        """Perform full health check."""
        timestamp = datetime.now().isoformat()
        action_taken = None

        # Check container first
        container_status = self.check_container_status()

        if not container_status.running:
            gpu_status = GPUStatus(
                timestamp=timestamp,
                container_name=self.container_name,
                gpu_accessible=False,
                error="Container not running"
            )
            return HealthStatus(
                timestamp=timestamp,
                overall_healthy=False,
                gpu_status=gpu_status,
                container_status=container_status
            )

        # Check GPU access
        gpu_status = self.check_gpu_access()

        # Handle GPU access lost
        if not gpu_status.gpu_accessible:
            self.logger.warning(f"GPU access lost: {gpu_status.error}")

            # Send Discord alert
            if self.discord:
                self.discord.send_gpu_lost_alert(gpu_status, self.auto_restart)

            # Auto-restart if enabled
            if self.auto_restart:
                if self.restart_container():
                    action_taken = "Container restarted"

                    # Re-check GPU after restart
                    time.sleep(5)  # Wait for container to initialize
                    gpu_status = self.check_gpu_access()
                    container_status = self.check_container_status()

                    if gpu_status.gpu_accessible and self.discord:
                        self.discord.send_gpu_restored_alert(gpu_status)
                else:
                    action_taken = "Restart failed"

        overall_healthy = (
            container_status.running and
            gpu_status.gpu_accessible
        )

        return HealthStatus(
            timestamp=timestamp,
            overall_healthy=overall_healthy,
            gpu_status=gpu_status,
            container_status=container_status,
            action_taken=action_taken
        )


def main():
    parser = argparse.ArgumentParser(description='Monitor Jellyfin GPU health')
    parser.add_argument('--container', default='jellyfin', help='Container name')
    parser.add_argument('--check', action='store_true', help='Perform health check')
    parser.add_argument('--discord-webhook',
                        default='https://discord.com/api/webhooks/1404105821549498398/y2Ud1RK9rzFjv58xbypUfQNe3jrL7ZUq1FkQHa4_dfOHm2ylp93z0f4tY0O8Z-vQgKhD',
                        help='Discord webhook URL')
    parser.add_argument('--discord-alerts', action='store_true', help='Enable Discord alerts')
    parser.add_argument('--discord-test', action='store_true', help='Test Discord integration')
    parser.add_argument('--auto-restart', action='store_true', help='Auto-restart on GPU loss')
    parser.add_argument('--ssh-host', default=None, help='SSH host for remote monitoring')
    parser.add_argument('--output', choices=['json', 'pretty'], default='pretty')
    parser.add_argument('--verbose', action='store_true', help='Verbose logging')

    args = parser.parse_args()

    # Configure logging
    level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(
        level=level,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    )

    # Discord test
    if args.discord_test:
        notifier = DiscordNotifier(args.discord_webhook)
        success = notifier.send_alert(
            title="Jellyfin GPU Monitor Test",
            description="Discord integration is working correctly.",
            color=0x00ff00,
            fields=[
                {"name": "Container", "value": args.container, "inline": True},
                {"name": "Status", "value": "Test successful", "inline": True}
            ]
        )
        sys.exit(0 if success else 1)

    # Health check
    if args.check:
        monitor = JellyfinGPUMonitor(
            container_name=args.container,
            discord_webhook=args.discord_webhook,
            enable_discord=args.discord_alerts,
            auto_restart=args.auto_restart,
            ssh_host=args.ssh_host
        )

        result = monitor.health_check()

        if args.output == 'json':
            print(json.dumps(asdict(result), indent=2))
        else:
            print(f"=== Jellyfin GPU Health Check - {result.timestamp} ===")
            status_icon = "✅" if result.overall_healthy else "❌"
            print(f"Overall: {status_icon} {'Healthy' if result.overall_healthy else 'UNHEALTHY'}")
            print(f"\nContainer: {result.container_status.status}")
            print(f"GPU Access: {'Yes' if result.gpu_status.gpu_accessible else 'NO'}")

            if result.gpu_status.gpu_accessible:
                print(f"GPU: {result.gpu_status.gpu_name}")
                print(f"Temperature: {result.gpu_status.gpu_temp}°C")
                print(f"Driver: {result.gpu_status.driver_version}")
            else:
                print(f"Error: {result.gpu_status.error}")

            if result.action_taken:
                print(f"\nAction: {result.action_taken}")

        sys.exit(0 if result.overall_healthy else 1)

    parser.print_help()


if __name__ == '__main__':
    main()