From d723924bdfe9ed695b6e4942d288ac9f35a49b48 Mon Sep 17 00:00:00 2001 From: Cal Corum Date: Sat, 9 Aug 2025 00:47:12 -0500 Subject: [PATCH] CLAUDE: Add complete GPU transcoding solution for Tdarr containers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add working Podman-based GPU Tdarr startup script for Fedora systems - Document critical Docker Desktop GPU issues on Fedora/Nobara systems - Add comprehensive Tdarr configuration examples (CPU and GPU variants) - Add GPU acceleration patterns and troubleshooting documentation - Provide working solution for NVIDIA RTX GPU hardware transcoding Key insight: Podman works immediately for GPU access on Fedora systems where Docker Desktop fails due to virtualization layer conflicts. ๐Ÿค– Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude --- examples/docker/tdarr-node-configurations.md | 158 +++++++++++++++++ patterns/docker/gpu-acceleration.md | 140 +++++++++++++++ .../docker/nvidia-gpu-troubleshooting.md | 161 ++++++++++++++++++ start-tdarr-gpu-podman.sh | 76 +++++++++ 4 files changed, 535 insertions(+) create mode 100644 examples/docker/tdarr-node-configurations.md create mode 100644 patterns/docker/gpu-acceleration.md create mode 100644 reference/docker/nvidia-gpu-troubleshooting.md create mode 100755 start-tdarr-gpu-podman.sh diff --git a/examples/docker/tdarr-node-configurations.md b/examples/docker/tdarr-node-configurations.md new file mode 100644 index 0000000..ce94b7d --- /dev/null +++ b/examples/docker/tdarr-node-configurations.md @@ -0,0 +1,158 @@ +# Tdarr Node Container Configurations + +## Overview +Complete examples for running Tdarr transcoding nodes in containers, covering both CPU-only and GPU-accelerated setups. + +## CPU-Only Configuration (Docker Compose) + +For systems without GPU or when GPU isn't needed: + +```yaml +version: "3.4" +services: + tdarr-node: + container_name: tdarr-node-cpu + image: ghcr.io/haveagitgat/tdarr_node:latest + restart: unless-stopped + environment: + - TZ=America/Chicago + - UMASK_SET=002 + - nodeName=local-workstation-cpu + - serverIP=YOUR_TDARR_SERVER_IP # Replace with your tdarr server IP + - serverPort=8266 + - inContainer=true + - ffmpegVersion=6 + volumes: + # Mount your media from the same NAS share as the server + - /path/to/your/media:/media # Replace with your local media mount + # Temp directory for transcoding cache + - ./temp:/temp +``` + +**Use case**: +- CPU-only transcoding +- Testing Tdarr functionality +- Systems without dedicated GPU +- When GPU drivers aren't available + +## GPU-Accelerated Configuration (Podman) + +**Recommended for Fedora/RHEL/CentOS/Nobara systems:** + +```bash +podman run -d --name tdarr-node-gpu \ + --device nvidia.com/gpu=all \ + --restart unless-stopped \ + -e TZ=America/Chicago \ + -e UMASK_SET=002 \ + -e nodeName=local-workstation-gpu \ + -e serverIP=10.10.0.43 \ + -e serverPort=8266 \ + -e inContainer=true \ + -e ffmpegVersion=6 \ + -e NVIDIA_DRIVER_CAPABILITIES=all \ + -e NVIDIA_VISIBLE_DEVICES=all \ + -v ./media:/media \ + -v ./temp:/temp \ + ghcr.io/haveagitgat/tdarr_node:latest +``` + +**Use case**: +- Hardware video encoding/decoding (NVENC/NVDEC) +- High-performance transcoding +- Multiple concurrent streams +- Fedora-based systems where Podman works better than Docker + +## GPU-Accelerated Configuration (Docker) + +**For Ubuntu/Debian systems where Docker GPU support works:** + +```yaml +version: "3.4" +services: + tdarr-node: + container_name: tdarr-node-gpu + image: 
ghcr.io/haveagitgat/tdarr_node:latest + restart: unless-stopped + environment: + - TZ=America/Chicago + - UMASK_SET=002 + - nodeName=local-workstation-gpu + - serverIP=YOUR_TDARR_SERVER_IP + - serverPort=8266 + - inContainer=true + - ffmpegVersion=6 + - NVIDIA_DRIVER_CAPABILITIES=all + - NVIDIA_VISIBLE_DEVICES=all + volumes: + - /path/to/your/media:/media + - ./temp:/temp + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [gpu] +``` + +## Configuration Parameters + +### Required Environment Variables +- `TZ`: Timezone (e.g., `America/Chicago`) +- `nodeName`: Unique identifier for this node +- `serverIP`: IP address of Tdarr server +- `serverPort`: Tdarr server port (typically 8266) +- `inContainer`: Set to `true` for containerized deployments +- `ffmpegVersion`: FFmpeg version to use (6 recommended) + +### GPU-Specific Variables +- `NVIDIA_DRIVER_CAPABILITIES`: Set to `all` for full GPU access +- `NVIDIA_VISIBLE_DEVICES`: `all` for all GPUs, or specific GPU IDs + +### Volume Mounts +- `/media`: Mount point for media files (must match server configuration) +- `/temp`: Temporary directory for transcoding cache + +## Platform-Specific Recommendations + +### Fedora/RHEL/CentOS/Nobara +- **GPU**: Use Podman (Docker Desktop has GPU issues) +- **CPU**: Docker or Podman both work fine + +### Ubuntu/Debian +- **GPU**: Use Docker with nvidia-container-toolkit +- **CPU**: Docker recommended + +### Testing GPU Functionality + +Verify GPU access inside container: +```bash +# For Podman +podman exec tdarr-node-gpu nvidia-smi + +# For Docker +docker exec tdarr-node-gpu nvidia-smi +``` + +Test NVENC encoding: +```bash +# For Podman +podman exec tdarr-node-gpu /usr/local/bin/tdarr-ffmpeg -f lavfi -i testsrc2=duration=5:size=1920x1080:rate=30 -c:v h264_nvenc -t 5 /tmp/test.mp4 + +# For Docker +docker exec tdarr-node-gpu /usr/local/bin/tdarr-ffmpeg -f lavfi -i testsrc2=duration=5:size=1920x1080:rate=30 -c:v h264_nvenc -t 5 /tmp/test.mp4 +``` + +## Troubleshooting + +- **GPU not detected**: See `reference/docker/nvidia-gpu-troubleshooting.md` +- **Permission issues**: Ensure proper UMASK_SET and volume permissions +- **Connection issues**: Verify serverIP and firewall settings +- **Performance issues**: Monitor CPU/GPU utilization during transcoding + +## Related Documentation + +- `patterns/docker/gpu-acceleration.md` - GPU acceleration patterns +- `reference/docker/nvidia-gpu-troubleshooting.md` - Detailed GPU troubleshooting +- `start-tdarr-gpu-podman.sh` - Ready-to-use Podman startup script \ No newline at end of file diff --git a/patterns/docker/gpu-acceleration.md b/patterns/docker/gpu-acceleration.md new file mode 100644 index 0000000..e4ae81f --- /dev/null +++ b/patterns/docker/gpu-acceleration.md @@ -0,0 +1,140 @@ +# GPU Acceleration in Docker Containers + +## Overview +Patterns for enabling GPU acceleration in Docker containers, particularly for media transcoding workloads. 
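+
+Before touching container configuration, it is worth confirming the host-side pieces are in place. A minimal pre-flight sketch, assuming the NVIDIA driver and `nvidia-container-toolkit` are already installed:
+
+```bash
+# Host-side sanity check (run before any container GPU configuration)
+nvidia-smi            # driver loaded and GPU visible on the host
+nvidia-ctk --version  # NVIDIA Container Toolkit CLI is installed
+
+# The CDI spec is what `--device nvidia.com/gpu=all` relies on; generate it if missing
+ls /etc/cdi/nvidia.yaml 2>/dev/null || \
+  echo "No CDI spec found; run: sudo nvidia-ctk cdi generate --output=/etc/cdi/nvidia.yaml"
+```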
+ +## NVIDIA Container Toolkit Approach + +### Modern Method (CDI - Container Device Interface) +```bash +# Generate CDI configuration +sudo nvidia-ctk cdi generate --output=/etc/cdi/nvidia.yaml + +# Use in docker-compose +services: + app: + devices: + - nvidia.com/gpu=all +``` + +### Legacy Method (Runtime) +```bash +# Configure runtime +sudo nvidia-ctk runtime configure --runtime=docker + +# Use in docker-compose +services: + app: + runtime: nvidia + environment: + - NVIDIA_VISIBLE_DEVICES=all +``` + +### Compose v3 Method (Deploy) +```yaml +services: + app: + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [gpu] +``` + +## Hardware Considerations + +### High-End Consumer GPUs (RTX 4080/4090) +- Excellent NVENC/NVDEC performance +- Multiple concurrent transcoding streams +- High VRAM for large files + +### Multi-GPU Setups +```yaml +environment: + - NVIDIA_VISIBLE_DEVICES=0,1 # Specific GPUs + # or + - NVIDIA_VISIBLE_DEVICES=all # All GPUs +``` + +## Troubleshooting Patterns + +### Gradual Enablement +1. Start with CPU-only configuration +2. Verify container functionality +3. Add GPU support incrementally +4. Test with simple workloads first + +### Fallback Strategy +```yaml +# Include both GPU and CPU fallback +devices: + - /dev/dri:/dev/dri # Intel/AMD GPU fallback +deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [gpu] +``` + +## Common Issues +- Docker service restart failures after toolkit install +- CDI vs runtime configuration conflicts +- Distribution-specific package differences +- Permission issues with device access + +## Critical Fedora/Nobara GPU Issue + +### Problem: Docker Desktop GPU Integration Failure +On Fedora-based systems (Fedora, RHEL, CentOS, Nobara), Docker Desktop has significant compatibility issues with NVIDIA Container Toolkit, resulting in: +- `CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected` +- `unknown or invalid runtime name: nvidia` +- Manual device mounting works but CUDA runtime fails + +### Solution: Use Podman Instead +```bash +# Podman works immediately on Fedora systems +podman run -d --name container-name \ + --device nvidia.com/gpu=all \ + --restart unless-stopped \ + -e NVIDIA_DRIVER_CAPABILITIES=all \ + -e NVIDIA_VISIBLE_DEVICES=all \ + image:tag +``` + +### Why Podman Works Better on Fedora +- Native systemd integration +- Direct hardware access (no VM layer) +- Default container engine for RHEL/Fedora +- Superior NVIDIA Container Toolkit compatibility + +### Testing Commands +```bash +# Test Docker (often fails on Fedora) +docker run --rm --gpus all ubuntu:20.04 nvidia-smi + +# Test Podman (works on Fedora) +podman run --rm --device nvidia.com/gpu=all ubuntu:20.04 nvidia-smi +``` + +### Recommendation by OS +- **Fedora/RHEL/CentOS/Nobara**: Use Podman +- **Ubuntu/Debian**: Use Docker +- **When in doubt**: Test both, use what works + +## Media Transcoding Example (Tdarr) +```bash +# Working Podman command for Tdarr on Fedora +podman run -d --name tdarr-node-gpu \ + --device nvidia.com/gpu=all \ + --restart unless-stopped \ + -e nodeName=workstation-gpu \ + -e serverIP=10.10.0.43 \ + -e NVIDIA_VISIBLE_DEVICES=all \ + -v ./media:/media \ + -v ./tmp:/temp \ + ghcr.io/haveagitgat/tdarr_node:latest +``` \ No newline at end of file diff --git a/reference/docker/nvidia-gpu-troubleshooting.md b/reference/docker/nvidia-gpu-troubleshooting.md new file mode 100644 index 0000000..b8fbcb8 --- /dev/null +++ b/reference/docker/nvidia-gpu-troubleshooting.md 
@@ -0,0 +1,161 @@
+# NVIDIA GPU Container Troubleshooting Guide
+
+## Key Insights from Fedora/Nobara GPU Container Issues
+
+### Problem: Docker Desktop vs Podman GPU Support on Fedora-based Systems
+
+**Issue**: Docker Desktop on Fedora/Nobara systems has significant compatibility issues with NVIDIA Container Toolkit integration, even when properly configured.
+
+**Symptoms**:
+- `CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected`
+- `unknown or invalid runtime name: nvidia`
+- Device nodes created but CUDA runtime fails to initialize
+- Manual device creation (`mknod`) works but CUDA still fails
+
+**Root Cause**: Docker Desktop's virtualization layer interferes with direct hardware access on Fedora-based systems.
+
+## Solution: Use Podman Instead of Docker
+
+### Why Podman Works Better on Fedora
+- **Native integration**: Integrates directly with systemd and Linux security contexts
+- **Direct hardware access**: No VM layer interfering with GPU communication
+- **Superior NVIDIA toolkit support**: Works with the same nvidia-container-toolkit installation
+- **Built for Fedora**: Designed as the default container engine for RHEL/Fedora systems
+
+### Verification Commands
+```bash
+# Test basic GPU access with Podman (should work)
+podman run --rm --device nvidia.com/gpu=all ubuntu:20.04 nvidia-smi
+
+# Test basic GPU access with Docker (often fails on Fedora)
+docker run --rm --gpus all ubuntu:20.04 nvidia-smi
+```
+
+## Complete GPU Container Setup for Fedora/Nobara
+
+### Prerequisites
+1. NVIDIA drivers installed and working (`nvidia-smi` functional)
+2. nvidia-container-toolkit installed via DNF
+3. Podman installed (`dnf install podman`)
+
+### NVIDIA Container Toolkit Installation
+```bash
+# Install NVIDIA container toolkit
+sudo dnf install nvidia-container-toolkit
+
+# Configure Docker runtime (may not work but worth trying)
+sudo nvidia-ctk runtime configure --runtime=docker
+
+# The key insight: Podman works without additional configuration!
+```
+
+### Working Podman Command Template
+```bash
+podman run -d --name container-name \
+  --device nvidia.com/gpu=all \
+  --restart unless-stopped \
+  -e NVIDIA_DRIVER_CAPABILITIES=all \
+  -e NVIDIA_VISIBLE_DEVICES=all \
+  [other options] \
+  image:tag
+```
+
+## Troubleshooting Steps (In Order)
+
+### 1. Verify Host GPU Access
+```bash
+nvidia-smi  # Should show GPU info
+lsmod | grep nvidia  # Should show nvidia modules loaded
+ls -la /dev/nvidia*  # Should show device files
+```
+
+### 2. Test Container Runtime
+```bash
+# Try Podman first (recommended for Fedora)
+podman run --rm --device nvidia.com/gpu=all ubuntu:20.04 nvidia-smi
+
+# If Podman works but Docker doesn't, use Podman for production
+```
+
+### 3. Check NVIDIA Container Toolkit
+```bash
+rpm -qa | grep nvidia-container-toolkit
+nvidia-ctk --version
+```
+
+### 4. Verify CUDA Library Locations
+```bash
+# Find CUDA libraries
+rpm -ql nvidia-driver-cuda-libs | grep libcuda
+ldconfig -p | grep cuda
+
+# Common locations:
+# /usr/lib64/libcuda.so*
+# /usr/lib64/libnvidia-encode.so*
+```
+
+## Common Misconceptions
+
+### ❌ Docker Should Always Work
+**Wrong**: Docker Desktop has known issues with GPU access on some Linux distributions, especially Fedora-based systems.
+
+### ❌ More Privileges = Better GPU Access
+**Wrong**: Adding `privileged: true` or manual device mounting doesn't solve Docker Desktop's fundamental GPU integration issues.
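+
+As a concrete illustration of the two misconceptions above (using the same throwaway `ubuntu:20.04` test image as the verification commands; this is a sketch of the failure pattern, not a guaranteed result on every setup):
+
+```bash
+# Throwing privileges at Docker Desktop usually does not help on Fedora:
+docker run --rm --privileged --gpus all ubuntu:20.04 nvidia-smi
+# -> typically still fails (CUDA_ERROR_NO_DEVICE or runtime errors)
+
+# The plain CDI device request under Podman is normally all that is needed:
+podman run --rm --device nvidia.com/gpu=all ubuntu:20.04 nvidia-smi
+```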
+
+### ❌ The Problem Is the NVIDIA Container Toolkit
+**Wrong**: The toolkit works fine; the issue is Docker Desktop's compatibility with it on Fedora systems.
+
+## Best Practices
+
+### For Fedora/RHEL/CentOS Systems
+1. **Use Podman by default** for GPU containers
+2. Test Docker as fallback, but expect issues
+3. Podman Compose works for orchestration
+4. No special configuration needed beyond nvidia-container-toolkit
+
+### For Production Deployments
+1. Test both Docker and Podman in your environment
+2. Use whichever works reliably (often Podman on Fedora)
+3. Document which container runtime is used
+4. Include runtime in deployment scripts
+
+## Success Indicators
+
+### GPU Container Working Correctly
+- `nvidia-smi` runs inside container
+- NVENC/CUDA applications detect GPU
+- No "CUDA_ERROR_NO_DEVICE" errors
+- Hardware encoder shows as available in applications
+
+### Example: Successful Tdarr Node
+```bash
+# Container logs should show:
+# h264_nvenc-true-true,hevc_nvenc-true-true,av1_nvenc-true-true
+
+# FFmpeg test should succeed:
+podman exec container-name ffmpeg -f lavfi -i testsrc2=duration=1:size=320x240:rate=1 -c:v h264_nvenc -t 1 /tmp/test.mp4
+```
+
+## System-Specific Notes
+
+### Nobara/Fedora 42
+- Docker Desktop: ❌ GPU support problematic
+- Podman: ✅ GPU support works out of the box
+- NVIDIA Driver version: 570.169 (tested working)
+- Container Toolkit version: 1.17.8 (tested working)
+
+### Key Files and Locations
+- GPU devices: `/dev/nvidia*` (auto-created)
+- CUDA libraries: `/usr/lib64/libcuda.so*` (via nvidia-driver-cuda-libs package)
+- Container toolkit: `nvidia-ctk` command available
+- Docker daemon config: `/etc/docker/daemon.json` (may not help)
+
+## Future Reference
+
+When encountering GPU container issues on Fedora-based systems:
+1. Try Podman first; it likely works immediately
+2. Don't waste time troubleshooting Docker Desktop GPU issues
+3. Use the same container images and configurations
+4. Podman commands are nearly identical to Docker commands
+
+This approach saves hours of debugging Docker Desktop GPU integration issues on Fedora systems.
\ No newline at end of file
diff --git a/start-tdarr-gpu-podman.sh b/start-tdarr-gpu-podman.sh
new file mode 100755
index 0000000..9f5d6c8
--- /dev/null
+++ b/start-tdarr-gpu-podman.sh
@@ -0,0 +1,76 @@
+#!/bin/bash
+# Tdarr Node with GPU Support - Podman Script
+# This script starts a Tdarr node container with NVIDIA GPU acceleration using Podman
+
+set -e
+
+CONTAINER_NAME="tdarr-node-gpu"
+SERVER_IP="10.10.0.43"
+SERVER_PORT="8266"
+NODE_NAME="local-workstation-gpu"
+
+echo "🚀 Starting Tdarr Node with GPU support using Podman..."
+
+# Stop and remove existing container if it exists
+if podman ps -a --format "{{.Names}}" | grep -q "^${CONTAINER_NAME}$"; then
+    echo "🛑 Stopping existing container: ${CONTAINER_NAME}"
+    podman stop "${CONTAINER_NAME}" 2>/dev/null || true
+    podman rm "${CONTAINER_NAME}" 2>/dev/null || true
+fi
+
+# Create required directories
+echo "📁 Creating required directories..."
+mkdir -p ./media ./tmp
+
+# Start Tdarr node with GPU support
+echo "🎬 Starting Tdarr Node container..."
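+
+# Optional pre-flight check (added as an illustrative sketch; adjust or remove
+# to taste): the `--device nvidia.com/gpu=all` flag below relies on a CDI spec
+# being present on the host. Warn early if none is found so a failed GPU
+# passthrough is easier to diagnose.
+if [ ! -e /etc/cdi/nvidia.yaml ] && [ ! -d /var/run/cdi ]; then
+    echo "⚠️ No NVIDIA CDI spec found (expected /etc/cdi/nvidia.yaml)."
+    echo "   If GPU passthrough fails, try: sudo nvidia-ctk cdi generate --output=/etc/cdi/nvidia.yaml"
+fi
+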
+podman run -d --name "${CONTAINER_NAME}" \
+    --device nvidia.com/gpu=all \
+    --restart unless-stopped \
+    -e TZ=America/Chicago \
+    -e UMASK_SET=002 \
+    -e nodeName="${NODE_NAME}" \
+    -e serverIP="${SERVER_IP}" \
+    -e serverPort="${SERVER_PORT}" \
+    -e inContainer=true \
+    -e ffmpegVersion=6 \
+    -e NVIDIA_DRIVER_CAPABILITIES=all \
+    -e NVIDIA_VISIBLE_DEVICES=all \
+    -v "$(pwd)/media:/media" \
+    -v "$(pwd)/tmp:/temp" \
+    ghcr.io/haveagitgat/tdarr_node:latest
+
+echo "⏳ Waiting for container to initialize..."
+sleep 5
+
+# Check container status
+if podman ps --format "{{.Names}}" | grep -q "^${CONTAINER_NAME}$"; then
+    echo "✅ Tdarr Node is running successfully!"
+    echo ""
+    echo "📊 Container Status:"
+    podman ps --filter "name=${CONTAINER_NAME}" --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}"
+    echo ""
+    echo "🔍 Testing GPU Access:"
+    if podman exec "${CONTAINER_NAME}" nvidia-smi --query-gpu=name --format=csv,noheader,nounits 2>/dev/null; then
+        echo "🎉 GPU is accessible in container!"
+    else
+        echo "⚠️ GPU test failed, but container is running"
+    fi
+    echo ""
+    echo "🌐 Connection Details:"
+    echo "   Server: ${SERVER_IP}:${SERVER_PORT}"
+    echo "   Node Name: ${NODE_NAME}"
+    echo ""
+    echo "🧪 Test NVENC encoding:"
+    echo "   podman exec ${CONTAINER_NAME} /usr/local/bin/tdarr-ffmpeg -f lavfi -i testsrc2=duration=5:size=1920x1080:rate=30 -c:v h264_nvenc -preset fast -t 5 /tmp/test.mp4"
+    echo ""
+    echo "📋 Container Management:"
+    echo "   View logs: podman logs ${CONTAINER_NAME}"
+    echo "   Stop: podman stop ${CONTAINER_NAME}"
+    echo "   Remove: podman rm ${CONTAINER_NAME}"
+else
+    echo "❌ Failed to start container"
+    echo "📋 Checking logs..."
+    podman logs "${CONTAINER_NAME}" --tail 10
+    exit 1
+fi
\ No newline at end of file