claude-home/scripts/tdarr/start-tdarr-gpu-podman-clean.sh
Commit 7edb4a3a9c by Cal Corum, 2025-08-12 12:18:43 -05:00

CLAUDE: Update VM management patterns and Tdarr operational scripts
- Update patterns/vm-management/README.md: Add comprehensive automation workflows
  - Cloud-init deployment strategies and post-install automation
  - SSH key management integration and security hardening patterns
  - Implementation workflows for new and existing VM provisioning

- Add complete VM management examples and reference documentation
  - examples/vm-management/: Proxmox automation and provisioning examples
  - reference/vm-management/: Troubleshooting guides and best practices
  - scripts/vm-management/: Operational scripts for automated VM setup

- Update reference/docker/tdarr-monitoring-configuration.md: API monitoring integration
  - Document new tdarr_monitor.py integration with existing Discord monitoring
  - Add API-based health checks and cron scheduling examples
  - Enhanced gaming scheduler integration with health verification

- Update Tdarr operational scripts with stability improvements
  - scripts/tdarr/start-tdarr-gpu-podman-clean.sh: Resource limits and CDI GPU access
  - scripts/tdarr/tdarr-schedule-manager.sh: Updated container name references
  - scripts/monitoring/tdarr-timeout-monitor.sh: Enhanced completion monitoring

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>

#!/bin/bash
# Tdarr Unmapped Node with GPU Support - System Stability Optimized
# This script starts an unmapped Tdarr node with resource limits and local NVMe cache
# Updated 2025-08-11: Added container security measures to prevent kernel crashes
# Updated 2025-08-11: Fixed GPU parameter to use Podman CDI standard (--device nvidia.com/gpu=all)
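# Note on CDI: the `--device nvidia.com/gpu=all` flag resolves device names
# against the CDI spec in /etc/cdi/nvidia.yaml (generated by `nvidia-ctk cdi
# generate`) rather than relying on the legacy NVIDIA OCI hook; the spec
# defines which GPUs the "all" alias expands to.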
set -e
CONTAINER_NAME="tdarr-node-gpu-unmapped"
SERVER_IP="10.10.0.43"
SERVER_PORT="8266" # Standard server port
NODE_NAME="nobara-pc-gpu-unmapped"
echo "🚀 Starting UNMAPPED Tdarr Node with GPU support and resource limits..."
# Check for root privileges (required for memlock and other resource limits)
if [ "$EUID" -ne 0 ]; then
echo ""
echo "❌ This script requires root privileges for secure container resource limits."
echo ""
echo "🔒 Root privileges needed for:"
echo " - Memory lock limits (512MB) - prevent GPU memory exhaustion"
echo " - System-level resource limits - protect against container resource abuse"
echo " - GPU device access - privileged container operations"
echo " - Memory/CPU/I/O constraints - full cgroups resource control"
echo ""
echo "🚀 Please run with sudo:"
echo " sudo $0"
echo ""
exit 1
fi
# Check system requirements
echo "🔍 Checking system requirements..."
if ! command -v nvidia-smi &> /dev/null; then
    echo "⚠️ Warning: nvidia-smi not found. GPU access may not work."
fi
if [ ! -f "/etc/cdi/nvidia.yaml" ]; then
    echo "⚠️ Warning: NVIDIA CDI configuration not found at /etc/cdi/nvidia.yaml"
    echo " Run: nvidia-ctk cdi generate --output=/etc/cdi/nvidia.yaml"
fi
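# A quick way to confirm the spec exposes the expected device names is
# `nvidia-ctk cdi list`, which should print entries such as nvidia.com/gpu=all.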
echo "✅ Running with root privileges - full resource limits enabled"
# Stop and remove existing container if it exists
if podman ps -a --format "{{.Names}}" | grep -q "^${CONTAINER_NAME}$"; then
    echo "🛑 Stopping existing container: ${CONTAINER_NAME}"
    podman stop "${CONTAINER_NAME}" 2>/dev/null || true
    podman rm "${CONTAINER_NAME}" 2>/dev/null || true
fi
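# Equivalent one-liner: `podman rm -f --ignore "${CONTAINER_NAME}"` force-removes
# a running container and ignores a missing one; the explicit stop/rm above keeps
# each step visible in the output.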
# Start Tdarr node with GPU support - UNMAPPED VERSION with Resource Limits
echo "🎬 Starting Unmapped Tdarr Node container with resource limits..."
podman run -d --name "${CONTAINER_NAME}" \
    --device nvidia.com/gpu=all \
    --restart unless-stopped \
    --memory=32g \
    --memory-swap=40g \
    --cpus="14" \
    --pids-limit=1000 \
    --ulimit nofile=65536:65536 \
    --ulimit memlock=536870912:536870912 \
    --device-read-bps /dev/nvme0n1:1g \
    --device-write-bps /dev/nvme0n1:1g \
    -e TZ=America/Chicago \
    -e UMASK_SET=002 \
    -e nodeName="${NODE_NAME}" \
    -e serverIP="${SERVER_IP}" \
    -e serverPort="${SERVER_PORT}" \
    -e inContainer=true \
    -e ffmpegVersion=6 \
    -e logLevel=DEBUG \
    -e NVIDIA_DRIVER_CAPABILITIES=all \
    -e NVIDIA_VISIBLE_DEVICES=all \
    -e nodeType=unmapped \
    -e unmappedNodeCache=/cache \
    -v "/mnt/NV2/tdarr-cache:/cache" \
    ghcr.io/haveagitgat/tdarr_node:latest
echo "⏳ Waiting for container to initialize..."
sleep 5
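# Note: the fixed sleep is a simple heuristic. A readiness loop would be more
# robust; a sketch (hypothetical retry count):
#   for _ in $(seq 1 12); do
#       podman inspect -f '{{.State.Running}}' "${CONTAINER_NAME}" 2>/dev/null | grep -q true && break
#       sleep 1
#   done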
# Check container status
if podman ps --format "{{.Names}}" | grep -q "^${CONTAINER_NAME}$"; then
echo "✅ Unmapped Tdarr Node is running successfully!"
echo ""
echo "📊 Container Status:"
podman ps --filter "name=${CONTAINER_NAME}" --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}"
echo ""
echo "🔍 Testing GPU Access (using Podman CDI standard):"
if podman exec "${CONTAINER_NAME}" nvidia-smi --query-gpu=name --format=csv,noheader,nounits 2>/dev/null; then
echo "🎉 GPU is accessible in container!"
else
echo "⚠️ GPU test failed, but container is running"
echo " Check: sudo nvidia-ctk cdi generate --output=/etc/cdi/nvidia.yaml"
fi
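    # If the GPU test fails, confirming that the CDI device nodes were injected
    # can narrow the problem down (a hedged diagnostic; exact device names vary
    # by driver setup):
    #   podman exec "${CONTAINER_NAME}" ls -l /dev/nvidia*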
echo ""
echo "🛡️ Resource Limits Applied:"
echo " Memory: 32GB limit + 8GB swap"
echo " CPUs: 14 cores (2 reserved for system)"
echo " PIDs: 1000 limit"
echo " NVMe I/O: 1GB/s read/write limit"
echo ""
echo "🏗️ Architecture: Unmapped Node"
echo " - No direct media volume mounts"
echo " - Downloads files to local NVMe cache"
echo " - Prevents CIFS streaming during transcoding"
echo " - Eliminates kernel memory corruption risk"
echo ""
echo "🌐 Connection Details:"
echo " Server: ${SERVER_IP}:${SERVER_PORT}"
echo " Node Name: ${NODE_NAME}"
echo " Node Type: Unmapped"
echo " Web UI: http://${SERVER_IP}:8265"
echo ""
echo "📋 Container Management:"
echo " View logs: podman logs ${CONTAINER_NAME}"
echo " Stop: podman stop ${CONTAINER_NAME}"
echo " Remove: podman rm ${CONTAINER_NAME}"
echo ""
echo "⚠️ Important Configuration Requirements:"
echo " - Server must have 'Allow unmapped Nodes' enabled"
echo " - NVIDIA CDI configuration required for GPU access"
echo " - cgroups V2 recommended for full resource limit support"
else
echo "❌ Failed to start container"
echo "📋 Checking logs..."
podman logs "${CONTAINER_NAME}" --tail 10
exit 1
fi
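
# Optional: to have the node start on boot, a systemd unit can be generated from
# the running container (a sketch; newer Podman releases favor Quadlet instead):
#   sudo podman generate systemd --new --files --name tdarr-node-gpu-unmapped
#   sudo mv container-tdarr-node-gpu-unmapped.service /etc/systemd/system/
#   sudo systemctl daemon-reload && sudo systemctl enable container-tdarr-node-gpu-unmapped.service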