claude-home/scripts/tdarr/start-tdarr-gpu-podman-clean.sh
Commit 7edb4a3a9c by Cal Corum, 2025-08-12 12:18:43 -05:00

CLAUDE: Update VM management patterns and Tdarr operational scripts
- Update patterns/vm-management/README.md: Add comprehensive automation workflows
  - Cloud-init deployment strategies and post-install automation
  - SSH key management integration and security hardening patterns
  - Implementation workflows for new and existing VM provisioning

- Add complete VM management examples and reference documentation
  - examples/vm-management/: Proxmox automation and provisioning examples
  - reference/vm-management/: Troubleshooting guides and best practices
  - scripts/vm-management/: Operational scripts for automated VM setup

- Update reference/docker/tdarr-monitoring-configuration.md: API monitoring integration
  - Document new tdarr_monitor.py integration with existing Discord monitoring
  - Add API-based health checks and cron scheduling examples
  - Enhanced gaming scheduler integration with health verification

- Update Tdarr operational scripts with stability improvements
  - scripts/tdarr/start-tdarr-gpu-podman-clean.sh: Resource limits and CDI GPU access
  - scripts/tdarr/tdarr-schedule-manager.sh: Updated container name references
  - scripts/monitoring/tdarr-timeout-monitor.sh: Enhanced completion monitoring

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>

#!/bin/bash
# Tdarr Unmapped Node with GPU Support - System Stability Optimized
# This script starts an unmapped Tdarr node with resource limits and local NVMe cache
# Updated 2025-08-11: Added container security measures to prevent kernel crashes
# Updated 2025-08-11: Fixed GPU parameter to use Podman CDI standard (--device nvidia.com/gpu=all)
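# Note on CDI: the `--device nvidia.com/gpu=all` flag resolves device names
# against the CDI spec in /etc/cdi/nvidia.yaml (generated by `nvidia-ctk cdi
# generate`) rather than relying on the legacy NVIDIA OCI hook; the spec
# defines which GPUs the "all" alias expands to.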
set -e
CONTAINER_NAME="tdarr-node-gpu-unmapped"
SERVER_IP="10.10.0.43"
SERVER_PORT="8266" # Standard server port
NODE_NAME="nobara-pc-gpu-unmapped"
echo "🚀 Starting UNMAPPED Tdarr Node with GPU support and resource limits..."
# Check for root privileges (required for memlock and other resource limits)
if [ "$EUID" -ne 0 ]; then
echo ""
echo "❌ This script requires root privileges for secure container resource limits."
echo ""
echo "🔒 Root privileges needed for:"
echo " - Memory lock limits (512MB) - prevent GPU memory exhaustion"
echo " - System-level resource limits - protect against container resource abuse"
echo " - GPU device access - privileged container operations"
echo " - Memory/CPU/I/O constraints - full cgroups resource control"
echo ""
echo "🚀 Please run with sudo:"
echo " sudo $0"
echo ""
exit 1
fi
# Check system requirements
echo "🔍 Checking system requirements..."
if ! command -v nvidia-smi &> /dev/null; then
    echo "⚠️ Warning: nvidia-smi not found. GPU access may not work."
fi
if [ ! -f "/etc/cdi/nvidia.yaml" ]; then
    echo "⚠️ Warning: NVIDIA CDI configuration not found at /etc/cdi/nvidia.yaml"
    echo " Run: nvidia-ctk cdi generate --output=/etc/cdi/nvidia.yaml"
fi
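# A quick way to confirm the spec exposes the expected device names is
# `nvidia-ctk cdi list`, which should print entries such as nvidia.com/gpu=all.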
echo "✅ Running with root privileges - full resource limits enabled"
# Stop and remove existing container if it exists
if podman ps -a --format "{{.Names}}" | grep -q "^${CONTAINER_NAME}$"; then
    echo "🛑 Stopping existing container: ${CONTAINER_NAME}"
    podman stop "${CONTAINER_NAME}" 2>/dev/null || true
    podman rm "${CONTAINER_NAME}" 2>/dev/null || true
fi
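# Equivalent one-liner: `podman rm -f --ignore "${CONTAINER_NAME}"` force-removes
# a running container and ignores a missing one; the explicit stop/rm above keeps
# each step visible in the output.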
# Start Tdarr node with GPU support - UNMAPPED VERSION with Resource Limits
echo "🎬 Starting Unmapped Tdarr Node container with resource limits..."
podman run -d --name "${CONTAINER_NAME}" \
    --device nvidia.com/gpu=all \
    --restart unless-stopped \
    --memory=32g \
    --memory-swap=40g \
    --cpus="14" \
    --pids-limit=1000 \
    --ulimit nofile=65536:65536 \
    --ulimit memlock=536870912:536870912 \
    --device-read-bps /dev/nvme0n1:1g \
    --device-write-bps /dev/nvme0n1:1g \
    -e TZ=America/Chicago \
    -e UMASK_SET=002 \
    -e nodeName="${NODE_NAME}" \
    -e serverIP="${SERVER_IP}" \
    -e serverPort="${SERVER_PORT}" \
    -e inContainer=true \
    -e ffmpegVersion=6 \
    -e logLevel=DEBUG \
    -e NVIDIA_DRIVER_CAPABILITIES=all \
    -e NVIDIA_VISIBLE_DEVICES=all \
    -e nodeType=unmapped \
    -e unmappedNodeCache=/cache \
    -v "/mnt/NV2/tdarr-cache:/cache" \
    ghcr.io/haveagitgat/tdarr_node:latest
echo "⏳ Waiting for container to initialize..."
sleep 5
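# Note: the fixed sleep is a simple heuristic. A readiness loop would be more
# robust; a sketch (hypothetical retry count):
#   for _ in $(seq 1 12); do
#       podman inspect -f '{{.State.Running}}' "${CONTAINER_NAME}" 2>/dev/null | grep -q true && break
#       sleep 1
#   done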
# Check container status
if podman ps --format "{{.Names}}" | grep -q "^${CONTAINER_NAME}$"; then
echo "✅ Unmapped Tdarr Node is running successfully!"
echo ""
echo "📊 Container Status:"
podman ps --filter "name=${CONTAINER_NAME}" --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}"
echo ""
echo "🔍 Testing GPU Access (using Podman CDI standard):"
if podman exec "${CONTAINER_NAME}" nvidia-smi --query-gpu=name --format=csv,noheader,nounits 2>/dev/null; then
echo "🎉 GPU is accessible in container!"
else
echo "⚠️ GPU test failed, but container is running"
echo " Check: sudo nvidia-ctk cdi generate --output=/etc/cdi/nvidia.yaml"
fi
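    # If the GPU test fails, confirming that the CDI device nodes were injected
    # can narrow the problem down (a hedged diagnostic; exact device names vary
    # by driver setup):
    #   podman exec "${CONTAINER_NAME}" ls -l /dev/nvidia*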
echo ""
echo "🛡️ Resource Limits Applied:"
echo " Memory: 32GB limit + 8GB swap"
echo " CPUs: 14 cores (2 reserved for system)"
echo " PIDs: 1000 limit"
echo " NVMe I/O: 1GB/s read/write limit"
echo ""
echo "🏗️ Architecture: Unmapped Node"
echo " - No direct media volume mounts"
echo " - Downloads files to local NVMe cache"
echo " - Prevents CIFS streaming during transcoding"
echo " - Eliminates kernel memory corruption risk"
echo ""
echo "🌐 Connection Details:"
echo " Server: ${SERVER_IP}:${SERVER_PORT}"
echo " Node Name: ${NODE_NAME}"
echo " Node Type: Unmapped"
echo " Web UI: http://${SERVER_IP}:8265"
echo ""
echo "📋 Container Management:"
echo " View logs: podman logs ${CONTAINER_NAME}"
echo " Stop: podman stop ${CONTAINER_NAME}"
echo " Remove: podman rm ${CONTAINER_NAME}"
echo ""
echo "⚠️ Important Configuration Requirements:"
echo " - Server must have 'Allow unmapped Nodes' enabled"
echo " - NVIDIA CDI configuration required for GPU access"
echo " - cgroups V2 recommended for full resource limit support"
else
echo "❌ Failed to start container"
echo "📋 Checking logs..."
podman logs "${CONTAINER_NAME}" --tail 10
exit 1
fi
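
# Optional: to have the node start on boot, a systemd unit can be generated from
# the running container (a sketch; newer Podman releases favor Quadlet instead):
#   sudo podman generate systemd --new --files --name tdarr-node-gpu-unmapped
#   sudo mv container-tdarr-node-gpu-unmapped.service /etc/systemd/system/
#   sudo systemctl daemon-reload && sudo systemctl enable container-tdarr-node-gpu-unmapped.service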