From 26f5b82afad5ead481811e9a226a3cc2422a0fb0 Mon Sep 17 00:00:00 2001
From: Cal Corum
Date: Sun, 10 Aug 2025 16:22:57 -0500
Subject: [PATCH] CLAUDE: Enhance operational scripts and add mobile SSH documentation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

SSH Homelab Setup:
- Add mobile device SSH access documentation (Termius setup)
- Include prerequisites checklist and key transfer process
- Document network discovery commands for mobile access

Tdarr Timeout Monitor:
- Add comprehensive debug logging with structured levels (INFO/DEBUG/ERROR/WARN/SUCCESS)
- Implement command execution timing and detailed error tracking
- Enhance container status verification and error handling
- Add log entry counting and detailed output analysis
- Improve cleanup operations with better failure detection
- Add performance metrics and duration tracking for all operations

Tdarr Node Startup:
- Add unmapped node cache volume mapping for media access
- Complete production configuration for distributed transcoding

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude
---
 examples/networking/ssh-homelab-setup.md      |  44 +++
 scripts/monitoring/tdarr-timeout-monitor.sh   | 250 +++++++++++++++---
 scripts/tdarr/start-tdarr-gpu-podman-clean.sh |   1 +
 3 files changed, 265 insertions(+), 30 deletions(-)

diff --git a/examples/networking/ssh-homelab-setup.md b/examples/networking/ssh-homelab-setup.md
index 4b76c38..aec4f37 100644
--- a/examples/networking/ssh-homelab-setup.md
+++ b/examples/networking/ssh-homelab-setup.md
@@ -318,6 +318,50 @@ AllowUsers cal  # Restrict SSH access
 sudo systemctl restart sshd
 ```
 
+## Mobile Device SSH Access
+
+### Termius Setup (iPhone/iPad)
+Connect to the local development machine using the existing homelab keys:
+
+```yaml
+Connection Details:
+  Host: 10.0.0.206 (or current eno1 IP)
+  Username: cal
+  Port: 22
+  Key: homelab_rsa (same key used for server access)
+```
+
+### Prerequisites Checklist
+Before mobile SSH access works, ensure:
+
+1. **SSH service is running:**
+   ```bash
+   sudo systemctl start sshd
+   sudo systemctl enable sshd
+   ```
+
+2. **Firewall allows SSH:**
+   ```bash
+   sudo firewall-cmd --list-services   # should include 'ssh'
+   # If 'ssh' is missing, allow it:
+   # sudo firewall-cmd --permanent --add-service=ssh && sudo firewall-cmd --reload
+   ```
+
+3. **Authorized keys configured:**
+   ```bash
+   # homelab_rsa.pub should be in ~/.ssh/authorized_keys
+   cat ~/.ssh/homelab_rsa.pub >> ~/.ssh/authorized_keys
+   chmod 600 ~/.ssh/authorized_keys
+   ```
+
+### Key Transfer Process
+1. Copy private key: `cp ~/.ssh/homelab_rsa ./homelab_rsa_for_mobile`
+2. Transfer securely to mobile device (AirDrop, secure file share)
+3. Import into Termius app
+4. Clean up: `rm homelab_rsa_for_mobile`
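+
+An alternative sketch (not part of the original setup): generate a dedicated key for the mobile device so the shared `homelab_rsa` private key never leaves the workstation. The `homelab_mobile_ed25519` name and the "termius-mobile" comment below are illustrative:
+
+```bash
+# Generate a separate key pair just for Termius (file name is an example)
+ssh-keygen -t ed25519 -f ~/.ssh/homelab_mobile_ed25519 -C "termius-mobile"
+
+# Authorize the new public key on the target machine
+ssh-copy-id -i ~/.ssh/homelab_mobile_ed25519.pub cal@10.0.0.206
+
+# Import only homelab_mobile_ed25519 (the private key) into Termius; it can later be
+# revoked by removing its line from ~/.ssh/authorized_keys
+```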
+
+### Network Discovery
+Find the local machine's IP: `ip addr show eno1 | grep 'inet ' | awk '{print $2}' | cut -d'/' -f1`
+
 ## Related Documentation
 
 - Patterns: `patterns/networking/ssh-key-management.md`
diff --git a/scripts/monitoring/tdarr-timeout-monitor.sh b/scripts/monitoring/tdarr-timeout-monitor.sh
index 278545c..b99bfba 100755
--- a/scripts/monitoring/tdarr-timeout-monitor.sh
+++ b/scripts/monitoring/tdarr-timeout-monitor.sh
@@ -67,9 +67,49 @@ $ping_message"
 
 # Create script directory
 mkdir -p "$SCRIPT_DIR"
 
-# Logging functions
+# Enhanced logging functions
 log_message() {
-    echo "$(date '+%Y-%m-%d %H:%M:%S') - $1" >> "$LOG_FILE"
+    echo "$(date '+%Y-%m-%d %H:%M:%S') [INFO] - $1" >> "$LOG_FILE"
+}
+
+log_debug() {
+    echo "$(date '+%Y-%m-%d %H:%M:%S') [DEBUG] - $1" >> "$LOG_FILE"
+}
+
+log_error() {
+    echo "$(date '+%Y-%m-%d %H:%M:%S') [ERROR] - $1" >> "$LOG_FILE"
+}
+
+log_warning() {
+    echo "$(date '+%Y-%m-%d %H:%M:%S') [WARN] - $1" >> "$LOG_FILE"
+}
+
+log_success() {
+    echo "$(date '+%Y-%m-%d %H:%M:%S') [SUCCESS] - $1" >> "$LOG_FILE"
+}
+
+# Log command execution with timing
+log_command() {
+    local cmd="$1"
+    local start_time=$(date +%s.%N)
+    log_debug "Executing command: $cmd"
+
+    # Declare first, then assign, so $? reflects the wrapped command rather than 'local'
+    local result
+    result=$(eval "$cmd" 2>&1)
+    local exit_code=$?
+    local end_time=$(date +%s.%N)
+    local duration=$(echo "$end_time - $start_time" | bc 2>/dev/null || echo "0")
+
+    if [[ $exit_code -eq 0 ]]; then
+        log_debug "Command completed in ${duration}s (exit: $exit_code)"
+        if [[ -n "$result" ]]; then
+            log_debug "Command output: $result"
+        fi
+    else
+        log_error "Command failed in ${duration}s (exit: $exit_code): $result"
+    fi
+
+    echo "$result"
+    return $exit_code
 }
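+
+# Usage sketch (illustrative command, not taken from this script): log_command both logs
+# and echoes the wrapped command's output, and its return code mirrors that command, so a
+# caller captures the output into a pre-declared variable and reads $? immediately after:
+#
+#   local output
+#   output=$(log_command "podman ps --format '{{.Names}}'")
+#   local status=$?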
 
 rotate_log() {
@@ -97,20 +137,43 @@ CURRENT_TIME=$(date +%s)
 
 # Function to check server logs for limbo timeouts
 check_server_timeouts() {
-    log_message "Checking server logs for limbo timeouts"
+    local start_time=$(date +%s.%N)
+    log_message "Starting server timeout check"
 
     # Get server logs since last check (convert to docker logs format)
     local since_docker=$(date -d "@$LAST_CHECK" -u +%Y-%m-%dT%H:%M:%S.000000000Z)
+    log_debug "Checking server logs since: $since_docker (timestamp: $LAST_CHECK)"
 
-    local timeouts=$(ssh "$SERVER_HOST" "docker logs --since='$since_docker' tdarr-clean 2>&1" | \
-        grep -i "has been in limbo" | \
+    # Execute SSH command with detailed logging
+    local ssh_cmd="ssh \"$SERVER_HOST\" \"docker logs --since='$since_docker' tdarr-clean 2>&1\""
+    local server_logs
+    server_logs=$(log_command "$ssh_cmd")
+    local ssh_exit=$?
+
+    if [[ $ssh_exit -ne 0 ]]; then
+        log_error "Failed to retrieve server logs via SSH"
+        return 1
+    fi
+
+    log_debug "Retrieved $(echo "$server_logs" | wc -l) lines of server logs"
+
+    # Process logs for limbo timeouts
+    local limbo_lines=$(echo "$server_logs" | grep -i "has been in limbo")
+    local limbo_count=$(echo "$limbo_lines" | grep -c "has been in limbo" 2>/dev/null || echo "0")
+    log_debug "Found $limbo_count limbo timeout entries in logs"
+
+    local timeouts=$(echo "$limbo_lines" | \
         grep -o "/media/[^']*" | \
         sed 's|/media/||')
 
     if [[ -n "$timeouts" ]]; then
         local count=$(echo "$timeouts" | wc -l)
         local files=$(echo "$timeouts" | head -3)  # Show first 3 files
-        log_message "Found $count file(s) timed out in staging section"
+        log_warning "Found $count file(s) timed out in staging section"
+
+        # Log each timed out file for debugging
+        echo "$timeouts" | while IFS= read -r file; do
+            [[ -n "$file" ]] && log_debug "Timed out file: $file"
+        done
 
         local message="# 🎬 Tdarr Monitor
 **$count file(s) timed out in staging section:**"
@@ -130,21 +193,55 @@ $file_list
 
 Files were automatically removed from staging and will retry."
 
         send_discord_notification "$message"
+        log_success "Sent Discord notification for $count timed out files"
+    else
+        log_debug "No files found timed out in staging section"
     fi
+
+    local end_time=$(date +%s.%N)
+    local duration=$(echo "$end_time - $start_time" | bc 2>/dev/null || echo "0")
+    log_message "Server timeout check completed in ${duration}s"
 }
 
 # Function to check node logs for worker stalls
 check_node_stalls() {
-    log_message "Checking node logs for worker stalls"
+    local start_time=$(date +%s.%N)
+    log_message "Starting node stall check"
+
+    # Check if node container is running
+    local container_status=$(podman ps --format "{{.Status}}" --filter "name=$NODE_CONTAINER" 2>/dev/null)
+    if [[ -z "$container_status" ]]; then
+        log_warning "Node container '$NODE_CONTAINER' not found or not running"
+        return 1
+    fi
+    log_debug "Node container status: $container_status"
 
     # Get node logs since last check
-    local stalls=$(podman logs --since="@$LAST_CHECK" "$NODE_CONTAINER" 2>&1 | \
-        grep -i "worker.*stalled\|worker.*disconnected")
+    local node_cmd="podman logs --since=\"@$LAST_CHECK\" \"$NODE_CONTAINER\" 2>&1"
+    local node_logs
+    node_logs=$(log_command "$node_cmd")
+    local node_exit=$?
+
+    if [[ $node_exit -ne 0 ]]; then
+        log_error "Failed to retrieve node logs"
+        return 1
+    fi
+
+    log_debug "Retrieved $(echo "$node_logs" | wc -l) lines of node logs"
+
+    # Process logs for worker stalls
+    local stalls=$(echo "$node_logs" | grep -i "worker.*stalled\|worker.*disconnected")
+    local stall_count=$(echo "$stalls" | grep -c "worker.*" 2>/dev/null || echo "0")
+    log_debug "Found $stall_count potential worker stall entries"
 
     if [[ -n "$stalls" ]]; then
         local count=$(echo "$stalls" | wc -l)
         local workers=$(echo "$stalls" | grep -o "Worker [^ ]*" | sort -u | head -3)
-        log_message "Found $count worker stall(s)"
+        log_warning "Found $count worker stall(s)"
+
+        # Log each stalled worker for debugging
+        echo "$stalls" | while IFS= read -r stall_line; do
+            [[ -n "$stall_line" ]] && log_debug "Worker stall: $stall_line"
+        done
 
         local message="# 🎬 Tdarr Monitor
 **$count worker stall(s) detected:**"
@@ -157,42 +254,75 @@ $worker_list
 
 Workers were automatically cancelled and will restart."
 
         send_discord_notification "$message"
+        log_success "Sent Discord notification for $count worker stalls"
+    else
+        log_debug "No worker stalls detected"
+    fi
+
+    local end_time=$(date +%s.%N)
+    local duration=$(echo "$end_time - $start_time" | bc 2>/dev/null || echo "0")
+    log_message "Node stall check completed in ${duration}s"
 }
 
 # Function to check and clean stuck work directories
 check_stuck_workdirs() {
-    log_message "Checking for stuck work directories"
+    local start_time=$(date +%s.%N)
+    log_message "Starting stuck work directory check"
 
     # Find work directories that are failing to be cleaned up
-    local stuck_dirs=$(ssh "$SERVER_HOST" "docker logs --since='30m' tdarr-clean 2>&1" | \
-        grep "ENOTEMPTY.*tdarr-workDir" | \
+    local workdir_cmd="ssh \"$SERVER_HOST\" \"docker logs --since='30m' tdarr-clean 2>&1\""
+    local workdir_logs
+    workdir_logs=$(log_command "$workdir_cmd")
+    local workdir_exit=$?
+
+    if [[ $workdir_exit -ne 0 ]]; then
+        log_error "Failed to retrieve work directory logs from server"
+        return 1
+    fi
+
+    local enotempty_lines=$(echo "$workdir_logs" | grep "ENOTEMPTY.*tdarr-workDir")
+    local enotempty_count=$(echo "$enotempty_lines" | grep -c "ENOTEMPTY" 2>/dev/null || echo "0")
+    log_debug "Found $enotempty_count ENOTEMPTY errors for work directories"
+
+    local stuck_dirs=$(echo "$enotempty_lines" | \
         grep -o "tdarr-workDir[^']*" | \
         sort -u)
 
     if [[ -n "$stuck_dirs" ]]; then
         local count=$(echo "$stuck_dirs" | wc -l)
         local cleaned=0
+        log_warning "Found $count stuck work directories to clean"
 
         echo "$stuck_dirs" | while IFS= read -r dir; do
             if [[ -n "$dir" ]]; then
-                log_message "Attempting to clean stuck directory: $dir"
+                log_debug "Attempting to clean stuck directory: $dir"
 
-                # Force cleanup of stuck directory
-                ssh "$SERVER_HOST" "docker exec tdarr-clean sh -c '
+                # Force cleanup of stuck directory with detailed logging
+                local cleanup_cmd="ssh \"$SERVER_HOST\" \"docker exec tdarr-clean sh -c '
                     if [ -d \"/temp/$dir\" ]; then
                         echo \"Cleaning /temp/$dir\"
                         find \"/temp/$dir\" -type f -name \"*.tmp\" -delete 2>/dev/null
                         find \"/temp/$dir\" -type f -delete 2>/dev/null
                         find \"/temp/$dir\" -name \".*\" -delete 2>/dev/null
                        rmdir \"/temp/$dir\" 2>/dev/null && echo \"Successfully removed $dir\"
+                    else
+                        echo \"Directory /temp/$dir not found\"
                     fi
-                '" && ((cleaned++))
+                '\""
+
+                local cleanup_result
+                cleanup_result=$(log_command "$cleanup_cmd")
+                local cleanup_exit=$?
+
+                if [[ $cleanup_exit -eq 0 ]] && [[ "$cleanup_result" == *"Successfully removed"* ]]; then
+                    ((cleaned++))
+                    log_success "Successfully cleaned directory: $dir"
+                else
+                    log_error "Failed to clean directory: $dir (exit: $cleanup_exit, output: $cleanup_result)"
+                fi
             fi
         done
 
         if [[ $cleaned -gt 0 ]]; then
-            log_message "Successfully cleaned $cleaned stuck work directories"
+            log_success "Successfully cleaned $cleaned of $count stuck work directories"
             local message="# 🎬 Tdarr Monitor
 **Successfully cleaned $cleaned stuck work directories:**
 - Removed partial download files (.tmp)
@@ -201,7 +331,7 @@ check_stuck_workdirs() {
 
 System maintenance completed automatically."
             send_discord_notification "$message"
         else
-            log_message "Failed to clean $count stuck work directories"
+            log_error "Failed to clean any of the $count stuck work directories"
             local dir_list=$(echo "$stuck_dirs" | sed 's/^/- /')
             local message="# 🎬 Tdarr Monitor
 **$count stuck work directories detected:**
@@ -210,20 +340,45 @@ $dir_list
 
 Cleanup failed - manual intervention may be needed <@258104532423147520>."
             send_discord_notification "$message"
         fi
+    else
+        log_debug "No stuck work directories detected"
     fi
+
+    local end_time=$(date +%s.%N)
+    local duration=$(echo "$end_time - $start_time" | bc 2>/dev/null || echo "0")
+    log_message "Stuck work directory check completed in ${duration}s"
 }
 
 # Function to check for successful completions
 check_completions() {
-    log_message "Checking for successful transcodes"
+    local start_time=$(date +%s.%N)
+    log_message "Starting completion check"
 
     # Check server logs for successful transcodes
     local since_docker=$(date -d "@$LAST_CHECK" -u +%Y-%m-%dT%H:%M:%S.000000000Z)
+    log_debug "Checking for successful transcodes since: $since_docker"
 
-    local successes=$(ssh "$SERVER_HOST" "docker logs --since='$since_docker' tdarr-clean 2>&1" | \
-        grep -i "transcode.*success\|transcode.*complete" | wc -l)
+    local completion_cmd="ssh \"$SERVER_HOST\" \"docker logs --since='$since_docker' tdarr-clean 2>&1\""
+    local completion_logs
+    completion_logs=$(log_command "$completion_cmd")
+    local completion_exit=$?
+
+    if [[ $completion_exit -ne 0 ]]; then
+        log_error "Failed to retrieve completion logs from server"
+        return 1
+    fi
+
+    local success_lines=$(echo "$completion_logs" | grep -i "transcode.*success\|transcode.*complete")
+    # Count only when grep matched something (echo "" | wc -l would report 1)
+    local successes=0
+    [[ -n "$success_lines" ]] && successes=$(echo "$success_lines" | wc -l)
+    log_debug "Found $successes successful transcode completion entries"
 
     if [[ $successes -gt 0 ]]; then
+        log_success "Detected $successes successful transcodes"
+
+        # Log sample success entries for debugging
+        echo "$success_lines" | head -3 | while IFS= read -r success_line; do
+            [[ -n "$success_line" ]] && log_debug "Success entry: $success_line"
+        done
+
         local message="# 🎬 Tdarr Monitor
 **$successes transcode(s) completed successfully:**
 - Processing completed without errors
@@ -231,31 +386,66 @@ check_completions() {
 
 System operating normally."
         send_discord_notification "$message"
+        log_success "Sent Discord notification for $successes completions"
+    else
+        log_debug "No successful transcodes detected"
     fi
+
+    local end_time=$(date +%s.%N)
+    local duration=$(echo "$end_time - $start_time" | bc 2>/dev/null || echo "0")
+    log_message "Completion check completed in ${duration}s"
 }
 
 # Main monitoring logic
 main() {
+    local script_start=$(date +%s.%N)
     rotate_log
-    log_message "Starting Tdarr timeout monitor check (last: $(date -d "@$LAST_CHECK"), current: $(date))"
+
+    log_message "=== Starting Tdarr timeout monitor check ==="
+    log_debug "Last check: $(date -d "@$LAST_CHECK" 2>/dev/null || echo "Invalid timestamp: $LAST_CHECK")"
+    log_debug "Current time: $(date)"
+    log_debug "Time difference: $((CURRENT_TIME - LAST_CHECK)) seconds"
+    log_debug "Server host: $SERVER_HOST"
+    log_debug "Node container: $NODE_CONTAINER"
+    log_debug "Log file: $LOG_FILE (size: $(stat -c%s "$LOG_FILE" 2>/dev/null || echo "0") bytes)"
 
     # Only proceed if more than 15 minutes (900 seconds) since last check
     if [[ $((CURRENT_TIME - LAST_CHECK)) -lt 900 ]]; then
-        log_message "Less than 15 minutes since last check, skipping"
+        log_warning "Less than 15 minutes since last check, skipping ($(($CURRENT_TIME - $LAST_CHECK))s elapsed)"
         exit 0
     fi
 
-    # Perform checks
-    check_server_timeouts
-    check_node_stalls
-    check_stuck_workdirs
+    # Perform checks with error handling
+    log_message "Beginning monitoring checks..."
+
+    if ! check_server_timeouts; then
+        log_error "Server timeout check failed"
+    fi
+
+    if ! check_node_stalls; then
+        log_error "Node stall check failed"
+    fi
+
+    if ! check_stuck_workdirs; then
+        log_error "Stuck work directory check failed"
+    fi
 
     # Optional: Check for successes (comment out if too noisy)
-    # check_completions
+    # if ! check_completions; then
+    #     log_error "Completion check failed"
+    # fi
 
     # Update timestamp
     echo "$CURRENT_TIME" > "$LAST_CHECK_FILE"
-    log_message "Monitor check completed"
+    if [[ $? -eq 0 ]]; then
+        log_debug "Updated timestamp file: $LAST_CHECK_FILE"
+    else
+        log_error "Failed to update timestamp file: $LAST_CHECK_FILE"
+    fi
+
+    local script_end=$(date +%s.%N)
+    local total_duration=$(echo "$script_end - $script_start" | bc 2>/dev/null || echo "0")
+    log_success "=== Monitor check completed in ${total_duration}s ==="
 }
 
 # Run main function
diff --git a/scripts/tdarr/start-tdarr-gpu-podman-clean.sh b/scripts/tdarr/start-tdarr-gpu-podman-clean.sh
index c9ad6fd..ca4960a 100755
--- a/scripts/tdarr/start-tdarr-gpu-podman-clean.sh
+++ b/scripts/tdarr/start-tdarr-gpu-podman-clean.sh
@@ -39,6 +39,7 @@ podman run -d --name "${CONTAINER_NAME}" \
     -e NVIDIA_DRIVER_CAPABILITIES=all \
     -e NVIDIA_VISIBLE_DEVICES=all \
     -v "/mnt/NV2/tdarr-cache:/cache" \
+    -v "/mnt/media:/app/unmappedNodeCache/nobara-pc-gpu-unmapped/media" \
     ghcr.io/haveagitgat/tdarr_node:latest
 
 echo "⏳ Waiting for container to initialize..."
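+
+# Optional sanity check (a sketch, not part of the original script): once the container
+# is up, confirm the new media mapping is visible inside it, e.g.:
+#   podman exec "${CONTAINER_NAME}" ls /app/unmappedNodeCache/nobara-pc-gpu-unmapped/media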