claude-home/scripts/monitoring/tdarr-timeout-monitor.sh
Cal Corum 26f5b82afa CLAUDE: Enhance operational scripts and add mobile SSH documentation
SSH Homelab Setup:
- Add mobile device SSH access documentation (Termius setup)
- Include prerequisites checklist and key transfer process
- Document network discovery commands for mobile access

Tdarr Timeout Monitor:
- Add comprehensive debug logging with structured levels (INFO/DEBUG/ERROR/WARN/SUCCESS)
- Implement command execution timing and detailed error tracking
- Enhance container status verification and error handling
- Add log entry counting and detailed output analysis
- Improve cleanup operations with better failure detection
- Add performance metrics and duration tracking for all operations

Tdarr Node Startup:
- Add unmapped node cache volume mapping for media access
- Complete production configuration for distributed transcoding

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-08-10 16:22:57 -05:00

452 lines
16 KiB
Bash
Executable File

#!/bin/bash
# Tdarr Enhanced Monitoring System v2.0
# Monitors Tdarr Server and Node for staging timeouts, worker stalls, and stuck work directories
# Features: Automatic cleanup, Discord notifications with markdown formatting, comprehensive logging
#
# RECENT IMPROVEMENTS (2025-08-10):
# - Added staging section timeout detection and automatic cleanup
# - Implemented structured Discord notifications with working user pings
# - Enhanced JSON handling with proper escaping for special characters
# - Added comprehensive logging with automatic rotation (1MB limit)
# - Fixed shell compatibility for Docker container execution
# - Separated markdown formatting from actionable alerts for proper Discord pings
#
# Runs every 20 minutes via cron: */20 * * * * /path/to/this/script
# Logs: /tmp/tdarr-monitor/monitor.log (auto-rotates at 1MB)
# Configuration
# Deployment-specific values can be overridden with TDARR_* environment
# variables; when a variable is unset or empty the original default is used.
# NOTE(review): the default webhook URL below is a credential committed to the
# repository. It should be rotated and then supplied only through
# TDARR_DISCORD_WEBHOOK instead of living in this file.
DISCORD_WEBHOOK="${TDARR_DISCORD_WEBHOOK:-https://discord.com/api/webhooks/1404105821549498398/y2Ud1RK9rzFjv58xbypUfQNe3jrL7ZUq1FkQHa4_dfOHm2ylp93z0f4tY0O8Z-vQgKhD}"
SERVER_HOST="${TDARR_SERVER_HOST:-tdarr}" # SSH alias for Tdarr server
NODE_CONTAINER="${TDARR_NODE_CONTAINER:-tdarr-node-gpu-unmapped}"
SCRIPT_DIR="${TDARR_MONITOR_DIR:-/tmp/tdarr-monitor}"
LAST_CHECK_FILE="$SCRIPT_DIR/last_check.timestamp"
LOG_FILE="$SCRIPT_DIR/monitor.log"
MAX_LOG_SIZE="1048576" # 1MB in bytes
# Function to send Discord notification
#
# Posts a message to the webhook in $DISCORD_WEBHOOK as a JSON "content" field.
# Arguments:
#   $1 - message text, possibly multi-line. Lines containing a user mention of
#        the form <@digits> are placed after the code block; every other line
#        is wrapped in a ```md fenced block.
# Returns: the exit status of curl.
send_discord_notification() {
  local message="$1"

  # Check if message contains a ping - extract it to send separately
  local ping_message=""
  local clean_message="$message"
  if [[ "$message" == *"<@"* ]]; then
    # Extract the line(s) with the ping
    ping_message=$(grep -o ".*<@[0-9]*>.*" <<<"$message")
    # Remove the ping line(s) from the main message
    clean_message=$(grep -v "<@[0-9]*>" <<<"$message")
  fi

  # Wrap main message in markdown code block
  local markdown_message="\`\`\`md
$clean_message
\`\`\`"

  # Add ping message after the markdown block if it exists
  if [[ -n "$ping_message" ]]; then
    markdown_message="$markdown_message
$ping_message"
  fi

  # Escape for JSON across the WHOLE message. Backslashes must be handled
  # first so the escapes added afterwards are not doubled.
  local escaped_message=$markdown_message
  escaped_message=${escaped_message//\\/\\\\}
  escaped_message=${escaped_message//\"/\\\"}
  escaped_message=${escaped_message//$'\t'/\\t}
  escaped_message=${escaped_message//$'\r'/\\r}
  escaped_message=${escaped_message//$'\n'/\\n}

  curl -H "Content-Type: application/json" \
    -X POST \
    -d "{\"content\": \"$escaped_message\"}" \
    "$DISCORD_WEBHOOK" 2>/dev/null
}
# Create script directory
# The log file and the last-check timestamp file both live under $SCRIPT_DIR,
# so it has to exist before anything is written. -p makes this a no-op when
# the directory is already there.
mkdir -p "$SCRIPT_DIR"
# Enhanced logging functions
#
# Every entry is appended to $LOG_FILE in the form
#   YYYY-MM-DD HH:MM:SS [LEVEL] - text
# The five public helpers differ only in the level tag, so they all delegate
# to one private writer.

# Append one entry to $LOG_FILE.
#   $1 - level tag (without brackets)
#   $2 - message text
_write_log_entry() {
  echo "$(date '+%Y-%m-%d %H:%M:%S') [$1] - $2" >> "$LOG_FILE"
}

# Informational entry.
log_message() {
  _write_log_entry "INFO" "$1"
}

# Diagnostic detail entry.
log_debug() {
  _write_log_entry "DEBUG" "$1"
}

# Failure entry.
log_error() {
  _write_log_entry "ERROR" "$1"
}

# Warning entry.
log_warning() {
  _write_log_entry "WARN" "$1"
}

# Success entry.
log_success() {
  _write_log_entry "SUCCESS" "$1"
}
# Log command execution with timing
#
# Runs a command line, records its duration and outcome in the log, and relays
# its output to the caller.
# Arguments:
#   $1 - command line to execute. It is passed to `eval`, so it must be built
#        only from trusted values, never from raw external input.
# Outputs: the command's combined stdout and stderr, on stdout.
# Returns: the exit status of the executed command.
log_command() {
  local cmd="$1"
  local start_time end_time duration result exit_code

  start_time=$(date +%s.%N)
  log_debug "Executing command: $cmd"

  # The variables are declared above on purpose: with `local result=$(...)`
  # $? would be the status of `local` (always 0) and failures would be lost.
  result=$(eval "$cmd" 2>&1)
  exit_code=$?

  end_time=$(date +%s.%N)
  # Falls back to "0" when bc is unavailable.
  duration=$(echo "$end_time - $start_time" | bc 2>/dev/null || echo "0")

  if [[ $exit_code -eq 0 ]]; then
    log_debug "Command completed in ${duration}s (exit: $exit_code)"
    if [[ -n "$result" ]]; then
      log_debug "Command output: $result"
    fi
  else
    log_error "Command failed in ${duration}s (exit: $exit_code): $result"
  fi

  # printf instead of echo so output such as "-n" is relayed verbatim.
  printf '%s\n' "$result"
  return "$exit_code"
}
# Rotate the log once it grows beyond $MAX_LOG_SIZE bytes.
# The current log is renamed to "$LOG_FILE.old" (replacing any previous one)
# and a fresh log is started with a rotation notice.
rotate_log() {
  # Nothing to rotate when the log does not exist yet.
  [[ -f "$LOG_FILE" ]] || return 0

  # Try the BSD stat syntax first, then the GNU one.
  local current_size
  current_size=$(stat -f%z "$LOG_FILE" 2>/dev/null || stat -c%s "$LOG_FILE" 2>/dev/null)

  if [[ $current_size -gt $MAX_LOG_SIZE ]]; then
    mv "$LOG_FILE" "$LOG_FILE.old"
    log_message "Log rotated due to size limit"
  fi
}
# Initialize timestamp file if it doesn't exist
if [[ ! -f "$LAST_CHECK_FILE" ]]; then
  date +%s > "$LAST_CHECK_FILE"
  # Plain assignment: this is top-level code, and `local` is only valid inside
  # a function (it fails here and would leave the message unset).
  startup_message="# 🎬 Tdarr Monitor
**Timeout monitoring started:**
- Checking every 20 minutes for staging timeouts
- Automatic cleanup of stuck work directories
- Discord notifications enabled
System monitoring active."
  send_discord_notification "$startup_message"
fi

CURRENT_TIME=$(date +%s)
LAST_CHECK=$(cat "$LAST_CHECK_FILE" 2>/dev/null)

# An empty or corrupt timestamp file would otherwise feed an invalid value to
# the arithmetic and `date -d "@..."` calls further down. Fall back to one
# 20-minute interval ago so a normal-sized log window is inspected.
if [[ ! "$LAST_CHECK" =~ ^[0-9]+$ ]]; then
  log_warning "Timestamp file missing or invalid; assuming last check was 20 minutes ago"
  LAST_CHECK=$((CURRENT_TIME - 1200))
fi
# Function to check server logs for limbo timeouts
#
# Fetches the logs of the tdarr-clean container written since $LAST_CHECK (via
# SSH to $SERVER_HOST), looks for "has been in limbo" entries and, when any
# /media/ path is found in them, sends one Discord notification listing up to
# three of the affected files.
# Globals: LAST_CHECK, SERVER_HOST (read)
# Returns: 1 when the logs could not be retrieved.
check_server_timeouts() {
  local start_time end_time duration
  start_time=$(date +%s.%N)
  log_message "Starting server timeout check"

  # Get server logs since last check (convert to docker logs format)
  local since_docker
  since_docker=$(date -d "@$LAST_CHECK" -u +%Y-%m-%dT%H:%M:%S.000000000Z)
  log_debug "Checking server logs since: $since_docker (timestamp: $LAST_CHECK)"

  # Execute SSH command with detailed logging. The assignment is kept apart
  # from `local` so the command's exit status is what gets tested.
  local ssh_cmd="ssh \"$SERVER_HOST\" \"docker logs --since='$since_docker' tdarr-clean 2>&1\""
  local server_logs
  if ! server_logs=$(log_command "$ssh_cmd"); then
    log_error "Failed to retrieve server logs via SSH"
    return 1
  fi
  log_debug "Retrieved $(wc -l <<<"$server_logs") lines of server logs"

  # Process logs for limbo timeouts. grep -c always prints a count (even 0),
  # so no "|| echo 0" fallback is added; that would yield two lines.
  local limbo_lines limbo_count
  limbo_lines=$(grep -i "has been in limbo" <<<"$server_logs")
  limbo_count=$(grep -ci "has been in limbo" <<<"$server_logs")
  log_debug "Found $limbo_count limbo timeout entries in logs"

  local timeouts
  timeouts=$(grep -o "/media/[^']*" <<<"$limbo_lines" | sed 's|/media/||')

  if [[ -n "$timeouts" ]]; then
    local count files file
    count=$(wc -l <<<"$timeouts")
    files=$(head -3 <<<"$timeouts") # Show first 3 files
    log_warning "Found $count file(s) timed out in staging section"

    # Log each timed out file for debugging
    while IFS= read -r file; do
      [[ -n "$file" ]] && log_debug "Timed out file: $file"
    done <<<"$timeouts"

    local message="# 🎬 Tdarr Monitor
**$count file(s) timed out in staging section:**"

    # Convert files to bullet points
    local file_list
    file_list=$(sed 's/^/- /' <<<"$files")
    message="$message
$file_list"
    if [[ $count -gt 3 ]]; then
      message="$message
- ... and $((count - 3)) more files"
    fi
    message="$message
Files were automatically removed from staging and will retry."

    send_discord_notification "$message"
    log_success "Sent Discord notification for $count timed out files"
  else
    log_debug "No files found timed out in staging section"
  fi

  end_time=$(date +%s.%N)
  duration=$(echo "$end_time - $start_time" | bc 2>/dev/null || echo "0")
  log_message "Server timeout check completed in ${duration}s"
}
# Function to check node logs for worker stalls
#
# Verifies that the $NODE_CONTAINER podman container is listed as running,
# reads its logs since $LAST_CHECK and sends one Discord notification when
# "worker ... stalled" or "worker ... disconnected" entries are present.
# Globals: NODE_CONTAINER, LAST_CHECK (read)
# Returns: 1 when the container is not running or its logs cannot be read.
check_node_stalls() {
  local start_time end_time duration
  start_time=$(date +%s.%N)
  log_message "Starting node stall check"

  # Check if node container is running
  local container_status
  container_status=$(podman ps --format "{{.Status}}" --filter "name=$NODE_CONTAINER" 2>/dev/null)
  if [[ -z "$container_status" ]]; then
    log_warning "Node container '$NODE_CONTAINER' not found or not running"
    return 1
  fi
  log_debug "Node container status: $container_status"

  # Get node logs since last check. The assignment is kept apart from `local`
  # so the command's exit status is what gets tested.
  local node_cmd="podman logs --since=\"@$LAST_CHECK\" \"$NODE_CONTAINER\" 2>&1"
  local node_logs
  if ! node_logs=$(log_command "$node_cmd"); then
    log_error "Failed to retrieve node logs"
    return 1
  fi
  log_debug "Retrieved $(wc -l <<<"$node_logs") lines of node logs"

  # Process logs for worker stalls. The count uses the same case-insensitive
  # pattern as the filter; grep -c always prints a number, so no fallback.
  local stall_pattern="worker.*stalled\|worker.*disconnected"
  local stalls stall_count
  stalls=$(grep -i "$stall_pattern" <<<"$node_logs")
  stall_count=$(grep -ci "$stall_pattern" <<<"$node_logs")
  log_debug "Found $stall_count potential worker stall entries"

  if [[ -n "$stalls" ]]; then
    local count workers stall_line
    count=$(wc -l <<<"$stalls")
    workers=$(grep -o "Worker [^ ]*" <<<"$stalls" | sort -u | head -3)
    log_warning "Found $count worker stall(s)"

    # Log each stalled worker for debugging
    while IFS= read -r stall_line; do
      [[ -n "$stall_line" ]] && log_debug "Worker stall: $stall_line"
    done <<<"$stalls"

    local message="# 🎬 Tdarr Monitor
**$count worker stall(s) detected:**"

    # Convert workers to bullet points
    local worker_list
    worker_list=$(sed 's/^/- /' <<<"$workers")
    message="$message
$worker_list
Workers were automatically cancelled and will restart."

    send_discord_notification "$message"
    log_success "Sent Discord notification for $count worker stalls"
  else
    log_debug "No worker stalls detected"
  fi

  end_time=$(date +%s.%N)
  duration=$(echo "$end_time - $start_time" | bc 2>/dev/null || echo "0")
  log_message "Node stall check completed in ${duration}s"
}
# Function to check and clean stuck work directories
#
# Scans the last 30 minutes of tdarr-clean container logs (via SSH to
# $SERVER_HOST) for ENOTEMPTY errors that mention a tdarr-workDir path, tries
# to empty and remove each such directory under /temp inside the container,
# and reports the outcome to Discord.
# Globals: SERVER_HOST (read)
# Returns: 1 when the logs could not be retrieved.
check_stuck_workdirs() {
  local start_time end_time duration
  start_time=$(date +%s.%N)
  log_message "Starting stuck work directory check"

  # Find work directories that are failing to be cleaned up. The assignment
  # is kept apart from `local` so the command's exit status is what is tested.
  local workdir_cmd="ssh \"$SERVER_HOST\" \"docker logs --since='30m' tdarr-clean 2>&1\""
  local workdir_logs
  if ! workdir_logs=$(log_command "$workdir_cmd"); then
    log_error "Failed to retrieve work directory logs from server"
    return 1
  fi

  # grep -c always prints a count (even 0), so no fallback is needed.
  local enotempty_lines enotempty_count
  enotempty_lines=$(grep "ENOTEMPTY.*tdarr-workDir" <<<"$workdir_logs")
  enotempty_count=$(grep -c "ENOTEMPTY.*tdarr-workDir" <<<"$workdir_logs")
  log_debug "Found $enotempty_count ENOTEMPTY errors for work directories"

  local stuck_dirs
  stuck_dirs=$(grep -o "tdarr-workDir[^']*" <<<"$enotempty_lines" | sort -u)

  if [[ -n "$stuck_dirs" ]]; then
    local count cleaned=0
    count=$(wc -l <<<"$stuck_dirs")
    log_warning "Found $count stuck work directories to clean"

    # Directory names come from log text and end up inside a remote shell
    # command, so only plain path characters are accepted and ".." is refused.
    local safe_name_re='^[A-Za-z0-9._/-]+$'
    local dir remote_script remote_cmd cleanup_cmd cleanup_result cleanup_exit

    # The list is read on fd 3 in the current shell (not in a pipeline
    # subshell) so that $cleaned survives the loop, and so that ssh cannot
    # consume the remaining directory names from stdin.
    while IFS= read -r dir <&3; do
      [[ -n "$dir" ]] || continue
      if [[ ! "$dir" =~ $safe_name_re || "$dir" == *..* ]]; then
        log_error "Skipping directory with unsafe name: $dir"
        continue
      fi
      log_debug "Attempting to clean stuck directory: $dir"

      # Force cleanup of stuck directory with detailed logging
      remote_script="
if [ -d \"/temp/$dir\" ]; then
echo \"Cleaning /temp/$dir\"
find \"/temp/$dir\" -type f -name \"*.tmp\" -delete 2>/dev/null
find \"/temp/$dir\" -type f -delete 2>/dev/null
find \"/temp/$dir\" -name \".*\" -delete 2>/dev/null
rmdir \"/temp/$dir\" 2>/dev/null && echo \"Successfully removed $dir\"
else
echo \"Directory /temp/$dir not found\"
fi
"
      # The script holds no single quotes (guaranteed by the name check), so
      # it can be single-quoted for the remote shell. %q then quotes both ssh
      # arguments for the local eval in log_command, keeping "*.tmp" and ".*"
      # from being globbed on this machine.
      remote_cmd="docker exec tdarr-clean sh -c '$remote_script'"
      printf -v cleanup_cmd 'ssh %q %q' "$SERVER_HOST" "$remote_cmd"

      cleanup_result=$(log_command "$cleanup_cmd" </dev/null)
      cleanup_exit=$?

      if [[ $cleanup_exit -eq 0 ]] && [[ "$cleanup_result" == *"Successfully removed"* ]]; then
        cleaned=$((cleaned + 1))
        log_success "Successfully cleaned directory: $dir"
      else
        log_error "Failed to clean directory: $dir (exit: $cleanup_exit, output: $cleanup_result)"
      fi
    done 3<<<"$stuck_dirs"

    local message
    if [[ $cleaned -gt 0 ]]; then
      log_success "Successfully cleaned $cleaned of $count stuck work directories"
      message="# 🎬 Tdarr Monitor
**Successfully cleaned $cleaned stuck work directories:**
- Removed partial download files (.tmp)
- Cleared blocking staging section cleanup
System maintenance completed automatically."
      send_discord_notification "$message"
    else
      log_error "Failed to clean any of the $count stuck work directories"
      local dir_list
      dir_list=$(sed 's/^/- /' <<<"$stuck_dirs")
      message="# 🎬 Tdarr Monitor
**$count stuck work directories detected:**
$dir_list
Cleanup failed - manual intervention may be needed <@258104532423147520>."
      send_discord_notification "$message"
    fi
  else
    log_debug "No stuck work directories detected"
  fi

  end_time=$(date +%s.%N)
  duration=$(echo "$end_time - $start_time" | bc 2>/dev/null || echo "0")
  log_message "Stuck work directory check completed in ${duration}s"
}
# Function to check for successful completions
#
# Reads the tdarr-clean container logs written since $LAST_CHECK (via SSH to
# $SERVER_HOST) and sends one Discord notification when entries matching
# "transcode ... success" or "transcode ... complete" are present.
# Globals: LAST_CHECK, SERVER_HOST (read)
# Returns: 1 when the logs could not be retrieved.
check_completions() {
  local start_time end_time duration
  start_time=$(date +%s.%N)
  log_message "Starting completion check"

  # Check server logs for successful transcodes
  local since_docker
  since_docker=$(date -d "@$LAST_CHECK" -u +%Y-%m-%dT%H:%M:%S.000000000Z)
  log_debug "Checking for successful transcodes since: $since_docker"

  # The assignment is kept apart from `local` so the command's exit status is
  # what gets tested.
  local completion_cmd="ssh \"$SERVER_HOST\" \"docker logs --since='$since_docker' tdarr-clean 2>&1\""
  local completion_logs
  if ! completion_logs=$(log_command "$completion_cmd"); then
    log_error "Failed to retrieve completion logs from server"
    return 1
  fi

  # Count with grep -c: piping an empty match list through `wc -l` reports 1,
  # which would announce a completed transcode when there was none.
  local success_pattern="transcode.*success\|transcode.*complete"
  local success_lines successes
  success_lines=$(grep -i "$success_pattern" <<<"$completion_logs")
  successes=$(grep -ci "$success_pattern" <<<"$completion_logs")
  log_debug "Found $successes successful transcode completion entries"

  if [[ $successes -gt 0 ]]; then
    log_success "Detected $successes successful transcodes"

    # Log sample success entries for debugging
    local success_line
    while IFS= read -r success_line; do
      [[ -n "$success_line" ]] && log_debug "Success entry: $success_line"
    done < <(head -3 <<<"$success_lines")

    local message="# 🎬 Tdarr Monitor
**$successes transcode(s) completed successfully:**
- Processing completed without errors
- Files ready for use
System operating normally."
    send_discord_notification "$message"
    log_success "Sent Discord notification for $successes completions"
  else
    log_debug "No successful transcodes detected"
  fi

  end_time=$(date +%s.%N)
  duration=$(echo "$end_time - $start_time" | bc 2>/dev/null || echo "0")
  log_message "Completion check completed in ${duration}s"
}
# Main monitoring logic
#
# One monitoring pass: rotate the log, record the run context, then run the
# server-timeout, node-stall and stuck-workdir checks unless fewer than 900
# seconds have passed since $LAST_CHECK, in which case the script exits 0.
# Afterwards $CURRENT_TIME is stored in $LAST_CHECK_FILE.
main() {
  local run_started
  run_started=$(date +%s.%N)

  rotate_log
  log_message "=== Starting Tdarr timeout monitor check ==="
  log_debug "Last check: $(date -d "@$LAST_CHECK" 2>/dev/null || echo "Invalid timestamp: $LAST_CHECK")"
  log_debug "Current time: $(date)"

  local elapsed=$((CURRENT_TIME - LAST_CHECK))
  log_debug "Time difference: $elapsed seconds"
  log_debug "Server host: $SERVER_HOST"
  log_debug "Node container: $NODE_CONTAINER"
  log_debug "Log file: $LOG_FILE (size: $(stat -c%s "$LOG_FILE" 2>/dev/null || echo "0") bytes)"

  # Only proceed if more than 15 minutes (900 seconds) since last check
  if ((elapsed < 900)); then
    log_warning "Less than 15 minutes since last check, skipping (${elapsed}s elapsed)"
    exit 0
  fi

  # Perform checks with error handling; a failing check is logged and the
  # remaining checks still run.
  log_message "Beginning monitoring checks..."
  check_server_timeouts || log_error "Server timeout check failed"
  check_node_stalls || log_error "Node stall check failed"
  check_stuck_workdirs || log_error "Stuck work directory check failed"

  # Optional: Check for successes (comment out if too noisy)
  # check_completions || log_error "Completion check failed"

  # Update timestamp
  if echo "$CURRENT_TIME" > "$LAST_CHECK_FILE"; then
    log_debug "Updated timestamp file: $LAST_CHECK_FILE"
  else
    log_error "Failed to update timestamp file: $LAST_CHECK_FILE"
  fi

  local run_finished total_duration
  run_finished=$(date +%s.%N)
  total_duration=$(echo "$run_finished - $run_started" | bc 2>/dev/null || echo "0")
  log_success "=== Monitor check completed in ${total_duration}s ==="
}
# Run main function
# The script's command-line arguments are forwarded unchanged; main does not
# currently read any of them.
main "$@"