claude-home/scripts/monitoring/tdarr-timeout-monitor.sh

#!/bin/bash

# Tdarr Enhanced Monitoring System v2.0
# Monitors Tdarr Server and Node for staging timeouts, worker stalls, and stuck work directories
# Features: Automatic cleanup, Discord notifications with markdown formatting, comprehensive logging
#
# RECENT IMPROVEMENTS (2025-08-10):
# - Added staging section timeout detection and automatic cleanup
# - Implemented structured Discord notifications with working user pings
# - Enhanced JSON handling with proper escaping for special characters
# - Added comprehensive logging with automatic rotation (1MB limit)
# - Fixed shell compatibility for Docker container execution
# - Separated markdown formatting from actionable alerts for proper Discord pings
#
# Runs every 20 minutes via cron: */20 * * * * /path/to/this/script
# Logs: /tmp/tdarr-monitor/monitor.log (auto-rotates at 1MB)

# Configuration
# DISCORD_WEBHOOK, SERVER_HOST and NODE_CONTAINER may be overridden from the
# environment; when unset (or empty) the built-in defaults below are used.
# NOTE(review): the default webhook URL embeds its secret token in this file.
# Prefer supplying DISCORD_WEBHOOK from the environment and rotating the
# token that is stored here.
DISCORD_WEBHOOK="${DISCORD_WEBHOOK:-https://discord.com/api/webhooks/1404105821549498398/y2Ud1RK9rzFjv58xbypUfQNe3jrL7ZUq1FkQHa4_dfOHm2ylp93z0f4tY0O8Z-vQgKhD}"
SERVER_HOST="${SERVER_HOST:-tdarr}"  # SSH alias for Tdarr server
NODE_CONTAINER="${NODE_CONTAINER:-tdarr-node-gpu-unmapped}"
SCRIPT_DIR="/tmp/tdarr-monitor"
LAST_CHECK_FILE="$SCRIPT_DIR/last_check.timestamp"
LOG_FILE="$SCRIPT_DIR/monitor.log"
MAX_LOG_SIZE="1048576"  # 1MB in bytes

# Function to send Discord notification
send_discord_notification() {
    local message="$1"
    local color="15158332"  # Red color for alerts

    if [[ "$message" == *"success"* ]] || [[ "$message" == *"started"* ]]; then
        color="3066993"  # Green color for success
    fi

    # Check if message contains a ping - extract it to send separately
    local ping_message=""
    local clean_message="$message"

    if [[ "$message" == *"<@"* ]]; then
        # Extract the line with the ping
        ping_message=$(echo "$message" | grep -o ".*<@[0-9]*>.*")
        # Remove the ping line from the main message
        clean_message=$(echo "$message" | grep -v "<@[0-9]*>")
    fi

    # Wrap main message in markdown code block
    local markdown_message="\`\`\`md
$clean_message
\`\`\`"

    # Add ping message after the markdown block if it exists
    if [[ -n "$ping_message" ]]; then
        markdown_message="$markdown_message
$ping_message"
    fi

    # Properly escape for JSON: backslashes, quotes, and newlines
    local escaped_message=$(echo "$markdown_message" | sed 's/\\/\\\\/g; s/"/\\"/g; :a;N;$!ba;s/\n/\\n/g')

    curl -H "Content-Type: application/json" \
         -X POST \
         -d "{\"content\": \"$escaped_message\"}" \
         "$DISCORD_WEBHOOK" 2>/dev/null
}

# Create script directory
# The log file and the last-check timestamp both live in it, so stop with a
# clear error instead of carrying on when it cannot be created.
if ! mkdir -p "$SCRIPT_DIR"; then
    printf 'tdarr-monitor: cannot create directory %s\n' "$SCRIPT_DIR" >&2
    exit 1
fi

# Logging functions
# Append one line of the form "YYYY-MM-DD HH:MM:SS - <text>" to $LOG_FILE.
# Globals:   LOG_FILE (read)
# Arguments: $1 - text to record
log_message() {
    local stamp
    stamp=$(date '+%Y-%m-%d %H:%M:%S')
    printf '%s - %s\n' "$stamp" "$1" >> "$LOG_FILE"
}

# Move $LOG_FILE to "$LOG_FILE.old" once it is larger than $MAX_LOG_SIZE
# bytes, then record the rotation in the fresh log.
# Globals: LOG_FILE (read), MAX_LOG_SIZE (read)
rotate_log() {
    # Nothing to rotate until a log file exists.
    [[ -f "$LOG_FILE" ]] || return 0

    # Try the BSD form of stat first, then the GNU form; whichever the local
    # stat understands prints the size in bytes.
    local log_bytes
    log_bytes=$(stat -f%z "$LOG_FILE" 2>/dev/null || stat -c%s "$LOG_FILE" 2>/dev/null)

    [[ "$log_bytes" -gt "$MAX_LOG_SIZE" ]] || return 0

    mv "$LOG_FILE" "$LOG_FILE.old"
    log_message "Log rotated due to size limit"
}

# Initialize timestamp file if it doesn't exist
if [[ ! -f "$LAST_CHECK_FILE" ]]; then
    date +%s > "$LAST_CHECK_FILE"
    # This runs at top level, where `local` is an error and would leave the
    # variable unassigned, so a plain assignment is used.
    startup_message="# 🎬 Tdarr Monitor
**Timeout monitoring started:**
- Checking every 20 minutes for staging timeouts
- Automatic cleanup of stuck work directories
- Discord notifications enabled

System monitoring active."
    send_discord_notification "$startup_message"
fi

LAST_CHECK=$(cat "$LAST_CHECK_FILE")
CURRENT_TIME=$(date +%s)

# An empty or corrupt timestamp file would break the arithmetic and the
# `date -d "@..."` conversions later on. Fall back to one 20-minute cron
# interval ago so this run still performs its checks and rewrites the file.
if [[ ! "$LAST_CHECK" =~ ^[0-9]+$ ]]; then
    LAST_CHECK=$((CURRENT_TIME - 1200))
fi

# Function to check server logs for limbo timeouts
#
# Reads the tdarr-clean container logs on $SERVER_HOST since $LAST_CHECK,
# picks out the /media/ paths from "has been in limbo" lines and, when any
# are found, sends a Discord summary listing at most three of them.
# Globals: SERVER_HOST (read), LAST_CHECK (read)
check_server_timeouts() {
    log_message "Checking server logs for limbo timeouts"

    # docker logs wants a timestamp, so convert the stored epoch seconds.
    local since_ts limbo_files
    since_ts=$(date -d "@$LAST_CHECK" -u +%Y-%m-%dT%H:%M:%S.000000000Z)
    limbo_files=$(ssh "$SERVER_HOST" "docker logs --since='$since_ts' tdarr-clean 2>&1" \
        | grep -i "has been in limbo" \
        | grep -o "/media/[^']*" \
        | sed 's|/media/||')

    # No limbo lines: nothing to report.
    [[ -n "$limbo_files" ]] || return 0

    local total preview bullets report
    total=$(wc -l <<< "$limbo_files")
    preview=$(head -3 <<< "$limbo_files")  # only the first 3 files are listed
    log_message "Found $total file(s) timed out in staging section"

    bullets=$(sed 's/^/- /' <<< "$preview")
    printf -v report '%s\n%s\n%s' \
        "# 🎬 Tdarr Monitor" \
        "**$total file(s) timed out in staging section:**" \
        "$bullets"

    if (( total > 3 )); then
        report+=$'\n'"- ... and $(( total - 3 )) more files"
    fi

    report+=$'\n\n'"Files were automatically removed from staging and will retry."

    send_discord_notification "$report"
}

# Function to check node logs for worker stalls
#
# Reads the logs of the podman container $NODE_CONTAINER since $LAST_CHECK
# and, when lines matching "worker ... stalled" or "worker ... disconnected"
# (case-insensitive) are present, sends a Discord summary naming up to three
# distinct "Worker <id>" tokens.
# Globals: NODE_CONTAINER (read), LAST_CHECK (read)
check_node_stalls() {
    log_message "Checking node logs for worker stalls"

    # Pass --since as an RFC 3339 timestamp, the same format the server-side
    # checks use. The "@<epoch>" form is date(1) syntax and is not among the
    # formats documented for `podman logs --since`.
    local since_ts stalls
    since_ts=$(date -d "@$LAST_CHECK" -u +%Y-%m-%dT%H:%M:%S.000000000Z)
    stalls=$(podman logs --since="$since_ts" "$NODE_CONTAINER" 2>&1 | \
             grep -i "worker.*stalled\|worker.*disconnected")

    # No stall lines: nothing to report.
    [[ -n "$stalls" ]] || return 0

    local count workers
    count=$(printf '%s\n' "$stalls" | wc -l)
    workers=$(printf '%s\n' "$stalls" | grep -o "Worker [^ ]*" | sort -u | head -3)
    log_message "Found $count worker stall(s)"

    local message="# 🎬 Tdarr Monitor
**$count worker stall(s) detected:**"

    # Convert workers to bullet points; skip the list entirely when no
    # "Worker <id>" token could be extracted, rather than emitting an empty
    # "- " bullet.
    if [[ -n "$workers" ]]; then
        local worker_list
        worker_list=$(printf '%s\n' "$workers" | sed 's/^/- /')
        message+=$'\n'"$worker_list"
    fi

    message+=$'\n\n'"Workers were automatically cancelled and will restart."

    send_discord_notification "$message"
}

# Function to check and clean stuck work directories
#
# Scans the last 30 minutes of tdarr-clean logs on $SERVER_HOST for ENOTEMPTY
# errors that mention a tdarr-workDir* directory, tries to empty and remove
# each such directory under /temp inside the container, and reports the
# outcome to Discord: a success summary when at least one directory was
# cleaned, otherwise a failure notice that pings a user.
# Globals: SERVER_HOST (read)
check_stuck_workdirs() {
    log_message "Checking for stuck work directories"

    # Find work directories that are failing to be cleaned up
    local stuck_dirs
    stuck_dirs=$(ssh "$SERVER_HOST" "docker logs --since='30m' tdarr-clean 2>&1" | \
                 grep "ENOTEMPTY.*tdarr-workDir" | \
                 grep -o "tdarr-workDir[^']*" | \
                 sort -u)

    [[ -n "$stuck_dirs" ]] || return 0

    local count cleaned=0 dir
    count=$(printf '%s\n' "$stuck_dirs" | wc -l)

    # Directory names come from log text and are pasted into a remote shell
    # command, so only plain names (no slashes, quotes, spaces or shell
    # metacharacters) are accepted.
    local -r safe_name_re='^tdarr-workDir[A-Za-z0-9._-]*$'

    # The loop reads from a here-string rather than a pipe so that `cleaned`
    # is updated in this shell and is still set after the loop.
    while IFS= read -r dir; do
        [[ -n "$dir" ]] || continue

        if [[ ! "$dir" =~ $safe_name_re ]]; then
            log_message "Skipping stuck directory with unexpected name: $dir"
            continue
        fi

        log_message "Attempting to clean stuck directory: $dir"

        # Force cleanup of stuck directory.
        # stdin is redirected from /dev/null because ssh would otherwise
        # read the rest of the directory list that feeds this loop.
        if ssh "$SERVER_HOST" "docker exec tdarr-clean sh -c '
                    if [ -d \"/temp/$dir\" ]; then
                        echo \"Cleaning /temp/$dir\"
                        find \"/temp/$dir\" -type f -name \"*.tmp\" -delete 2>/dev/null
                        find \"/temp/$dir\" -type f -delete 2>/dev/null
                        find \"/temp/$dir\" -name \".*\" -delete 2>/dev/null
                        rmdir \"/temp/$dir\" 2>/dev/null && echo \"Successfully removed $dir\"
                    fi
                '" < /dev/null; then
            cleaned=$((cleaned + 1))
        else
            log_message "Failed to clean stuck directory: $dir"
        fi
    done <<< "$stuck_dirs"

    local message
    if [[ $cleaned -gt 0 ]]; then
        log_message "Successfully cleaned $cleaned stuck work directories"
        message="# 🎬 Tdarr Monitor
**Successfully cleaned $cleaned stuck work directories:**
- Removed partial download files (.tmp)
- Cleared blocking staging section cleanup

System maintenance completed automatically."
        send_discord_notification "$message"
    else
        log_message "Failed to clean $count stuck work directories"
        local dir_list
        dir_list=$(printf '%s\n' "$stuck_dirs" | sed 's/^/- /')
        message="# 🎬 Tdarr Monitor
**$count stuck work directories detected:**
$dir_list

Cleanup failed - manual intervention may be needed <@258104532423147520>."
        send_discord_notification "$message"
    fi
}

# Function to check for successful completions
#
# Counts tdarr-clean log lines on $SERVER_HOST since $LAST_CHECK that match
# "transcode ... success" or "transcode ... complete" (case-insensitive) and
# sends a Discord summary when the count is above zero.
# Globals: SERVER_HOST (read), LAST_CHECK (read)
check_completions() {
    log_message "Checking for successful transcodes"

    # docker logs wants a timestamp, so convert the stored epoch seconds.
    local since_ts done_count
    since_ts=$(date -d "@$LAST_CHECK" -u +%Y-%m-%dT%H:%M:%S.000000000Z)
    done_count=$(ssh "$SERVER_HOST" "docker logs --since='$since_ts' tdarr-clean 2>&1" \
        | grep -i "transcode.*success\|transcode.*complete" \
        | wc -l)

    # Stay quiet when nothing completed.
    (( done_count > 0 )) || return 0

    local report
    printf -v report '%s\n%s\n%s\n%s\n\n%s' \
        "# 🎬 Tdarr Monitor" \
        "**$done_count transcode(s) completed successfully:**" \
        "- Processing completed without errors" \
        "- Files ready for use" \
        "System operating normally."
    send_discord_notification "$report"
}

# Main monitoring logic
#
# Rotates the log, then runs the server-timeout, node-stall and stuck-workdir
# checks, provided at least 900 seconds have passed since $LAST_CHECK.
# Otherwise it logs the skip and exits the script with status 0. After the
# checks it stores $CURRENT_TIME in $LAST_CHECK_FILE.
# Globals: LAST_CHECK (read), CURRENT_TIME (read), LAST_CHECK_FILE (written)
main() {
    rotate_log
    log_message "Starting Tdarr timeout monitor check (last: $(date -d "@$LAST_CHECK"), current: $(date))"

    # Only proceed if more than 15 minutes (900 seconds) since last check
    local elapsed=$(( CURRENT_TIME - LAST_CHECK ))
    if (( elapsed < 900 )); then
        log_message "Less than 15 minutes since last check, skipping"
        exit 0
    fi

    # Perform checks, in this order.
    # Optional: add check_completions to the list to also report successes
    # (left out because it can be too noisy).
    local check
    for check in check_server_timeouts check_node_stalls check_stuck_workdirs; do
        "$check"
    done

    # Update timestamp
    printf '%s\n' "$CURRENT_TIME" > "$LAST_CHECK_FILE"
    log_message "Monitor check completed"
}

# Run main function
# The script's command-line arguments are forwarded to main, which does not
# currently read them.
main "$@"