CLAUDE: Enhance operational scripts and add mobile SSH documentation
SSH Homelab Setup:
- Add mobile device SSH access documentation (Termius setup)
- Include prerequisites checklist and key transfer process
- Document network discovery commands for mobile access

Tdarr Timeout Monitor:
- Add comprehensive debug logging with structured levels (INFO/DEBUG/ERROR/WARN/SUCCESS)
- Implement command execution timing and detailed error tracking
- Enhance container status verification and error handling
- Add log entry counting and detailed output analysis
- Improve cleanup operations with better failure detection
- Add performance metrics and duration tracking for all operations

Tdarr Node Startup:
- Add unmapped node cache volume mapping for media access
- Complete production configuration for distributed transcoding

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
parent
bd49e9d61d
commit
26f5b82afa
@ -318,6 +318,50 @@ AllowUsers cal # Restrict SSH access
|
|||||||
sudo systemctl restart sshd
|
sudo systemctl restart sshd
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## Mobile Device SSH Access
|
||||||
|
|
||||||
|
### Termius Setup (iPhone/iPad)
|
||||||
|
Connect to local development machine using existing homelab keys:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
Connection Details:
|
||||||
|
Host: 10.0.0.206 (or current eno1 IP)
|
||||||
|
Username: cal
|
||||||
|
Port: 22
|
||||||
|
Key: homelab_rsa (same key used for server access)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Prerequisites Checklist
|
||||||
|
Before mobile SSH access works, ensure:
|
||||||
|
|
||||||
|
1. **SSH service is running:**
|
||||||
|
```bash
|
||||||
|
sudo systemctl start sshd
|
||||||
|
sudo systemctl enable sshd
|
||||||
|
```
|
||||||
|
|
||||||
|
2. **Firewall allows SSH:**
|
||||||
|
```bash
|
||||||
|
sudo firewall-cmd --list-services # Should include 'ssh'
|
||||||
|
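# In this setup 'ssh' was already allowed. If it is missing from the list, add it:
# sudo firewall-cmd --permanent --add-service=ssh && sudo firewall-cmd --reload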
|
||||||
|
```
|
||||||
|
|
||||||
|
3. **Authorized keys configured:**
|
||||||
|
```bash
|
||||||
|
# homelab_rsa.pub should be in ~/.ssh/authorized_keys
|
||||||
|
cat ~/.ssh/homelab_rsa.pub >> ~/.ssh/authorized_keys
|
||||||
|
chmod 600 ~/.ssh/authorized_keys
|
||||||
|
```
|
||||||
|
|
||||||
|
### Key Transfer Process
|
||||||
|
1. Copy private key: `cp ~/.ssh/homelab_rsa ./homelab_rsa_for_mobile`
|
||||||
|
2. Transfer securely to mobile device (AirDrop, secure file share)
|
||||||
|
3. Import into Termius app
|
||||||
|
4. Clean up: `rm homelab_rsa_for_mobile`
|
||||||
|
|
||||||
|
### Network Discovery
|
||||||
|
Find local machine IP: `ip addr show eno1 | grep 'inet ' | awk '{print $2}' | cut -d'/' -f1`
|
||||||
|
|
||||||
## Related Documentation
|
## Related Documentation
|
||||||
|
|
||||||
- Patterns: `patterns/networking/ssh-key-management.md`
|
- Patterns: `patterns/networking/ssh-key-management.md`
|
||||||
|
|||||||
@ -67,9 +67,49 @@ $ping_message"
|
|||||||
# Create script directory
|
# Create script directory
|
||||||
mkdir -p "$SCRIPT_DIR"
|
mkdir -p "$SCRIPT_DIR"
|
||||||
|
|
||||||
# Logging functions
|
# Enhanced logging functions
|
||||||
# Append an INFO-level entry to the monitor log.
#
# Globals:   LOG_FILE (read) - path of the log file that is appended to
# Arguments: $1 - message text
# Outputs:   one line "<YYYY-MM-DD HH:MM:SS> [INFO] - <message>" in LOG_FILE
log_message() {
    # printf keeps the message verbatim regardless of echo flags/escapes
    printf '%s [INFO] - %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "${1-}" >> "$LOG_FILE"
}
|
||||||
|
|
||||||
|
# Append a DEBUG-level entry to the monitor log.
#
# Globals:   LOG_FILE (read) - path of the log file that is appended to
# Arguments: $1 - message text
# Outputs:   one line "<YYYY-MM-DD HH:MM:SS> [DEBUG] - <message>" in LOG_FILE
log_debug() {
    # printf keeps the message verbatim regardless of echo flags/escapes
    printf '%s [DEBUG] - %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "${1-}" >> "$LOG_FILE"
}
|
||||||
|
|
||||||
|
# Append an ERROR-level entry to the monitor log.
#
# Globals:   LOG_FILE (read) - path of the log file that is appended to
# Arguments: $1 - message text
# Outputs:   one line "<YYYY-MM-DD HH:MM:SS> [ERROR] - <message>" in LOG_FILE
log_error() {
    # printf keeps the message verbatim regardless of echo flags/escapes
    printf '%s [ERROR] - %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "${1-}" >> "$LOG_FILE"
}
|
||||||
|
|
||||||
|
# Append a WARN-level entry to the monitor log.
#
# Globals:   LOG_FILE (read) - path of the log file that is appended to
# Arguments: $1 - message text
# Outputs:   one line "<YYYY-MM-DD HH:MM:SS> [WARN] - <message>" in LOG_FILE
log_warning() {
    # printf keeps the message verbatim regardless of echo flags/escapes
    printf '%s [WARN] - %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "${1-}" >> "$LOG_FILE"
}
|
||||||
|
|
||||||
|
# Append a SUCCESS-level entry to the monitor log.
#
# Globals:   LOG_FILE (read) - path of the log file that is appended to
# Arguments: $1 - message text
# Outputs:   one line "<YYYY-MM-DD HH:MM:SS> [SUCCESS] - <message>" in LOG_FILE
log_success() {
    # printf keeps the message verbatim regardless of echo flags/escapes
    printf '%s [SUCCESS] - %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "${1-}" >> "$LOG_FILE"
}
|
||||||
|
|
||||||
|
# Log command execution with timing
|
||||||
|
# Run a command string, log how long it took and how it ended, and pass its
# output through to the caller.
#
# Arguments: $1 - command line to execute (run through eval)
# Outputs:   the command's combined stdout/stderr on stdout;
#            timing/outcome entries via log_debug and log_error
# Returns:   the exit status of the executed command
log_command() {
    local cmd="$1"
    local start_time end_time duration result
    local exit_code=0

    start_time=$(date +%s.%N)
    log_debug "Executing command: $cmd"

    # Assign separately from 'local': 'local var=$(...)' yields the status of
    # 'local' itself (always 0) and would hide the command's real exit code.
    # The '|| exit_code=$?' form also keeps a failure from tripping 'set -e'.
    # NOTE(review): eval re-parses $cmd as shell code; only pass command
    # strings assembled from trusted values.
    result=$(eval "$cmd" 2>&1) || exit_code=$?

    end_time=$(date +%s.%N)
    # Report 0 when bc is unavailable or cannot compute the difference
    duration=$(echo "$end_time - $start_time" | bc 2>/dev/null || echo "0")

    if [[ $exit_code -eq 0 ]]; then
        log_debug "Command completed in ${duration}s (exit: $exit_code)"
        if [[ -n "$result" ]]; then
            log_debug "Command output: $result"
        fi
    else
        log_error "Command failed in ${duration}s (exit: $exit_code): $result"
    fi

    # Hand the captured output back so callers can use $(log_command ...)
    printf '%s\n' "$result"
    return "$exit_code"
}
|
||||||
|
|
||||||
rotate_log() {
|
rotate_log() {
|
||||||
@ -97,20 +137,43 @@ CURRENT_TIME=$(date +%s)
|
|||||||
|
|
||||||
# Function to check server logs for limbo timeouts
|
# Function to check server logs for limbo timeouts
|
||||||
check_server_timeouts() {
|
check_server_timeouts() {
|
||||||
log_message "Checking server logs for limbo timeouts"
|
local start_time=$(date +%s.%N)
|
||||||
|
log_message "Starting server timeout check"
|
||||||
|
|
||||||
# Get server logs since last check (convert to docker logs format)
|
# Get server logs since last check (convert to docker logs format)
|
||||||
local since_docker=$(date -d "@$LAST_CHECK" -u +%Y-%m-%dT%H:%M:%S.000000000Z)
|
local since_docker=$(date -d "@$LAST_CHECK" -u +%Y-%m-%dT%H:%M:%S.000000000Z)
|
||||||
|
log_debug "Checking server logs since: $since_docker (timestamp: $LAST_CHECK)"
|
||||||
|
|
||||||
local timeouts=$(ssh "$SERVER_HOST" "docker logs --since='$since_docker' tdarr-clean 2>&1" | \
|
# Execute SSH command with detailed logging
|
||||||
grep -i "has been in limbo" | \
|
local ssh_cmd="ssh \"$SERVER_HOST\" \"docker logs --since='$since_docker' tdarr-clean 2>&1\""
|
||||||
|
local server_logs=$(log_command "$ssh_cmd")
|
||||||
|
local ssh_exit=$?
|
||||||
|
|
||||||
|
if [[ $ssh_exit -ne 0 ]]; then
|
||||||
|
log_error "Failed to retrieve server logs via SSH"
|
||||||
|
return 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
log_debug "Retrieved $(echo "$server_logs" | wc -l) lines of server logs"
|
||||||
|
|
||||||
|
# Process logs for limbo timeouts
|
||||||
|
local limbo_lines=$(echo "$server_logs" | grep -i "has been in limbo")
|
||||||
|
local limbo_count=$(echo "$limbo_lines" | grep -c "has been in limbo" 2>/dev/null || echo "0")
|
||||||
|
log_debug "Found $limbo_count limbo timeout entries in logs"
|
||||||
|
|
||||||
|
local timeouts=$(echo "$limbo_lines" | \
|
||||||
grep -o "/media/[^']*" | \
|
grep -o "/media/[^']*" | \
|
||||||
sed 's|/media/||')
|
sed 's|/media/||')
|
||||||
|
|
||||||
if [[ -n "$timeouts" ]]; then
|
if [[ -n "$timeouts" ]]; then
|
||||||
local count=$(echo "$timeouts" | wc -l)
|
local count=$(echo "$timeouts" | wc -l)
|
||||||
local files=$(echo "$timeouts" | head -3) # Show first 3 files
|
local files=$(echo "$timeouts" | head -3) # Show first 3 files
|
||||||
log_message "Found $count file(s) timed out in staging section"
|
log_warning "Found $count file(s) timed out in staging section"
|
||||||
|
|
||||||
|
# Log each timed out file for debugging
|
||||||
|
echo "$timeouts" | while IFS= read -r file; do
|
||||||
|
[[ -n "$file" ]] && log_debug "Timed out file: $file"
|
||||||
|
done
|
||||||
|
|
||||||
local message="# 🎬 Tdarr Monitor
|
local message="# 🎬 Tdarr Monitor
|
||||||
**$count file(s) timed out in staging section:**"
|
**$count file(s) timed out in staging section:**"
|
||||||
@ -130,21 +193,55 @@ $file_list"
|
|||||||
Files were automatically removed from staging and will retry."
|
Files were automatically removed from staging and will retry."
|
||||||
|
|
||||||
send_discord_notification "$message"
|
send_discord_notification "$message"
|
||||||
|
log_success "Sent Discord notification for $count timed out files"
|
||||||
|
else
|
||||||
|
log_debug "No files found timed out in staging section"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
local end_time=$(date +%s.%N)
|
||||||
|
local duration=$(echo "$end_time - $start_time" | bc 2>/dev/null || echo "0")
|
||||||
|
log_message "Server timeout check completed in ${duration}s"
|
||||||
}
|
}
|
||||||
|
|
||||||
# Function to check node logs for worker stalls
|
# Function to check node logs for worker stalls
|
||||||
check_node_stalls() {
|
check_node_stalls() {
|
||||||
log_message "Checking node logs for worker stalls"
|
local start_time=$(date +%s.%N)
|
||||||
|
log_message "Starting node stall check"
|
||||||
|
|
||||||
|
# Check if node container is running
|
||||||
|
local container_status=$(podman ps --format "{{.Status}}" --filter "name=$NODE_CONTAINER" 2>/dev/null)
|
||||||
|
if [[ -z "$container_status" ]]; then
|
||||||
|
log_warning "Node container '$NODE_CONTAINER' not found or not running"
|
||||||
|
return 1
|
||||||
|
fi
|
||||||
|
log_debug "Node container status: $container_status"
|
||||||
|
|
||||||
# Get node logs since last check
|
# Get node logs since last check
|
||||||
local stalls=$(podman logs --since="@$LAST_CHECK" "$NODE_CONTAINER" 2>&1 | \
|
local node_cmd="podman logs --since=\"@$LAST_CHECK\" \"$NODE_CONTAINER\" 2>&1"
|
||||||
grep -i "worker.*stalled\|worker.*disconnected")
|
local node_logs=$(log_command "$node_cmd")
|
||||||
|
local node_exit=$?
|
||||||
|
|
||||||
|
if [[ $node_exit -ne 0 ]]; then
|
||||||
|
log_error "Failed to retrieve node logs"
|
||||||
|
return 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
log_debug "Retrieved $(echo "$node_logs" | wc -l) lines of node logs"
|
||||||
|
|
||||||
|
# Process logs for worker stalls
|
||||||
|
local stalls=$(echo "$node_logs" | grep -i "worker.*stalled\|worker.*disconnected")
|
||||||
|
local stall_count=$(echo "$stalls" | grep -c "worker.*" 2>/dev/null || echo "0")
|
||||||
|
log_debug "Found $stall_count potential worker stall entries"
|
||||||
|
|
||||||
if [[ -n "$stalls" ]]; then
|
if [[ -n "$stalls" ]]; then
|
||||||
local count=$(echo "$stalls" | wc -l)
|
local count=$(echo "$stalls" | wc -l)
|
||||||
local workers=$(echo "$stalls" | grep -o "Worker [^ ]*" | sort -u | head -3)
|
local workers=$(echo "$stalls" | grep -o "Worker [^ ]*" | sort -u | head -3)
|
||||||
log_message "Found $count worker stall(s)"
|
log_warning "Found $count worker stall(s)"
|
||||||
|
|
||||||
|
# Log each stalled worker for debugging
|
||||||
|
echo "$stalls" | while IFS= read -r stall_line; do
|
||||||
|
[[ -n "$stall_line" ]] && log_debug "Worker stall: $stall_line"
|
||||||
|
done
|
||||||
|
|
||||||
local message="# 🎬 Tdarr Monitor
|
local message="# 🎬 Tdarr Monitor
|
||||||
**$count worker stall(s) detected:**"
|
**$count worker stall(s) detected:**"
|
||||||
@ -157,42 +254,75 @@ $worker_list
|
|||||||
Workers were automatically cancelled and will restart."
|
Workers were automatically cancelled and will restart."
|
||||||
|
|
||||||
send_discord_notification "$message"
|
send_discord_notification "$message"
|
||||||
|
log_success "Sent Discord notification for $count worker stalls"
|
||||||
|
else
|
||||||
|
log_debug "No worker stalls detected"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
local end_time=$(date +%s.%N)
|
||||||
|
local duration=$(echo "$end_time - $start_time" | bc 2>/dev/null || echo "0")
|
||||||
|
log_message "Node stall check completed in ${duration}s"
|
||||||
}
|
}
|
||||||
|
|
||||||
# Function to check and clean stuck work directories
|
# Function to check and clean stuck work directories
|
||||||
check_stuck_workdirs() {
|
check_stuck_workdirs() {
|
||||||
log_message "Checking for stuck work directories"
|
local start_time=$(date +%s.%N)
|
||||||
|
log_message "Starting stuck work directory check"
|
||||||
|
|
||||||
# Find work directories that are failing to be cleaned up
|
# Find work directories that are failing to be cleaned up
|
||||||
local stuck_dirs=$(ssh "$SERVER_HOST" "docker logs --since='30m' tdarr-clean 2>&1" | \
|
local workdir_cmd="ssh \"$SERVER_HOST\" \"docker logs --since='30m' tdarr-clean 2>&1\""
|
||||||
grep "ENOTEMPTY.*tdarr-workDir" | \
|
local workdir_logs=$(log_command "$workdir_cmd")
|
||||||
|
local workdir_exit=$?
|
||||||
|
|
||||||
|
if [[ $workdir_exit -ne 0 ]]; then
|
||||||
|
log_error "Failed to retrieve work directory logs from server"
|
||||||
|
return 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
local enotempty_lines=$(echo "$workdir_logs" | grep "ENOTEMPTY.*tdarr-workDir")
|
||||||
|
local enotempty_count=$(echo "$enotempty_lines" | grep -c "ENOTEMPTY" 2>/dev/null || echo "0")
|
||||||
|
log_debug "Found $enotempty_count ENOTEMPTY errors for work directories"
|
||||||
|
|
||||||
|
local stuck_dirs=$(echo "$enotempty_lines" | \
|
||||||
grep -o "tdarr-workDir[^']*" | \
|
grep -o "tdarr-workDir[^']*" | \
|
||||||
sort -u)
|
sort -u)
|
||||||
|
|
||||||
if [[ -n "$stuck_dirs" ]]; then
|
if [[ -n "$stuck_dirs" ]]; then
|
||||||
local count=$(echo "$stuck_dirs" | wc -l)
|
local count=$(echo "$stuck_dirs" | wc -l)
|
||||||
local cleaned=0
|
local cleaned=0
|
||||||
|
log_warning "Found $count stuck work directories to clean"
|
||||||
|
|
||||||
echo "$stuck_dirs" | while IFS= read -r dir; do
|
echo "$stuck_dirs" | while IFS= read -r dir; do
|
||||||
if [[ -n "$dir" ]]; then
|
if [[ -n "$dir" ]]; then
|
||||||
log_message "Attempting to clean stuck directory: $dir"
|
log_debug "Attempting to clean stuck directory: $dir"
|
||||||
|
|
||||||
# Force cleanup of stuck directory
|
# Force cleanup of stuck directory with detailed logging
|
||||||
ssh "$SERVER_HOST" "docker exec tdarr-clean sh -c '
|
local cleanup_cmd="ssh \"$SERVER_HOST\" \"docker exec tdarr-clean sh -c '
|
||||||
if [ -d \"/temp/$dir\" ]; then
|
if [ -d \"/temp/$dir\" ]; then
|
||||||
echo \"Cleaning /temp/$dir\"
|
echo \"Cleaning /temp/$dir\"
|
||||||
find \"/temp/$dir\" -type f -name \"*.tmp\" -delete 2>/dev/null
|
find \"/temp/$dir\" -type f -name \"*.tmp\" -delete 2>/dev/null
|
||||||
find \"/temp/$dir\" -type f -delete 2>/dev/null
|
find \"/temp/$dir\" -type f -delete 2>/dev/null
|
||||||
find \"/temp/$dir\" -name \".*\" -delete 2>/dev/null
|
find \"/temp/$dir\" -name \".*\" -delete 2>/dev/null
|
||||||
rmdir \"/temp/$dir\" 2>/dev/null && echo \"Successfully removed $dir\"
|
rmdir \"/temp/$dir\" 2>/dev/null && echo \"Successfully removed $dir\"
|
||||||
|
else
|
||||||
|
echo \"Directory /temp/$dir not found\"
|
||||||
fi
|
fi
|
||||||
'" && ((cleaned++))
|
'\""
|
||||||
|
|
||||||
|
local cleanup_result=$(log_command "$cleanup_cmd")
|
||||||
|
local cleanup_exit=$?
|
||||||
|
|
||||||
|
if [[ $cleanup_exit -eq 0 ]] && [[ "$cleanup_result" == *"Successfully removed"* ]]; then
|
||||||
|
((cleaned++))
|
||||||
|
log_success "Successfully cleaned directory: $dir"
|
||||||
|
else
|
||||||
|
log_error "Failed to clean directory: $dir (exit: $cleanup_exit, output: $cleanup_result)"
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
|
|
||||||
if [[ $cleaned -gt 0 ]]; then
|
if [[ $cleaned -gt 0 ]]; then
|
||||||
log_message "Successfully cleaned $cleaned stuck work directories"
|
log_success "Successfully cleaned $cleaned of $count stuck work directories"
|
||||||
local message="# 🎬 Tdarr Monitor
|
local message="# 🎬 Tdarr Monitor
|
||||||
**Successfully cleaned $cleaned stuck work directories:**
|
**Successfully cleaned $cleaned stuck work directories:**
|
||||||
- Removed partial download files (.tmp)
|
- Removed partial download files (.tmp)
|
||||||
@ -201,7 +331,7 @@ check_stuck_workdirs() {
|
|||||||
System maintenance completed automatically."
|
System maintenance completed automatically."
|
||||||
send_discord_notification "$message"
|
send_discord_notification "$message"
|
||||||
else
|
else
|
||||||
log_message "Failed to clean $count stuck work directories"
|
log_error "Failed to clean any of the $count stuck work directories"
|
||||||
local dir_list=$(echo "$stuck_dirs" | sed 's/^/- /')
|
local dir_list=$(echo "$stuck_dirs" | sed 's/^/- /')
|
||||||
local message="# 🎬 Tdarr Monitor
|
local message="# 🎬 Tdarr Monitor
|
||||||
**$count stuck work directories detected:**
|
**$count stuck work directories detected:**
|
||||||
@ -210,20 +340,45 @@ $dir_list
|
|||||||
Cleanup failed - manual intervention may be needed <@258104532423147520>."
|
Cleanup failed - manual intervention may be needed <@258104532423147520>."
|
||||||
send_discord_notification "$message"
|
send_discord_notification "$message"
|
||||||
fi
|
fi
|
||||||
|
else
|
||||||
|
log_debug "No stuck work directories detected"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
local end_time=$(date +%s.%N)
|
||||||
|
local duration=$(echo "$end_time - $start_time" | bc 2>/dev/null || echo "0")
|
||||||
|
log_message "Stuck work directory check completed in ${duration}s"
|
||||||
}
|
}
|
||||||
|
|
||||||
# Function to check for successful completions
|
# Function to check for successful completions
|
||||||
check_completions() {
|
check_completions() {
|
||||||
log_message "Checking for successful transcodes"
|
local start_time=$(date +%s.%N)
|
||||||
|
log_message "Starting completion check"
|
||||||
|
|
||||||
# Check server logs for successful transcodes
|
# Check server logs for successful transcodes
|
||||||
local since_docker=$(date -d "@$LAST_CHECK" -u +%Y-%m-%dT%H:%M:%S.000000000Z)
|
local since_docker=$(date -d "@$LAST_CHECK" -u +%Y-%m-%dT%H:%M:%S.000000000Z)
|
||||||
|
log_debug "Checking for successful transcodes since: $since_docker"
|
||||||
|
|
||||||
local successes=$(ssh "$SERVER_HOST" "docker logs --since='$since_docker' tdarr-clean 2>&1" | \
|
local completion_cmd="ssh \"$SERVER_HOST\" \"docker logs --since='$since_docker' tdarr-clean 2>&1\""
|
||||||
grep -i "transcode.*success\|transcode.*complete" | wc -l)
|
local completion_logs=$(log_command "$completion_cmd")
|
||||||
|
local completion_exit=$?
|
||||||
|
|
||||||
|
if [[ $completion_exit -ne 0 ]]; then
|
||||||
|
log_error "Failed to retrieve completion logs from server"
|
||||||
|
return 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
local success_lines=$(echo "$completion_logs" | grep -i "transcode.*success\|transcode.*complete")
|
||||||
|
local successes=$(echo "$success_lines" | wc -l)
|
||||||
|
log_debug "Found $successes successful transcode completion entries"
|
||||||
|
|
||||||
if [[ $successes -gt 0 ]]; then
|
if [[ $successes -gt 0 ]]; then
|
||||||
|
log_success "Detected $successes successful transcodes"
|
||||||
|
|
||||||
|
# Log sample success entries for debugging
|
||||||
|
echo "$success_lines" | head -3 | while IFS= read -r success_line; do
|
||||||
|
[[ -n "$success_line" ]] && log_debug "Success entry: $success_line"
|
||||||
|
done
|
||||||
|
|
||||||
local message="# 🎬 Tdarr Monitor
|
local message="# 🎬 Tdarr Monitor
|
||||||
**$successes transcode(s) completed successfully:**
|
**$successes transcode(s) completed successfully:**
|
||||||
- Processing completed without errors
|
- Processing completed without errors
|
||||||
@ -231,31 +386,66 @@ check_completions() {
|
|||||||
|
|
||||||
System operating normally."
|
System operating normally."
|
||||||
send_discord_notification "$message"
|
send_discord_notification "$message"
|
||||||
|
log_success "Sent Discord notification for $successes completions"
|
||||||
|
else
|
||||||
|
log_debug "No successful transcodes detected"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
local end_time=$(date +%s.%N)
|
||||||
|
local duration=$(echo "$end_time - $start_time" | bc 2>/dev/null || echo "0")
|
||||||
|
log_message "Completion check completed in ${duration}s"
|
||||||
}
|
}
|
||||||
|
|
||||||
# Main monitoring logic
|
# Main monitoring logic
|
||||||
# Entry point for one monitor run.
#
# Rotates the log, records the run context, skips the run when fewer than
# 900 seconds have passed since the last check, otherwise runs each check,
# logs any check that returns non-zero, and stores the current time as the
# new last-check timestamp.
#
# Globals:   CURRENT_TIME, LAST_CHECK, SERVER_HOST, NODE_CONTAINER, LOG_FILE,
#            LAST_CHECK_FILE (all read); LAST_CHECK_FILE's content is rewritten
# Returns:   exits 0 early when the run is skipped
main() {
    local script_start script_end total_duration
    local elapsed=$((CURRENT_TIME - LAST_CHECK))

    script_start=$(date +%s.%N)
    rotate_log

    log_message "=== Starting Tdarr timeout monitor check ==="
    log_debug "Last check: $(date -d "@$LAST_CHECK" 2>/dev/null || echo "Invalid timestamp: $LAST_CHECK")"
    log_debug "Current time: $(date)"
    log_debug "Time difference: $elapsed seconds"
    log_debug "Server host: $SERVER_HOST"
    log_debug "Node container: $NODE_CONTAINER"
    log_debug "Log file: $LOG_FILE (size: $(stat -c%s "$LOG_FILE" 2>/dev/null || echo "0") bytes)"

    # Only proceed if more than 15 minutes (900 seconds) since last check
    if [[ $elapsed -lt 900 ]]; then
        log_warning "Less than 15 minutes since last check, skipping (${elapsed}s elapsed)"
        exit 0
    fi

    # Perform checks with error handling: a failing check is logged and the
    # remaining checks still run
    log_message "Beginning monitoring checks..."

    if ! check_server_timeouts; then
        log_error "Server timeout check failed"
    fi

    if ! check_node_stalls; then
        log_error "Node stall check failed"
    fi

    if ! check_stuck_workdirs; then
        log_error "Stuck work directory check failed"
    fi

    # Optional: Check for successes (comment out if too noisy)
    # if ! check_completions; then
    #     log_error "Completion check failed"
    # fi

    # Update timestamp; test the write directly instead of inspecting $?
    if echo "$CURRENT_TIME" > "$LAST_CHECK_FILE"; then
        log_debug "Updated timestamp file: $LAST_CHECK_FILE"
    else
        log_error "Failed to update timestamp file: $LAST_CHECK_FILE"
    fi

    script_end=$(date +%s.%N)
    # Report 0 when bc is unavailable or cannot compute the difference
    total_duration=$(echo "$script_end - $script_start" | bc 2>/dev/null || echo "0")
    log_success "=== Monitor check completed in ${total_duration}s ==="
}
|
||||||
|
|
||||||
# Run main function
|
# Run main function
|
||||||
|
|||||||
@ -39,6 +39,7 @@ podman run -d --name "${CONTAINER_NAME}" \
|
|||||||
-e NVIDIA_DRIVER_CAPABILITIES=all \
|
-e NVIDIA_DRIVER_CAPABILITIES=all \
|
||||||
-e NVIDIA_VISIBLE_DEVICES=all \
|
-e NVIDIA_VISIBLE_DEVICES=all \
|
||||||
-v "/mnt/NV2/tdarr-cache:/cache" \
|
-v "/mnt/NV2/tdarr-cache:/cache" \
|
||||||
|
-v "/mnt/media:/app/unmappedNodeCache/nobara-pc-gpu-unmapped/media" \
|
||||||
ghcr.io/haveagitgat/tdarr_node:latest
|
ghcr.io/haveagitgat/tdarr_node:latest
|
||||||
|
|
||||||
echo "⏳ Waiting for container to initialize..."
|
echo "⏳ Waiting for container to initialize..."
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user