Patch: extend homelab-audit.sh with swap, OOM and zombie-parent checks, add a
collector output-format test script, run both tdarr containers with an init
process, and cap memory/swap for tdarr-node.

Review amendments folded into the added lines: SWAP_MB / OOM_EVENTS values are
validated as digits before the arithmetic tests, the test script accepts any
printable process name, and its header comment now describes what it does.
The "index" blob ids are those of the original commit and do not describe the
amended content. Indentation was reconstructed from whitespace-collapsed text;
confirm it against the target files before applying.

diff --git a/monitoring/scripts/homelab-audit.sh b/monitoring/scripts/homelab-audit.sh
index 15d228c..00c39fd 100755
--- a/monitoring/scripts/homelab-audit.sh
+++ b/monitoring/scripts/homelab-audit.sh
@@ -28,6 +28,7 @@ DISK_CRIT=90
 LOAD_WARN=2.0
 MEM_WARN=85
 ZOMBIE_WARN=1
+SWAP_WARN=512
 
 while [[ $# -gt 0 ]]; do
     case "$1" in
@@ -83,10 +84,31 @@ stuck_procs() {
         paste -sd,
 }
 
+# Comma-separated command names of the processes that are parents of zombies.
+zombie_parents() {
+    ps -eo pid=,ppid=,stat= | awk '\''$3 ~ /^Z/ {print $2}'\'' | sort -u | \
+        xargs -I{} ps -o comm= -p {} 2>/dev/null | paste -sd,
+}
+
+# Swap in use as a whole number: the "used" column of free divided by 1024.
+swap_mb() {
+    free | awk '\''/^Swap:/ {printf "%.0f", $3/1024}'\''
+}
+
+# Number of kernel journal lines from the last 7 days matching "out of memory".
+oom_events() {
+    local count
+    count=$(journalctl -k --since "7 days ago" 2>/dev/null | grep -ci "out of memory") || true
+    echo "${count:-0}"
+}
+
 echo "CPU_LOAD=$(cpu_load)"
 echo "MEM_PCT=$(mem_pct)"
 echo "ZOMBIES=$(zombie_count)"
 echo "STUCK_PROCS=$(stuck_procs)"
+echo "ZOMBIE_PARENTS=$(zombie_parents)"
+echo "SWAP_MB=$(swap_mb)"
+echo "OOM_EVENTS=$(oom_events)"
 disk_usage | while read -r pct mnt; do
     echo "DISK $pct $mnt"
 done
@@ -194,6 +216,26 @@ parse_and_report() {
                 echo "WARN $label: D-state procs with CPU>=${STUCK_PROC_CPU_WARN}%: ${procs}" >>"$FINDINGS_FILE"
             fi
             ;;
+        ZOMBIE_PARENTS=*)
+            local zparents="${line#ZOMBIE_PARENTS=}"
+            if [[ -n "$zparents" ]]; then
+                echo "INFO $label: zombie parent process(es): ${zparents}" >>"$FINDINGS_FILE"
+            fi
+            ;;
+        SWAP_MB=*)
+            local swap="${line#SWAP_MB=}"
+            # Digits only: (( )) would evaluate any other collector text as an expression.
+            if [[ "$swap" =~ ^[0-9]+$ ]] && ((swap >= SWAP_WARN)); then
+                echo "WARN $label: swap usage ${swap} MB >= ${SWAP_WARN} MB" >>"$FINDINGS_FILE"
+            fi
+            ;;
+        OOM_EVENTS=*)
+            local ooms="${line#OOM_EVENTS=}"
+            # Digits only: (( )) would evaluate any other collector text as an expression.
+            if [[ "$ooms" =~ ^[0-9]+$ ]] && ((ooms > 0)); then
+                echo "WARN $label: ${ooms} OOM kill event(s) in last 7 days" >>"$FINDINGS_FILE"
+            fi
+            ;;
         DISK\ *)
             local pct mnt
             read -r _ pct mnt <<<"$line"
diff --git a/monitoring/scripts/test-audit-collectors.sh b/monitoring/scripts/test-audit-collectors.sh
new file mode 100644
index 0000000..95d7e85
--- /dev/null
+++ b/monitoring/scripts/test-audit-collectors.sh
@@ -0,0 +1,99 @@
+#!/usr/bin/env bash
+# test-audit-collectors.sh — validates homelab-audit.sh collector output format
+#
+# Runs a local copy of each homelab-audit.sh collector pipeline and checks that
+# its output has the expected shape. The pipelines are duplicated here, not
+# extracted, so keep them in sync. Exits non-zero if any check failed.
+
+set -euo pipefail
+
+PASS=0
+FAIL=0
+
+pass() {
+    ((PASS++)) || true
+    echo " PASS: $1"
+}
+fail() {
+    ((FAIL++)) || true
+    echo " FAIL: $1 — $2"
+}
+
+echo "=== Collector output format tests ==="
+
+# Run each collector function locally and validate output format
+# These functions are designed to work on any Linux host
+
+# --- cpu_load ---
+result=$(uptime | awk -F'load average:' '{print $2}' | awk -F'[, ]+' '{print $2}')
+if [[ "$result" =~ ^[0-9]+\.?[0-9]*$ ]]; then
+    pass "cpu_load returns numeric value: $result"
+else
+    fail "cpu_load" "expected numeric, got: '$result'"
+fi
+
+# --- mem_pct ---
+result=$(free | awk '/^Mem:/ {printf "%.0f", $3/$2*100}')
+if [[ "$result" =~ ^[0-9]+$ ]] && ((result >= 0 && result <= 100)); then
+    pass "mem_pct returns percentage: $result"
+else
+    fail "mem_pct" "expected 0-100, got: '$result'"
+fi
+
+# --- zombie_count ---
+result=$(ps -eo stat= | grep -c "^Z" || true)
+if [[ "$result" =~ ^[0-9]+$ ]]; then
+    pass "zombie_count returns integer: $result"
+else
+    fail "zombie_count" "expected integer, got: '$result'"
+fi
+
+# --- zombie_parents ---
+# May be empty if no zombies. Names may hold ':', '+' or spaces, so allow any printable.
+result=$(ps -eo pid=,ppid=,stat= | awk '$3 ~ /^Z/ {print $2}' | sort -u |
+    xargs -I{} ps -o comm= -p {} 2>/dev/null | paste -sd, || true)
+if [[ -z "$result" || "$result" =~ ^[[:print:]]+$ ]]; then
+    pass "zombie_parents returns csv or empty: '${result:-}'"
+else
+    fail "zombie_parents" "unexpected format: '$result'"
+fi
+
+# --- swap_mb ---
+result=$(free | awk '/^Swap:/ {printf "%.0f", $3/1024}')
+if [[ "$result" =~ ^[0-9]+$ ]]; then
+    pass "swap_mb returns integer MB: $result"
+else
+    fail "swap_mb" "expected integer, got: '$result'"
+fi
+
+# --- oom_events ---
+result=$(journalctl -k --since "7 days ago" 2>/dev/null | grep -ci "out of memory") || true
+result="${result:-0}"
+if [[ "$result" =~ ^[0-9]+$ ]]; then
+    pass "oom_events returns integer: $result"
+else
+    fail "oom_events" "expected integer, got: '$result'"
+fi
+
+# --- stuck_procs ---
+# May be empty. Names such as kworker/0:1H contain ':', so allow any printable.
+result=$(ps -eo stat=,pcpu=,comm= |
+    awk '$1 ~ /^D/ && $2+0 >= 10 {print $3}' | paste -sd, || true)
+if [[ -z "$result" || "$result" =~ ^[[:print:]]+$ ]]; then
+    pass "stuck_procs returns csv or empty: '${result:-}'"
+else
+    fail "stuck_procs" "unexpected format: '$result'"
+fi
+
+# --- disk_usage format ---
+result=$(df --output=pcent,target -x tmpfs -x devtmpfs 2>/dev/null | tail -n +2 | head -1 |
+    while read -r pct mnt; do echo "${pct%%%} $mnt"; done)
+if [[ "$result" =~ ^[0-9]+\ / ]]; then
+    pass "disk_usage returns 'pct mount' format: $result"
+else
+    fail "disk_usage" "expected 'N /path', got: '$result'"
+fi
+
+echo ""
+echo "=== Results: $PASS passed, $FAIL failed ==="
+((FAIL == 0))
diff --git a/server-configs/ubuntu-manticore/docker-compose/tdarr/docker-compose.yml b/server-configs/ubuntu-manticore/docker-compose/tdarr/docker-compose.yml
index cec8ed0..9f51d38 100644
--- a/server-configs/ubuntu-manticore/docker-compose/tdarr/docker-compose.yml
+++ b/server-configs/ubuntu-manticore/docker-compose/tdarr/docker-compose.yml
@@ -3,6 +3,7 @@ services:
   tdarr:
     image: ghcr.io/haveagitgat/tdarr:latest
     container_name: tdarr-server
+    init: true
     restart: unless-stopped
     ports:
       - "8265:8265" # Web UI
@@ -23,7 +24,10 @@ services:
   tdarr-node:
     image: ghcr.io/haveagitgat/tdarr_node:latest
     container_name: tdarr-node
+    init: true
     restart: unless-stopped
+    mem_limit: 28g
+    memswap_limit: 30g
     environment:
       - PUID=1000
       - PGID=1000