From f28dfeb4bf66674d325dd6c0e2d63faf83e14bae Mon Sep 17 00:00:00 2001 From: Cal Corum Date: Thu, 2 Apr 2026 21:02:05 -0500 Subject: [PATCH 1/2] feat: add zombie parent, swap, and OOM metrics to audit; harden Tdarr containers Extend homelab-audit.sh collector with zombie_parents(), swap_mb(), and oom_events() functions so the audit identifies which process spawns zombies, flags high swap usage, and reports recent OOM kills. Add init: true to both Tdarr docker-compose services so tini reaps orphaned ffmpeg children, and cap tdarr-node at 28g RAM / 30g total to prevent unbounded memory use. Closes #30 Co-Authored-By: Claude Opus 4.6 (1M context) --- monitoring/scripts/homelab-audit.sh | 37 +++++++ monitoring/scripts/test-audit-collectors.sh | 99 +++++++++++++++++++ .../docker-compose/tdarr/docker-compose.yml | 4 + 3 files changed, 140 insertions(+) create mode 100644 monitoring/scripts/test-audit-collectors.sh diff --git a/monitoring/scripts/homelab-audit.sh b/monitoring/scripts/homelab-audit.sh index 15d228c..00c39fd 100755 --- a/monitoring/scripts/homelab-audit.sh +++ b/monitoring/scripts/homelab-audit.sh @@ -28,6 +28,7 @@ DISK_CRIT=90 LOAD_WARN=2.0 MEM_WARN=85 ZOMBIE_WARN=1 +SWAP_WARN=512 while [[ $# -gt 0 ]]; do case "$1" in @@ -83,10 +84,28 @@ stuck_procs() { paste -sd, } +zombie_parents() { + ps -eo pid=,ppid=,stat= | awk '\''$3 ~ /^Z/ {print $2}'\'' | sort -u | \ + xargs -I{} ps -o comm= -p {} 2>/dev/null | paste -sd, +} + +swap_mb() { + free | awk '\''/^Swap:/ {printf "%.0f", $3/1024}'\'' +} + +oom_events() { + local count + count=$(journalctl -k --since "7 days ago" 2>/dev/null | grep -ci "out of memory") || true + echo "${count:-0}" +} + echo "CPU_LOAD=$(cpu_load)" echo "MEM_PCT=$(mem_pct)" echo "ZOMBIES=$(zombie_count)" echo "STUCK_PROCS=$(stuck_procs)" +echo "ZOMBIE_PARENTS=$(zombie_parents)" +echo "SWAP_MB=$(swap_mb)" +echo "OOM_EVENTS=$(oom_events)" disk_usage | while read -r pct mnt; do echo "DISK $pct $mnt" done @@ -194,6 +213,24 @@ parse_and_report() 
{ echo "WARN $label: D-state procs with CPU>=${STUCK_PROC_CPU_WARN}%: ${procs}" >>"$FINDINGS_FILE" fi ;; + ZOMBIE_PARENTS=*) + local zparents="${line#ZOMBIE_PARENTS=}" + if [[ -n "$zparents" ]]; then + echo "INFO $label: zombie parent process(es): ${zparents}" >>"$FINDINGS_FILE" + fi + ;; + SWAP_MB=*) + local swap="${line#SWAP_MB=}" + if [[ -n "$swap" ]] && ((swap >= SWAP_WARN)); then + echo "WARN $label: swap usage ${swap} MB >= ${SWAP_WARN} MB" >>"$FINDINGS_FILE" + fi + ;; + OOM_EVENTS=*) + local ooms="${line#OOM_EVENTS=}" + if [[ -n "$ooms" ]] && ((ooms > 0)); then + echo "WARN $label: ${ooms} OOM kill event(s) in last 7 days" >>"$FINDINGS_FILE" + fi + ;; DISK\ *) local pct mnt read -r _ pct mnt <<<"$line" diff --git a/monitoring/scripts/test-audit-collectors.sh b/monitoring/scripts/test-audit-collectors.sh new file mode 100644 index 0000000..95d7e85 --- /dev/null +++ b/monitoring/scripts/test-audit-collectors.sh @@ -0,0 +1,99 @@ +#!/usr/bin/env bash +# test-audit-collectors.sh — validates homelab-audit.sh collector output format +# +# Extracts each collector function from the audit script's COLLECTOR_SCRIPT +# heredoc and runs it locally, checking that output matches the expected +# key=value format. Exits non-zero on first failure. 
+ +set -euo pipefail + +PASS=0 +FAIL=0 + +pass() { + ((PASS++)) || true + echo " PASS: $1" +} +fail() { + ((FAIL++)) || true + echo " FAIL: $1 — $2" +} + +echo "=== Collector output format tests ===" + +# Run each collector function locally and validate output format +# These functions are designed to work on any Linux host + +# --- cpu_load --- +result=$(uptime | awk -F'load average:' '{print $2}' | awk -F'[, ]+' '{print $2}') +if [[ "$result" =~ ^[0-9]+\.?[0-9]*$ ]]; then + pass "cpu_load returns numeric value: $result" +else + fail "cpu_load" "expected numeric, got: '$result'" +fi + +# --- mem_pct --- +result=$(free | awk '/^Mem:/ {printf "%.0f", $3/$2*100}') +if [[ "$result" =~ ^[0-9]+$ ]] && ((result >= 0 && result <= 100)); then + pass "mem_pct returns percentage: $result" +else + fail "mem_pct" "expected 0-100, got: '$result'" +fi + +# --- zombie_count --- +result=$(ps -eo stat= | grep -c "^Z" || true) +if [[ "$result" =~ ^[0-9]+$ ]]; then + pass "zombie_count returns integer: $result" +else + fail "zombie_count" "expected integer, got: '$result'" +fi + +# --- zombie_parents --- +# May be empty if no zombies — that's valid +result=$(ps -eo pid=,ppid=,stat= | awk '$3 ~ /^Z/ {print $2}' | sort -u | + xargs -I{} ps -o comm= -p {} 2>/dev/null | paste -sd, || true) +if [[ -z "$result" || "$result" =~ ^[a-zA-Z0-9_.,/-]+$ ]]; then + pass "zombie_parents returns csv or empty: '${result:-}'" +else + fail "zombie_parents" "unexpected format: '$result'" +fi + +# --- swap_mb --- +result=$(free | awk '/^Swap:/ {printf "%.0f", $3/1024}') +if [[ "$result" =~ ^[0-9]+$ ]]; then + pass "swap_mb returns integer MB: $result" +else + fail "swap_mb" "expected integer, got: '$result'" +fi + +# --- oom_events --- +result=$(journalctl -k --since "7 days ago" 2>/dev/null | grep -ci "out of memory") || true +result="${result:-0}" +if [[ "$result" =~ ^[0-9]+$ ]]; then + pass "oom_events returns integer: $result" +else + fail "oom_events" "expected integer, got: '$result'" +fi + +# --- 
stuck_procs --- +# May be empty — that's valid +result=$(ps -eo stat=,pcpu=,comm= | + awk '$1 ~ /^D/ && $2+0 >= 10 {print $3}' | paste -sd, || true) +if [[ -z "$result" || "$result" =~ ^[a-zA-Z0-9_.,/-]+$ ]]; then + pass "stuck_procs returns csv or empty: '${result:-}'" +else + fail "stuck_procs" "unexpected format: '$result'" +fi + +# --- disk_usage format --- +result=$(df --output=pcent,target -x tmpfs -x devtmpfs 2>/dev/null | tail -n +2 | head -1 | + while read -r pct mnt; do echo "${pct%%%} $mnt"; done) +if [[ "$result" =~ ^[0-9]+\ / ]]; then + pass "disk_usage returns 'pct mount' format: $result" +else + fail "disk_usage" "expected 'N /path', got: '$result'" +fi + +echo "" +echo "=== Results: $PASS passed, $FAIL failed ===" +((FAIL == 0)) diff --git a/server-configs/ubuntu-manticore/docker-compose/tdarr/docker-compose.yml b/server-configs/ubuntu-manticore/docker-compose/tdarr/docker-compose.yml index cec8ed0..9f51d38 100644 --- a/server-configs/ubuntu-manticore/docker-compose/tdarr/docker-compose.yml +++ b/server-configs/ubuntu-manticore/docker-compose/tdarr/docker-compose.yml @@ -3,6 +3,7 @@ services: tdarr: image: ghcr.io/haveagitgat/tdarr:latest container_name: tdarr-server + init: true restart: unless-stopped ports: - "8265:8265" # Web UI @@ -23,7 +24,10 @@ services: tdarr-node: image: ghcr.io/haveagitgat/tdarr_node:latest container_name: tdarr-node + init: true restart: unless-stopped + mem_limit: 28g + memswap_limit: 30g environment: - PUID=1000 - PGID=1000 From e58c5b8cc10ede3db1f16013215bff7da28eb483 Mon Sep 17 00:00:00 2001 From: Cal Corum Date: Thu, 2 Apr 2026 21:05:12 -0500 Subject: [PATCH 2/2] =?UTF-8?q?fix:=20address=20PR=20review=20=E2=80=94=20?= =?UTF-8?q?move=20memory=20limits=20to=20deploy=20block,=20handle=20swap-l?= =?UTF-8?q?ess=20hosts?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move mem_limit/memswap_limit to deploy.resources.limits.memory so the constraint is actually enforced under 
Compose v3. Only the 28g memory limit is carried over; the 30g memswap_limit is removed and has no replacement in this change. Add END clause to swap_mb() so hosts without a Swap line report 0 instead of empty output. Fix test script header comment accuracy. Co-Authored-By: Claude Opus 4.6 (1M context) --- monitoring/scripts/homelab-audit.sh | 2 +- monitoring/scripts/test-audit-collectors.sh | 5 ++--- .../ubuntu-manticore/docker-compose/tdarr/docker-compose.yml | 4 ++-- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/monitoring/scripts/homelab-audit.sh b/monitoring/scripts/homelab-audit.sh index 00c39fd..3e25aa3 100755 --- a/monitoring/scripts/homelab-audit.sh +++ b/monitoring/scripts/homelab-audit.sh @@ -90,7 +90,7 @@ zombie_parents() { } swap_mb() { - free | awk '\''/^Swap:/ {printf "%.0f", $3/1024}'\'' + free | awk '\''/^Swap:/ {printf "%.0f", $3/1024; found=1} END {if (!found) print "0"}'\'' } oom_events() { diff --git a/monitoring/scripts/test-audit-collectors.sh b/monitoring/scripts/test-audit-collectors.sh index 95d7e85..149aa98 100644 --- a/monitoring/scripts/test-audit-collectors.sh +++ b/monitoring/scripts/test-audit-collectors.sh @@ -1,9 +1,8 @@ #!/usr/bin/env bash # test-audit-collectors.sh — validates homelab-audit.sh collector output format # -# Extracts each collector function from the audit script's COLLECTOR_SCRIPT -# heredoc and runs it locally, checking that output matches the expected -# key=value format. Exits non-zero on first failure. +# Re-implements each collector function inline and runs it locally, checking +# that output matches the expected format. Exits non-zero on any failure. 
set -euo pipefail diff --git a/server-configs/ubuntu-manticore/docker-compose/tdarr/docker-compose.yml b/server-configs/ubuntu-manticore/docker-compose/tdarr/docker-compose.yml index 9f51d38..73e8c70 100644 --- a/server-configs/ubuntu-manticore/docker-compose/tdarr/docker-compose.yml +++ b/server-configs/ubuntu-manticore/docker-compose/tdarr/docker-compose.yml @@ -26,8 +26,6 @@ services: container_name: tdarr-node init: true restart: unless-stopped - mem_limit: 28g - memswap_limit: 30g environment: - PUID=1000 - PGID=1000 @@ -41,6 +39,8 @@ services: - /mnt/NV2/tdarr-cache:/temp deploy: resources: + limits: + memory: 28g reservations: devices: - driver: nvidia