feat: add zombie parent, swap, and OOM metrics to audit; harden Tdarr containers
All checks were successful
Auto-merge docs-only PRs / auto-merge-docs (pull_request) Successful in 3s

Extend homelab-audit.sh collector with zombie_parents(), swap_mb(), and
oom_events() functions so the audit identifies which process spawns zombies,
flags high swap usage, and reports recent OOM kills. Add init: true to both
Tdarr docker-compose services so tini reaps orphaned ffmpeg children, and
cap tdarr-node at 28g RAM / 30g total to prevent unbounded memory use.

Closes #30

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Cal Corum 2026-04-02 21:02:05 -05:00
parent 1ed911e61b
commit f28dfeb4bf
3 changed files with 140 additions and 0 deletions

View File

@ -28,6 +28,7 @@ DISK_CRIT=90
LOAD_WARN=2.0
MEM_WARN=85
ZOMBIE_WARN=1
SWAP_WARN=512
while [[ $# -gt 0 ]]; do
case "$1" in
@ -83,10 +84,28 @@ stuck_procs() {
paste -sd,
}
# Print a comma-separated list of command names for the parents of zombie
# processes. Rows of ps output whose stat starts with Z are selected, their
# parent PIDs are de-duplicated, and each PID is resolved to its comm name.
# Stderr from the name lookup is discarded.
# NOTE(review): the escaped quotes around the awk program suggest this function
# is embedded in a single-quoted string; keep that escaping when editing.
zombie_parents() {
ps -eo pid=,ppid=,stat= | awk '\''$3 ~ /^Z/ {print $2}'\'' | sort -u | \
xargs -I{} ps -o comm= -p {} 2>/dev/null | paste -sd,
}
# Print the third column of the "Swap:" row reported by free, divided by 1024
# and rounded to a whole number (no trailing newline). Prints nothing when free
# emits no "Swap:" row.
# NOTE(review): presumably free reports KiB here, making the result MB; confirm
# for the free implementation on the audited hosts.
swap_mb() {
free | awk '\''/^Swap:/ {printf "%.0f", $3/1024}'\''
}
# Print how many kernel journal lines from the last 7 days contain
# "out of memory" (case-insensitive). Falls back to 0 when no count is
# captured; journalctl errors are silenced.
oom_events() {
  local matches
  # grep -c exits non-zero on a zero count, so the status is ignored.
  matches=$(journalctl -k --since "7 days ago" 2>/dev/null | grep -ci "out of memory") || true
  [ -n "$matches" ] || matches=0
  echo "$matches"
}
# Emit one KEY=value line per metric, followed by one "DISK <pct> <mount>"
# line for each row produced by disk_usage.
echo "CPU_LOAD=$(cpu_load)"
echo "MEM_PCT=$(mem_pct)"
echo "ZOMBIES=$(zombie_count)"
echo "STUCK_PROCS=$(stuck_procs)"
echo "ZOMBIE_PARENTS=$(zombie_parents)"
echo "SWAP_MB=$(swap_mb)"
echo "OOM_EVENTS=$(oom_events)"
disk_usage | while read -r pct mnt; do
echo "DISK $pct $mnt"
done
@ -194,6 +213,24 @@ parse_and_report() {
echo "WARN $label: D-state procs with CPU>=${STUCK_PROC_CPU_WARN}%: ${procs}" >>"$FINDINGS_FILE"
fi
;;
ZOMBIE_PARENTS=*)
local zparents="${line#ZOMBIE_PARENTS=}"
if [[ -n "$zparents" ]]; then
echo "INFO $label: zombie parent process(es): ${zparents}" >>"$FINDINGS_FILE"
fi
;;
SWAP_MB=*)
local swap="${line#SWAP_MB=}"
if [[ -n "$swap" ]] && ((swap >= SWAP_WARN)); then
echo "WARN $label: swap usage ${swap} MB >= ${SWAP_WARN} MB" >>"$FINDINGS_FILE"
fi
;;
OOM_EVENTS=*)
local ooms="${line#OOM_EVENTS=}"
if [[ -n "$ooms" ]] && ((ooms > 0)); then
echo "WARN $label: ${ooms} OOM kill event(s) in last 7 days" >>"$FINDINGS_FILE"
fi
;;
DISK\ *)
local pct mnt
read -r _ pct mnt <<<"$line"

View File

@ -0,0 +1,99 @@
#!/usr/bin/env bash
# test-audit-collectors.sh — validates homelab-audit.sh collector output format
#
# Runs a local copy of each collector pipeline from the audit script's
# COLLECTOR_SCRIPT heredoc (the pipelines are duplicated here, not extracted
# at runtime) and checks that each value has the expected format. Every check
# is reported; the script exits non-zero at the end if any check failed.
# Strict mode: exit on unhandled command failure, on use of an unset variable,
# and when any stage of a pipeline fails.
set -euo pipefail
# Running totals of passed and failed checks, updated by pass() and fail().
PASS=0
FAIL=0
# Record one passing check and print its description.
# Globals:   PASS (incremented)
# Arguments: $1 - description of the check that passed
# Outputs:   " PASS: <description>" on stdout
pass() {
  PASS=$((PASS + 1))
  printf ' PASS: %s\n' "$1"
}
# Record one failing check and print it.
# Globals:   FAIL (incremented)
# Arguments: $1 - name of the check that failed
#            $2 - optional detail message
# Outputs:   " FAIL: <name>: <detail>" on stdout, or " FAIL: <name>" when no
#            detail is given
fail() {
  FAIL=$((FAIL + 1))
  # ${2:+...} inserts the separator only when a detail exists, and keeps a
  # one-argument call from aborting under set -u.
  echo " FAIL: $1${2:+: $2}"
}
echo "=== Collector output format tests ==="
# Each check below runs one collector pipeline on the local host and
# validates the shape of what it prints.
# --- cpu_load ---
# The first figure after "load average:" must look like a number.
numeric_re='^[0-9]+\.?[0-9]*$'
result=$(uptime | awk -F'load average:' '{print $2}' | awk -F'[, ]+' '{print $2}')
if ! [[ "$result" =~ $numeric_re ]]; then
  fail "cpu_load" "expected numeric, got: '$result'"
else
  pass "cpu_load returns numeric value: $result"
fi
# --- mem_pct ---
# Used/total from the "Mem:" row of free, as a whole percentage in 0..100.
result=$(free | awk '/^Mem:/ {printf "%.0f", $3/$2*100}')
# The range test only runs once the value is known to be all digits.
if ! [[ "$result" =~ ^[0-9]+$ ]] || ((result < 0 || result > 100)); then
  fail "mem_pct" "expected 0-100, got: '$result'"
else
  pass "mem_pct returns percentage: $result"
fi
# --- zombie_count ---
# Number of processes whose stat starts with Z. grep -c exits 1 when the count
# is zero, so its status is deliberately ignored.
result=$(ps -eo stat= | grep -c "^Z") || true
if ! [[ "$result" =~ ^[0-9]+$ ]]; then
  fail "zombie_count" "expected integer, got: '$result'"
else
  pass "zombie_count returns integer: $result"
fi
# --- zombie_parents ---
# Comma-joined command names of the parents of zombie processes. Empty output
# is valid (no zombies). Names come from ps comm and may legitimately contain
# spaces, ":", "+" or parentheses (for example "tmux: server"), so the check
# requires non-empty comma-separated fields free of control characters instead
# of a fixed character whitelist.
csv_re='^[^,[:cntrl:]]+(,[^,[:cntrl:]]+)*$'
result=$(ps -eo pid=,ppid=,stat= | awk '$3 ~ /^Z/ {print $2}' | sort -u |
  xargs -I{} ps -o comm= -p {} 2>/dev/null | paste -sd, || true)
if [[ -z "$result" || "$result" =~ $csv_re ]]; then
  pass "zombie_parents returns csv or empty: '${result:-<empty>}'"
else
  fail "zombie_parents" "unexpected format: '$result'"
fi
# --- swap_mb ---
# Third column of the "Swap:" row of free divided by 1024; must be all digits.
integer_re='^[0-9]+$'
result=$(free | awk '/^Swap:/ {printf "%.0f", $3/1024}')
if ! [[ "$result" =~ $integer_re ]]; then
  fail "swap_mb" "expected integer, got: '$result'"
else
  pass "swap_mb returns integer MB: $result"
fi
# --- oom_events ---
# Count of kernel journal lines from the last 7 days mentioning
# "out of memory"; journalctl errors are silenced and grep's non-zero status
# on a zero count is ignored.
result=$(journalctl -k --since "7 days ago" 2>/dev/null | grep -ci "out of memory") || true
# Default to 0 when nothing was captured.
: "${result:=0}"
if ! [[ "$result" =~ ^[0-9]+$ ]]; then
  fail "oom_events" "expected integer, got: '$result'"
else
  pass "oom_events returns integer: $result"
fi
# --- stuck_procs ---
# Comma-joined first word of the command name of every process in D state
# using at least 10% CPU. Empty output is valid. Such processes are often
# kernel workers whose names contain ":" or "+" (for example
# "kworker/u8:2+flush"), so the check requires non-empty comma-separated
# fields free of control characters instead of a fixed character whitelist.
csv_re='^[^,[:cntrl:]]+(,[^,[:cntrl:]]+)*$'
result=$(ps -eo stat=,pcpu=,comm= |
  awk '$1 ~ /^D/ && $2+0 >= 10 {print $3}' | paste -sd, || true)
if [[ -z "$result" || "$result" =~ $csv_re ]]; then
  pass "stuck_procs returns csv or empty: '${result:-<empty>}'"
else
  fail "stuck_procs" "unexpected format: '$result'"
fi
# --- disk_usage format ---
# Take the first data row of df (line 2, after the header) and strip the "%"
# from the usage column.
# sed -n 2p reads its whole input, so no upstream stage is killed by SIGPIPE
# the way it can be with head on a long mount table. The trailing || true
# keeps a non-zero df exit (for example an inaccessible mount) from aborting
# the script under pipefail/errexit; the format check below still decides
# pass or fail.
result=$(df --output=pcent,target -x tmpfs -x devtmpfs 2>/dev/null | sed -n '2p' |
  while read -r pct mnt; do echo "${pct%%%} $mnt"; done) || true
if [[ "$result" =~ ^[0-9]+\ / ]]; then
  pass "disk_usage returns 'pct mount' format: $result"
else
  fail "disk_usage" "expected 'N /path', got: '$result'"
fi
# Print a blank line and the totals; the status of the final test becomes the
# exit status of the script (non-zero when any check failed).
echo
echo "=== Results: $PASS passed, $FAIL failed ==="
[[ "$FAIL" -eq 0 ]]

View File

@ -3,6 +3,7 @@ services:
tdarr:
image: ghcr.io/haveagitgat/tdarr:latest
container_name: tdarr-server
init: true
restart: unless-stopped
ports:
- "8265:8265" # Web UI
@ -23,7 +24,10 @@ services:
tdarr-node:
image: ghcr.io/haveagitgat/tdarr_node:latest
container_name: tdarr-node
init: true
restart: unless-stopped
mem_limit: 28g
memswap_limit: 30g
environment:
- PUID=1000
- PGID=1000