Merge pull request 'feat: zombie parent, swap, and OOM metrics + Tdarr hardening' (#35) from chore/30-investigate-manticore-zombies-swap into main

This commit is contained in:
cal 2026-04-03 02:05:46 +00:00
commit 3e3d2ada31
3 changed files with 139 additions and 0 deletions

View File

@ -28,6 +28,7 @@ DISK_CRIT=90
# NOTE(review): the consumers of the next three thresholds are not visible in
# this hunk; the names suggest a load-average limit, a memory-used percentage
# and a zombie-process count - confirm against the code that reads them.
LOAD_WARN=2.0
MEM_WARN=85
ZOMBIE_WARN=1
# Swap-usage warning threshold in MB: a finding is written when the collected
# SWAP_MB value is greater than or equal to this number.
SWAP_WARN=512
while [[ $# -gt 0 ]]; do
case "$1" in
@ -83,10 +84,28 @@ stuck_procs() {
paste -sd,
}
# zombie_parents: print the command name of every process that currently owns
# at least one zombie child, joined with commas on a single line. Prints an
# empty line when no zombies exist.
# NOTE(review): the escaped quote sequences around the awk program suggest this
# function is embedded in a single-quoted script literal - keep any text added
# here free of bare single quotes.
zombie_parents() {
  # Column 3 of the ps listing is the process state; a leading Z marks a
  # zombie. Its parent PID (column 2) is de-duplicated, then each parent PID
  # is resolved to a command name. Parents that vanish in between are skipped
  # quietly because stderr is discarded.
  ps -eo pid=,ppid=,stat= |
    awk '\''substr($3, 1, 1) == "Z" {print $2}'\'' |
    sort -u |
    xargs -I PARENT ps -o comm= -p PARENT 2>/dev/null |
    paste -sd,
}
# swap_mb: print the used-swap figure from the Swap row of free, divided by
# 1024 and rounded to a whole number; prints 0 when free shows no Swap row.
# NOTE(review): the result is MB only if free reports KiB (its usual default) -
# confirm on the target hosts.
swap_mb() {
  free | awk '\''
    /^Swap:/ {
      # Field 3 is the "used" column of the Swap row.
      printf "%.0f", $3 / 1024
      seen = 1
    }
    END {
      if (!seen) print "0"
    }
  '\''
}
# oom_events: print how many kernel-journal lines from the last 7 days contain
# "out of memory" (case-insensitive). Always prints a number; 0 when nothing
# matches or the journal cannot be read.
oom_events() {
  local hits=""
  # grep -c exits non-zero when its count is zero, so the pipeline status is
  # deliberately ignored; journalctl diagnostics are discarded as well.
  hits=$(journalctl -k --since "7 days ago" 2>/dev/null | grep -ci "out of memory") || :
  printf "%s\n" "${hits:-0}"
}
# Emit one KEY=value line per scalar metric, in a fixed order. Each value is
# whatever the corresponding collector function printed.
printf "CPU_LOAD=%s\n" "$(cpu_load)"
printf "MEM_PCT=%s\n" "$(mem_pct)"
printf "ZOMBIES=%s\n" "$(zombie_count)"
printf "STUCK_PROCS=%s\n" "$(stuck_procs)"
printf "ZOMBIE_PARENTS=%s\n" "$(zombie_parents)"
printf "SWAP_MB=%s\n" "$(swap_mb)"
printf "OOM_EVENTS=%s\n" "$(oom_events)"
# Disk usage is multi-valued: every row printed by disk_usage becomes its own
# "DISK <first field> <rest of line>" record.
disk_usage | while read -r used_pct mount_point; do
  printf "DISK %s %s\n" "$used_pct" "$mount_point"
done
@ -194,6 +213,24 @@ parse_and_report() {
echo "WARN $label: D-state procs with CPU>=${STUCK_PROC_CPU_WARN}%: ${procs}" >>"$FINDINGS_FILE"
fi
;;
ZOMBIE_PARENTS=*)
  # Informational only: records the parent command name(s) reported for
  # zombie processes. An empty value produces no finding.
  local zparents="${line#ZOMBIE_PARENTS=}"
  if [[ -n "$zparents" ]]; then
    echo "INFO $label: zombie parent process(es): ${zparents}" >>"$FINDINGS_FILE"
  fi
  ;;
SWAP_MB=*)
  # Warn when used swap (MB) reaches SWAP_WARN.
  # The value is text received from the collector, so it must be all digits
  # before it reaches ((...)): bash evaluates variable contents as
  # expressions there, which means non-numeric text raises an arithmetic
  # error and text containing an array subscript can even run a command
  # substitution. The 10# prefix forces base 10 so a leading zero is never
  # read as octal.
  local swap="${line#SWAP_MB=}"
  if [[ "$swap" =~ ^[0-9]+$ ]] && ((10#$swap >= SWAP_WARN)); then
    echo "WARN $label: swap usage ${swap} MB >= ${SWAP_WARN} MB" >>"$FINDINGS_FILE"
  fi
  ;;
OOM_EVENTS=*)
  # Warn when the collector counted at least one out-of-memory journal line.
  # Same digits-only guard as above before any arithmetic evaluation.
  local ooms="${line#OOM_EVENTS=}"
  if [[ "$ooms" =~ ^[0-9]+$ ]] && ((10#$ooms > 0)); then
    echo "WARN $label: ${ooms} OOM kill event(s) in last 7 days" >>"$FINDINGS_FILE"
  fi
  ;;
DISK\ *)
local pct mnt
read -r _ pct mnt <<<"$line"

View File

@ -0,0 +1,98 @@
#!/usr/bin/env bash
# test-audit-collectors.sh — validates homelab-audit.sh collector output format
#
# Re-implements each collector function inline and runs it locally, checking
# that output matches the expected format. Exits non-zero on any failure.
# Strict mode: stop on unhandled errors, on unset variables, and when any
# stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# Running tallies, incremented by pass() and fail() and printed in the summary.
PASS=0
FAIL=0
# pass MESSAGE
# Counts one passing check in PASS and prints MESSAGE with the PASS prefix.
pass() {
  # Plain assignment instead of ((PASS++)): the post-increment form returns a
  # failure status when the old value is 0, which would trip errexit.
  PASS=$((PASS + 1))
  printf ' PASS: %s\n' "$1"
}
# fail NAME [DETAIL]
# Counts one failing check in FAIL and prints " FAIL: NAME: DETAIL".
#   NAME   - short identifier of the check that failed
#   DETAIL - optional explanation; when omitted only NAME is printed
fail() {
  # ${2:-} keeps a one-argument call from aborting under nounset.
  local name=$1 detail=${2:-}
  FAIL=$((FAIL + 1))
  # Put a separator between the name and the detail; printing them back to
  # back produced unreadable output such as "cpu_loadexpected numeric".
  printf ' FAIL: %s%s\n' "$name" "${detail:+: $detail}"
}
echo "=== Collector output format tests ==="

# Run each collector function locally and validate output format
# These functions are designed to work on any Linux host

# --- cpu_load ---
# Takes the text after "load average:" in the uptime output and keeps the
# first figure; the check accepts digits with an optional fractional part.
result=$(uptime | awk -F'load average:' '{print $2}' | awk -F'[, ]+' '{print $2}')
numeric_re='^[0-9]+\.?[0-9]*$'
if ! [[ "$result" =~ $numeric_re ]]; then
  fail "cpu_load" "expected numeric, got: '$result'"
else
  pass "cpu_load returns numeric value: $result"
fi
# --- mem_pct ---
# Field 3 divided by field 2 of the Mem row of free, as a whole-number
# percentage.
result=$(free | awk '/^Mem:/ {printf "%.0f", $3/$2*100}')
# The digits-only test comes first so the range comparison is never evaluated
# on non-numeric text.
if [[ ! "$result" =~ ^[0-9]+$ ]] || ((result < 0 || result > 100)); then
  fail "mem_pct" "expected 0-100, got: '$result'"
else
  pass "mem_pct returns percentage: $result"
fi
# --- zombie_count ---
# Counts processes whose state starts with Z. grep -c exits non-zero on a
# zero count, hence the "|| true" inside the substitution.
result=$(ps -eo stat= | grep -c "^Z" || true)
integer_re='^[0-9]+$'
if ! [[ "$result" =~ $integer_re ]]; then
  fail "zombie_count" "expected integer, got: '$result'"
else
  pass "zombie_count returns integer: $result"
fi
# --- zombie_parents ---
# May be empty if no zombies — that's valid
result=$(ps -eo pid=,ppid=,stat= | awk '$3 ~ /^Z/ {print $2}' | sort -u |
  xargs -I{} ps -o comm= -p {} 2>/dev/null | paste -sd, || true)
# Command names are free-form text and may contain spaces, colons or
# parentheses (for example "tmux: server"), so a narrow character whitelist
# fails on healthy output. Require only printable text on a single line:
# [[:print:]] excludes newlines and other control characters.
if [[ -z "$result" || "$result" =~ ^[[:print:]]+$ ]]; then
  pass "zombie_parents returns csv or empty: '${result:-<empty>}'"
else
  fail "zombie_parents" "unexpected format: '$result'"
fi
# --- swap_mb ---
# Field 3 of the Swap row of free, divided by 1024 and rounded. The END
# clause prints 0 when free shows no Swap row, matching the swap_mb collector
# in homelab-audit.sh; without it this copy yields an empty string and reports
# a failure the real collector would not have.
result=$(free | awk '/^Swap:/ {printf "%.0f", $3/1024; found=1} END {if (!found) print "0"}')
if [[ "$result" =~ ^[0-9]+$ ]]; then
  pass "swap_mb returns integer MB: $result"
else
  fail "swap_mb" "expected integer, got: '$result'"
fi
# --- oom_events ---
# Counts kernel-journal lines from the last 7 days that mention
# "out of memory". The pipeline status is ignored because grep -c exits
# non-zero on a zero count; an empty capture is treated as 0.
result=$(journalctl -k --since "7 days ago" 2>/dev/null | grep -ci "out of memory") || true
: "${result:=0}"
if ! [[ "$result" =~ ^[0-9]+$ ]]; then
  fail "oom_events" "expected integer, got: '$result'"
else
  pass "oom_events returns integer: $result"
fi
# --- stuck_procs ---
# May be empty — that's valid
# Lists the command name (first word) of processes in D state using at least
# 10% CPU, joined with commas.
result=$(ps -eo stat=,pcpu=,comm= |
  awk '$1 ~ /^D/ && $2+0 >= 10 {print $3}' | paste -sd, || true)
# Kernel worker names contain characters such as ':' and '+'
# (e.g. "kworker/u16:2+flush-8:0"), which a narrow whitelist rejects even
# though the output is well-formed. Require only printable single-line text.
if [[ -z "$result" || "$result" =~ ^[[:print:]]+$ ]]; then
  pass "stuck_procs returns csv or empty: '${result:-<empty>}'"
else
  fail "stuck_procs" "unexpected format: '$result'"
fi
# --- disk_usage format ---
# Takes the first data row of df (header skipped), strips the trailing '%'
# from the percentage and prints "<pct> <mount>".
# The trailing "|| true" matters under errexit + pipefail: df exits non-zero
# when any single mount cannot be read, and head -1 may close the pipe on its
# writers. Either would otherwise abort the whole script here, before the
# summary is printed. A genuinely unusable result is still caught by the
# format check below.
result=$(df --output=pcent,target -x tmpfs -x devtmpfs 2>/dev/null | tail -n +2 | head -1 |
  while read -r pct mnt; do echo "${pct%%%} $mnt"; done || true)
if [[ "$result" =~ ^[0-9]+\ / ]]; then
  pass "disk_usage returns 'pct mount' format: $result"
else
  fail "disk_usage" "expected 'N /path', got: '$result'"
fi
# Summary line, preceded by a blank line.
printf '\n=== Results: %s passed, %s failed ===\n' "$PASS" "$FAIL"
# The last command decides the exit status: success only when nothing failed.
[[ "$FAIL" -eq 0 ]]

View File

@ -3,6 +3,7 @@ services:
tdarr:
image: ghcr.io/haveagitgat/tdarr:latest
container_name: tdarr-server
init: true
restart: unless-stopped
ports:
- "8265:8265" # Web UI
@ -23,6 +24,7 @@ services:
tdarr-node:
image: ghcr.io/haveagitgat/tdarr_node:latest
container_name: tdarr-node
init: true
restart: unless-stopped
environment:
- PUID=1000
@ -37,6 +39,8 @@ services:
- /mnt/NV2/tdarr-cache:/temp
deploy:
resources:
limits:
memory: 28g
reservations:
devices:
- driver: nvidia