feat: add zombie parent, swap, and OOM metrics to audit; harden Tdarr containers
All checks were successful
Auto-merge docs-only PRs / auto-merge-docs (pull_request) Successful in 3s
All checks were successful
Auto-merge docs-only PRs / auto-merge-docs (pull_request) Successful in 3s
Extend the homelab-audit.sh collector with zombie_parents(), swap_mb(), and oom_events() functions so the audit identifies which parent processes are leaving zombies unreaped, flags high swap usage, and reports OOM kills from the last 7 days. Add init: true to both Tdarr docker-compose services so tini reaps orphaned ffmpeg children, and cap tdarr-node at 28g of RAM / 30g of RAM plus swap to prevent unbounded memory use.

Closes #30

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
1ed911e61b
commit
f28dfeb4bf
@ -28,6 +28,7 @@ DISK_CRIT=90
|
|||||||
LOAD_WARN=2.0
|
LOAD_WARN=2.0
|
||||||
MEM_WARN=85
|
MEM_WARN=85
|
||||||
ZOMBIE_WARN=1
|
ZOMBIE_WARN=1
|
||||||
|
SWAP_WARN=512
|
||||||
|
|
||||||
while [[ $# -gt 0 ]]; do
|
while [[ $# -gt 0 ]]; do
|
||||||
case "$1" in
|
case "$1" in
|
||||||
@ -83,10 +84,28 @@ stuck_procs() {
|
|||||||
paste -sd,
|
paste -sd,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
zombie_parents() {
|
||||||
|
ps -eo pid=,ppid=,stat= | awk '\''$3 ~ /^Z/ {print $2}'\'' | sort -u | \
|
||||||
|
xargs -I{} ps -o comm= -p {} 2>/dev/null | paste -sd,
|
||||||
|
}
|
||||||
|
|
||||||
|
# swap_mb: print the swap currently in use as a whole number of MB.
# Reads the third field of the "Swap:" row printed by free and divides it by
# 1024 (assumes that field is the used amount in KiB, the procps default -
# TODO confirm on hosts with a different free implementation).
# Prints nothing when free has no "Swap:" row.
swap_mb() {
|
||||||
|
free | awk '\''/^Swap:/ {printf "%.0f", $3/1024}'\''
|
||||||
|
}
|
||||||
|
|
||||||
|
# oom_events: print how many kernel journal lines from the last 7 days
# contain "out of memory" (matched case-insensitively).
# Prints 0 when nothing matches or when journalctl yields no output.
oom_events() {
|
||||||
|
local count
|
||||||
|
# grep -c still prints 0 but exits non-zero when there is no match, so
# "|| true" keeps a zero count from being treated as a failure.
# Errors from journalctl itself are discarded.
count=$(journalctl -k --since "7 days ago" 2>/dev/null | grep -ci "out of memory") || true
|
||||||
|
# Fall back to 0 if the command substitution produced an empty string.
echo "${count:-0}"
|
||||||
|
}
|
||||||
|
|
||||||
echo "CPU_LOAD=$(cpu_load)"
|
echo "CPU_LOAD=$(cpu_load)"
|
||||||
echo "MEM_PCT=$(mem_pct)"
|
echo "MEM_PCT=$(mem_pct)"
|
||||||
echo "ZOMBIES=$(zombie_count)"
|
echo "ZOMBIES=$(zombie_count)"
|
||||||
echo "STUCK_PROCS=$(stuck_procs)"
|
echo "STUCK_PROCS=$(stuck_procs)"
|
||||||
|
echo "ZOMBIE_PARENTS=$(zombie_parents)"
|
||||||
|
echo "SWAP_MB=$(swap_mb)"
|
||||||
|
echo "OOM_EVENTS=$(oom_events)"
|
||||||
disk_usage | while read -r pct mnt; do
|
disk_usage | while read -r pct mnt; do
|
||||||
echo "DISK $pct $mnt"
|
echo "DISK $pct $mnt"
|
||||||
done
|
done
|
||||||
@ -194,6 +213,24 @@ parse_and_report() {
|
|||||||
echo "WARN $label: D-state procs with CPU>=${STUCK_PROC_CPU_WARN}%: ${procs}" >>"$FINDINGS_FILE"
|
echo "WARN $label: D-state procs with CPU>=${STUCK_PROC_CPU_WARN}%: ${procs}" >>"$FINDINGS_FILE"
|
||||||
fi
|
fi
|
||||||
;;
|
;;
|
||||||
|
ZOMBIE_PARENTS=*)
|
||||||
|
local zparents="${line#ZOMBIE_PARENTS=}"
|
||||||
|
if [[ -n "$zparents" ]]; then
|
||||||
|
echo "INFO $label: zombie parent process(es): ${zparents}" >>"$FINDINGS_FILE"
|
||||||
|
fi
|
||||||
|
;;
|
||||||
|
SWAP_MB=*)
|
||||||
|
local swap="${line#SWAP_MB=}"
|
||||||
|
if [[ -n "$swap" ]] && ((swap >= SWAP_WARN)); then
|
||||||
|
echo "WARN $label: swap usage ${swap} MB >= ${SWAP_WARN} MB" >>"$FINDINGS_FILE"
|
||||||
|
fi
|
||||||
|
;;
|
||||||
|
OOM_EVENTS=*)
|
||||||
|
local ooms="${line#OOM_EVENTS=}"
|
||||||
|
if [[ -n "$ooms" ]] && ((ooms > 0)); then
|
||||||
|
echo "WARN $label: ${ooms} OOM kill event(s) in last 7 days" >>"$FINDINGS_FILE"
|
||||||
|
fi
|
||||||
|
;;
|
||||||
DISK\ *)
|
DISK\ *)
|
||||||
local pct mnt
|
local pct mnt
|
||||||
read -r _ pct mnt <<<"$line"
|
read -r _ pct mnt <<<"$line"
|
||||||
|
|||||||
99
monitoring/scripts/test-audit-collectors.sh
Normal file
99
monitoring/scripts/test-audit-collectors.sh
Normal file
@ -0,0 +1,99 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
# test-audit-collectors.sh — validates homelab-audit.sh collector output format
|
||||||
|
#
|
||||||
|
# Re-runs the pipeline behind each collector function in the audit script's
|
||||||
|
# COLLECTOR_SCRIPT on the local host and checks that the output has the
|
||||||
|
# expected format. Runs every check, then exits non-zero if any check failed.
|
||||||
|
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
PASS=0
|
||||||
|
FAIL=0
|
||||||
|
|
||||||
|
# pass DESCRIPTION: count one passing check and print a PASS line for it.
pass() {
|
||||||
|
# Post-increment evaluates to the old value, so this arithmetic command
# returns status 1 while PASS is still 0; "|| true" keeps set -e from
# aborting the script on the first call.
((PASS++)) || true
|
||||||
|
echo " PASS: $1"
|
||||||
|
}
|
||||||
|
# fail NAME DETAIL: count one failing check and print a FAIL line holding
# both arguments. It does not exit; the script keeps running later checks.
fail() {
|
||||||
|
# Same guard as any post-increment from 0 under set -e: the arithmetic
# command returns status 1 when FAIL was 0, so "|| true" is required.
((FAIL++)) || true
|
||||||
|
echo " FAIL: $1 — $2"
|
||||||
|
}
|
||||||
|
|
||||||
|
echo "=== Collector output format tests ==="

# Each check re-runs one collector pipeline on the local host and validates
# only the shape of the result, so it should behave the same on any Linux host.

# Pattern for a comma-separated list of process names. Names reported by ps
# are free-form (for example "kworker/u16:3+flush-8:0" or "tmux: server"), so
# a field may hold any non-control character except the comma separator;
# only empty fields are rejected.
NAME_LIST_RE='^[^,[:cntrl:]]+(,[^,[:cntrl:]]+)*$'

# --- cpu_load ---
# First value after "load average:" in the uptime output.
result=$(uptime | awk -F'load average:' '{print $2}' | awk -F'[, ]+' '{print $2}')
if [[ "$result" =~ ^[0-9]+\.?[0-9]*$ ]]; then
  pass "cpu_load returns numeric value: $result"
else
  fail "cpu_load" "expected numeric, got: '$result'"
fi

# --- mem_pct ---
# Third field over second field of the "Mem:" row, as a rounded percentage.
result=$(free | awk '/^Mem:/ {printf "%.0f", $3/$2*100}')
if [[ "$result" =~ ^[0-9]+$ ]] && ((result >= 0 && result <= 100)); then
  pass "mem_pct returns percentage: $result"
else
  fail "mem_pct" "expected 0-100, got: '$result'"
fi

# --- zombie_count ---
# grep -c exits non-zero on a zero count; "|| true" keeps pipefail/set -e quiet.
result=$(ps -eo stat= | grep -c "^Z" || true)
if [[ "$result" =~ ^[0-9]+$ ]]; then
  pass "zombie_count returns integer: $result"
else
  fail "zombie_count" "expected integer, got: '$result'"
fi

# --- zombie_parents ---
# May be empty if no zombies — that's valid
result=$(ps -eo pid=,ppid=,stat= | awk '$3 ~ /^Z/ {print $2}' | sort -u |
  xargs -I{} ps -o comm= -p {} 2>/dev/null | paste -sd, || true)
if [[ -z "$result" || "$result" =~ $NAME_LIST_RE ]]; then
  pass "zombie_parents returns csv or empty: '${result:-<empty>}'"
else
  fail "zombie_parents" "unexpected format: '$result'"
fi

# --- swap_mb ---
result=$(free | awk '/^Swap:/ {printf "%.0f", $3/1024}')
if [[ "$result" =~ ^[0-9]+$ ]]; then
  pass "swap_mb returns integer MB: $result"
else
  fail "swap_mb" "expected integer, got: '$result'"
fi

# --- oom_events ---
# A missing journalctl or a zero match count must not abort the run, and an
# empty result is normalised to 0 before it is validated.
result=$(journalctl -k --since "7 days ago" 2>/dev/null | grep -ci "out of memory") || true
result="${result:-0}"
if [[ "$result" =~ ^[0-9]+$ ]]; then
  pass "oom_events returns integer: $result"
else
  fail "oom_events" "expected integer, got: '$result'"
fi

# --- stuck_procs ---
# May be empty — that's valid
# Kernel worker threads are common in D state and carry ':' and '+' in their
# names, which is why the relaxed name-list pattern is used here as well.
result=$(ps -eo stat=,pcpu=,comm= |
  awk '$1 ~ /^D/ && $2+0 >= 10 {print $3}' | paste -sd, || true)
if [[ -z "$result" || "$result" =~ $NAME_LIST_RE ]]; then
  pass "stuck_procs returns csv or empty: '${result:-<empty>}'"
else
  fail "stuck_procs" "unexpected format: '$result'"
fi

# --- disk_usage format ---
# Only the first data row (line 2 of the df output) is checked.
# sed -n '2p' consumes its whole input, so the upstream command never writes
# to a closed pipe. With "tail -n +2 | head -1" a long mount list could kill
# the producer with SIGPIPE, which pipefail and set -e turn into an abort of
# the whole test run.
result=$(df --output=pcent,target -x tmpfs -x devtmpfs 2>/dev/null | sed -n '2p' |
  while read -r pct mnt; do echo "${pct%%%} $mnt"; done)
if [[ "$result" =~ ^[0-9]+\ / ]]; then
  pass "disk_usage returns 'pct mount' format: $result"
else
  fail "disk_usage" "expected 'N /path', got: '$result'"
fi

echo ""
echo "=== Results: $PASS passed, $FAIL failed ==="
# The status of this last command becomes the exit status of the script:
# 0 when no check failed, 1 otherwise.
((FAIL == 0))
|
||||||
@ -3,6 +3,7 @@ services:
|
|||||||
tdarr:
|
tdarr:
|
||||||
image: ghcr.io/haveagitgat/tdarr:latest
|
image: ghcr.io/haveagitgat/tdarr:latest
|
||||||
container_name: tdarr-server
|
container_name: tdarr-server
|
||||||
|
init: true
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
ports:
|
ports:
|
||||||
- "8265:8265" # Web UI
|
- "8265:8265" # Web UI
|
||||||
@ -23,7 +24,10 @@ services:
|
|||||||
tdarr-node:
|
tdarr-node:
|
||||||
image: ghcr.io/haveagitgat/tdarr_node:latest
|
image: ghcr.io/haveagitgat/tdarr_node:latest
|
||||||
container_name: tdarr-node
|
container_name: tdarr-node
|
||||||
|
init: true
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
|
mem_limit: 28g
|
||||||
|
memswap_limit: 30g
|
||||||
environment:
|
environment:
|
||||||
- PUID=1000
|
- PUID=1000
|
||||||
- PGID=1000
|
- PGID=1000
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user