feat: zombie parent, swap, and OOM metrics + Tdarr hardening #35
@ -28,6 +28,7 @@ DISK_CRIT=90
|
||||
LOAD_WARN=2.0
|
||||
MEM_WARN=85
|
||||
ZOMBIE_WARN=1
|
||||
SWAP_WARN=512
|
||||
|
||||
while [[ $# -gt 0 ]]; do
|
||||
case "$1" in
|
||||
@ -83,10 +84,28 @@ stuck_procs() {
|
||||
paste -sd,
|
||||
}
|
||||
|
||||
zombie_parents() {
|
||||
ps -eo pid=,ppid=,stat= | awk '\''$3 ~ /^Z/ {print $2}'\'' | sort -u | \
|
||||
xargs -I{} ps -o comm= -p {} 2>/dev/null | paste -sd,
|
||||
}
|
||||
|
||||
swap_mb() {
|
||||
free | awk '\''/^Swap:/ {printf "%.0f", $3/1024; found=1} END {if (!found) print "0"}'\''
|
||||
}
|
||||
|
||||
oom_events() {
|
||||
local count
|
||||
count=$(journalctl -k --since "7 days ago" 2>/dev/null | grep -ci "out of memory") || true
|
||||
echo "${count:-0}"
|
||||
}
|
||||
|
||||
echo "CPU_LOAD=$(cpu_load)"
|
||||
echo "MEM_PCT=$(mem_pct)"
|
||||
echo "ZOMBIES=$(zombie_count)"
|
||||
echo "STUCK_PROCS=$(stuck_procs)"
|
||||
echo "ZOMBIE_PARENTS=$(zombie_parents)"
|
||||
echo "SWAP_MB=$(swap_mb)"
|
||||
echo "OOM_EVENTS=$(oom_events)"
|
||||
disk_usage | while read -r pct mnt; do
|
||||
echo "DISK $pct $mnt"
|
||||
done
|
||||
@ -194,6 +213,24 @@ parse_and_report() {
|
||||
echo "WARN $label: D-state procs with CPU>=${STUCK_PROC_CPU_WARN}%: ${procs}" >>"$FINDINGS_FILE"
|
||||
fi
|
||||
;;
|
||||
ZOMBIE_PARENTS=*)
|
||||
local zparents="${line#ZOMBIE_PARENTS=}"
|
||||
if [[ -n "$zparents" ]]; then
|
||||
echo "INFO $label: zombie parent process(es): ${zparents}" >>"$FINDINGS_FILE"
|
||||
fi
|
||||
;;
|
||||
SWAP_MB=*)
|
||||
local swap="${line#SWAP_MB=}"
|
||||
if [[ -n "$swap" ]] && ((swap >= SWAP_WARN)); then
|
||||
echo "WARN $label: swap usage ${swap} MB >= ${SWAP_WARN} MB" >>"$FINDINGS_FILE"
|
||||
fi
|
||||
;;
|
||||
OOM_EVENTS=*)
|
||||
local ooms="${line#OOM_EVENTS=}"
|
||||
if [[ -n "$ooms" ]] && ((ooms > 0)); then
|
||||
echo "WARN $label: ${ooms} OOM kill event(s) in last 7 days" >>"$FINDINGS_FILE"
|
||||
fi
|
||||
;;
|
||||
DISK\ *)
|
||||
local pct mnt
|
||||
read -r _ pct mnt <<<"$line"
|
||||
|
||||
98
monitoring/scripts/test-audit-collectors.sh
Normal file
98
monitoring/scripts/test-audit-collectors.sh
Normal file
@ -0,0 +1,98 @@
|
||||
#!/usr/bin/env bash
# test-audit-collectors.sh — validates homelab-audit.sh collector output format
#
# Re-implements each collector function inline and runs it locally, checking
# that output matches the expected format. Exits non-zero on any failure.

# Strict mode: abort on unhandled errors, unset variables, and failures in
# any stage of a pipeline.
set -euo pipefail

# Running tallies of checks, incremented by pass() and fail() and printed in
# the final results line.
PASS=0
FAIL=0
|
||||
|
||||
#######################################
# Record one passing check and print it.
# Globals:   PASS (incremented)
# Arguments: $1 - description of the check that passed
# Outputs:   " PASS: <description>" on stdout
#######################################
pass() {
  # Plain assignment instead of ((PASS++)): the arithmetic command returns
  # non-zero when the old value is 0, which would need a guard under set -e.
  PASS=$((PASS + 1))
  printf ' PASS: %s\n' "$1"
}
|
||||
#######################################
# Record one failing check and print it.
# Globals:   FAIL (incremented)
# Arguments: $1 - name of the check that failed
#            $2 - detail describing what was expected or received
# Outputs:   " FAIL: <name> — <detail>" on stdout
#######################################
fail() {
  # Plain assignment instead of ((FAIL++)): the arithmetic command returns
  # non-zero when the old value is 0, which would need a guard under set -e.
  FAIL=$((FAIL + 1))
  printf ' FAIL: %s — %s\n' "$1" "$2"
}
|
||||
|
||||
echo "=== Collector output format tests ==="

# Run each collector function locally and validate output format
# These functions are designed to work on any Linux host

# --- cpu_load ---
# Keep the text after "load average:" and print its first field.
# `|| true` keeps a failing pipeline from aborting the script under
# `set -e`/pipefail; the format check below reports the failure instead.
result=$(uptime | awk -F'load average:' '{print $2}' | awk -F'[, ]+' '{print $2}') || true
if [[ "$result" =~ ^[0-9]+\.?[0-9]*$ ]]; then
  pass "cpu_load returns numeric value: $result"
else
  fail "cpu_load" "expected numeric, got: '$result'"
fi
|
||||
|
||||
# --- mem_pct ---
# Used memory as a whole-number percentage of total, from the "Mem:" row of free.
# `|| true` keeps a failing pipeline from aborting the script under
# `set -e`/pipefail; the range check below reports the failure instead.
result=$(free | awk '/^Mem:/ {printf "%.0f", $3/$2*100}') || true
# The regex guard runs first so the arithmetic test only ever sees digits.
if [[ "$result" =~ ^[0-9]+$ ]] && ((result >= 0 && result <= 100)); then
  pass "mem_pct returns percentage: $result"
else
  fail "mem_pct" "expected 0-100, got: '$result'"
fi
|
||||
|
||||
# --- zombie_count ---
# Count processes whose state field starts with Z.
# grep -c prints "0" but exits non-zero when nothing matches, hence `|| true`.
result=$(ps -eo stat= | grep -c "^Z" || true)
if [[ "$result" =~ ^[0-9]+$ ]]; then
  pass "zombie_count returns integer: $result"
else
  fail "zombie_count" "expected integer, got: '$result'"
fi
|
||||
|
||||
# --- zombie_parents ---
# May be empty if no zombies — that's valid
# Collect the parent PIDs of Z-state processes, resolve each to its command
# name, and join the names with commas.
result=$(ps -eo pid=,ppid=,stat= | awk '$3 ~ /^Z/ {print $2}' | sort -u |
  xargs -I{} ps -o comm= -p {} 2>/dev/null | paste -sd, || true)
# Validate the CSV structure rather than the characters of each name: command
# names may contain spaces, colons, parentheses and similar, so a fixed
# character class produces false failures. Accept an empty result or a single
# line with no empty fields.
if [[ "$result" != *$'\n'* && "$result" != ,* && "$result" != *, && "$result" != *,,* ]]; then
  pass "zombie_parents returns csv or empty: '${result:-<empty>}'"
else
  fail "zombie_parents" "unexpected format: '$result'"
fi
|
||||
|
||||
# --- swap_mb ---
# Used swap from the "Swap:" row of free, divided by 1024 and rounded.
# Same awk program as the swap_mb collector in the audit script, including
# its fallback of printing "0" when free shows no Swap row, so this check
# exercises the format the collector really emits.
result=$(free | awk '/^Swap:/ {printf "%.0f", $3/1024; found=1} END {if (!found) print "0"}') || true
if [[ "$result" =~ ^[0-9]+$ ]]; then
  pass "swap_mb returns integer MB: $result"
else
  fail "swap_mb" "expected integer, got: '$result'"
fi
|
||||
|
||||
# --- oom_events ---
# Count kernel journal lines from the last 7 days that mention
# "out of memory" (case-insensitive).
# grep -c exits non-zero on zero matches, and journalctl may be unavailable;
# `|| true` tolerates both, and the default below covers an empty result.
result=$(journalctl -k --since "7 days ago" 2>/dev/null | grep -ci "out of memory") || true
result="${result:-0}"
if [[ "$result" =~ ^[0-9]+$ ]]; then
  pass "oom_events returns integer: $result"
else
  fail "oom_events" "expected integer, got: '$result'"
fi
|
||||
|
||||
# --- stuck_procs ---
# May be empty — that's valid
# Names of D-state processes whose CPU usage is at least 10, comma-joined.
result=$(ps -eo stat=,pcpu=,comm= |
  awk '$1 ~ /^D/ && $2+0 >= 10 {print $3}' | paste -sd, || true)
# Validate the CSV structure rather than the characters of each name: command
# names (kernel worker threads in particular) can contain ':' and '+', so a
# fixed character class produces false failures. Accept an empty result or a
# single line with no empty fields.
if [[ "$result" != *$'\n'* && "$result" != ,* && "$result" != *, && "$result" != *,,* ]]; then
  pass "stuck_procs returns csv or empty: '${result:-<empty>}'"
else
  fail "stuck_procs" "unexpected format: '$result'"
fi
|
||||
|
||||
# --- disk_usage format ---
# Take the first data row of df (line 2, after the header) and print it as
# "<pct> <mount>" with the trailing % removed.
# sed -n '2p' reads df's output to the end, unlike `tail | head -1`, where
# head exits early and can leave an upstream stage killed by SIGPIPE, which
# pipefail turns into a failure. `|| true` lets a df failure be reported by
# the check below instead of aborting the script under `set -e`.
result=$(df --output=pcent,target -x tmpfs -x devtmpfs 2>/dev/null | sed -n '2p' |
  while read -r pct mnt; do echo "${pct%%%} $mnt"; done) || true
if [[ "$result" =~ ^[0-9]+\ / ]]; then
  pass "disk_usage returns 'pct mount' format: $result"
else
  fail "disk_usage" "expected 'N /path', got: '$result'"
fi
|
||||
|
||||
echo ""
echo "=== Results: $PASS passed, $FAIL failed ==="
# Last command of the script, so its status is the script's exit status:
# zero only when no check failed.
((FAIL == 0))
|
||||
@ -3,6 +3,7 @@ services:
|
||||
tdarr:
|
||||
image: ghcr.io/haveagitgat/tdarr:latest
|
||||
container_name: tdarr-server
|
||||
init: true
|
||||
restart: unless-stopped
|
||||
ports:
|
||||
- "8265:8265" # Web UI
|
||||
@ -23,6 +24,7 @@ services:
|
||||
tdarr-node:
|
||||
image: ghcr.io/haveagitgat/tdarr_node:latest
|
||||
container_name: tdarr-node
|
||||
init: true
|
||||
restart: unless-stopped
|
||||
environment:
|
||||
- PUID=1000
|
||||
@ -37,6 +39,8 @@ services:
|
||||
- /mnt/NV2/tdarr-cache:/temp
|
||||
deploy:
|
||||
resources:
|
||||
limits:
|
||||
memory: 28g
|
||||
reservations:
|
||||
devices:
|
||||
- driver: nvidia
|
||||
|
||||
Loading…
Reference in New Issue
Block a user