fix: add homelab-audit.sh with variable interpolation and collector fixes (#23)
All checks were successful
Auto-merge docs-only PRs / auto-merge-docs (pull_request) Successful in 3s
All checks were successful
Auto-merge docs-only PRs / auto-merge-docs (pull_request) Successful in 3s
Closes #23
- Fix STUCK_PROC_CPU_WARN not reaching the remote collector: COLLECTOR_SCRIPT stays a single-quoted string; the threshold is passed as $1 to the remote bash session, so it is evaluated correctly on the collecting host
- Fix LXC IP discovery for static-IP containers: the lxc-info lookup now falls back to parsing pct config when lxc-info returns nothing
- Fix SSH failures being silently dropped: stderr is redirected to $REPORT_DIR/ssh-failures.log; SSH_FAILURE entries are counted and printed in the summary
- Add an explicit comment explaining why -e is omitted from the set options
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
5ee48af40f
commit
312434aa95
268
monitoring/scripts/homelab-audit.sh
Executable file
268
monitoring/scripts/homelab-audit.sh
Executable file
@ -0,0 +1,268 @@
|
||||
#!/usr/bin/env bash
|
||||
# homelab-audit.sh — SSH-based homelab health audit
|
||||
#
|
||||
# Runs on the Proxmox host. Discovers running LXCs and VMs, SSHes into each
|
||||
# to collect system metrics, then generates a summary report.
|
||||
#
|
||||
# Usage:
|
||||
# homelab-audit.sh [--output-dir DIR]
|
||||
#
|
||||
# Environment overrides:
|
||||
# STUCK_PROC_CPU_WARN CPU% at which a D-state process is flagged (default: 10)
|
||||
# REPORT_DIR Output directory for per-host reports and logs
|
||||
# SSH_USER Remote user (default: root)
|
||||
|
||||
# -e omitted intentionally — unreachable hosts should not abort the full audit
|
||||
set -uo pipefail

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# Environment-overridable settings; each falls back to its default when the
# variable is unset or empty.
STUCK_PROC_CPU_WARN="${STUCK_PROC_CPU_WARN:-10}"
REPORT_DIR="${REPORT_DIR:-/tmp/homelab-audit-$(date +%Y%m%d-%H%M%S)}"
SSH_USER="${SSH_USER:-root}"
# Kept as one string; ssh_cmd expands it unquoted so that each option becomes
# a separate ssh argument.
SSH_OPTS="-o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o BatchMode=yes"

# Alert thresholds applied to the collected metrics.
DISK_WARN=80  # disk usage percent that raises a WARN finding
DISK_CRIT=90  # disk usage percent that raises a CRIT finding
LOAD_WARN=2.0 # load average above which a WARN finding is raised
MEM_WARN=85   # memory usage percent that raises a WARN finding

# Parse command-line options.
#   --output-dir DIR   write reports and logs to DIR (overrides REPORT_DIR)
# Exits with status 1 on an unknown option or when --output-dir has no value.
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --output-dir)
        # Check explicitly: under `set -u` a missing value would otherwise
        # abort with an opaque "unbound variable" error.
        if [[ $# -lt 2 || -z "$2" ]]; then
          echo "Option --output-dir requires a directory argument" >&2
          exit 1
        fi
        REPORT_DIR="$2"
        shift 2
        ;;
      *)
        echo "Unknown option: $1" >&2
        exit 1
        ;;
    esac
  done
}

parse_args "$@"

# Every later step writes below REPORT_DIR, so failing to create it is fatal
# (unlike an unreachable host, which must not abort the audit).
if ! mkdir -p -- "$REPORT_DIR"; then
  echo "Cannot create report directory: $REPORT_DIR" >&2
  exit 1
fi
SSH_FAILURES_LOG="$REPORT_DIR/ssh-failures.log"
FINDINGS_FILE="$REPORT_DIR/findings.txt"
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Remote collector script
|
||||
#
|
||||
# Kept single-quoted so no local variables are interpolated into the script.
|
||||
# STUCK_PROC_CPU_WARN is passed as $1 when invoking the remote bash session,
|
||||
# so the configurable threshold reaches the collector without escaping issues.
|
||||
# ---------------------------------------------------------------------------
|
||||
COLLECTOR_SCRIPT='#!/usr/bin/env bash
# Metrics collector. It is fed to "bash -s -- THRESHOLD" on the audited host
# and prints one record per line:
#   CPU_LOAD=<load>  MEM_PCT=<percent>  ZOMBIES=<count>  STUCK_PROCS=<a,b,...>
#   DISK <percent> <mountpoint>      (one line per filesystem)
#
# NOTE: this script is stored inside a single-quoted shell string, so it must
# never contain a single quote. awk programs are therefore double-quoted, and
# every awk field reference has to be written as \$N so that bash does not
# expand it as a positional parameter before awk sees it.

# CPU% at or above which a D-state process is reported (first argument).
STUCK_PROC_CPU_WARN="${1:-10}"

# First value after "load average:" in the uptime output.
cpu_load() {
  uptime | awk -F"load average:" "{print \$2}" | awk -F"[, ]+" "{print \$2}"
}

# Third column over second column of the "Mem:" row of free, as a whole
# percentage.
mem_pct() {
  free | awk "/^Mem:/ {printf \"%.0f\", \$3/\$2*100}"
}

# One "<percent> <mountpoint>" line per filesystem, with the trailing % sign
# removed; tmpfs and devtmpfs are excluded.
disk_usage() {
  df --output=pcent,target -x tmpfs -x devtmpfs 2>/dev/null | tail -n +2 | \
    while read -r pct mnt; do echo "${pct%%%} $mnt"; done
}

# Number of processes whose state starts with Z. grep -c exits non-zero when
# the count is 0, hence the "|| true".
zombie_count() {
  ps -eo stat= | grep -c "^Z" || true
}

# Comma-separated command names of processes whose state starts with D and
# whose CPU% is at least STUCK_PROC_CPU_WARN.
stuck_procs() {
  ps -eo stat=,pcpu=,comm= | \
    awk -v t="$STUCK_PROC_CPU_WARN" "\$1 ~ /^D/ && \$2+0 >= t+0 {print \$3}" | \
    paste -sd, -
}

echo "CPU_LOAD=$(cpu_load)"
echo "MEM_PCT=$(mem_pct)"
echo "ZOMBIES=$(zombie_count)"
echo "STUCK_PROCS=$(stuck_procs)"
disk_usage | while read -r pct mnt; do
  echo "DISK $pct $mnt"
done
'
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# SSH helper — logs stderr to ssh-failures.log instead of silently discarding
|
||||
# ---------------------------------------------------------------------------
|
||||
# Run a command on a remote host over SSH as SSH_USER.
#   $1     host name or address
#   $2...  command and arguments handed to ssh
# stdin and stdout pass through untouched; anything ssh writes to stderr is
# appended to SSH_FAILURES_LOG. The exit status is that of ssh.
ssh_cmd() {
  local target="${SSH_USER}@$1"
  shift
  # SSH_OPTS is deliberately unquoted so it splits into individual options.
  # shellcheck disable=SC2086
  { ssh $SSH_OPTS "$target" "$@"; } 2>>"$SSH_FAILURES_LOG"
}
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# LXC IP discovery
|
||||
#
|
||||
# lxc-info only returns IPs for containers using Proxmox-managed DHCP bridges.
|
||||
# Containers with static IPs defined inside the container (not via Proxmox
|
||||
# network config) return nothing. Fall back to parsing `pct config` in that
|
||||
# case to find the ip= field from the container's network interface config.
|
||||
# ---------------------------------------------------------------------------
|
||||
# Print the address to use for reaching a running container.
#   $1  container ID
# Uses the first address reported by lxc-info; if that is empty, falls back
# to the first literal IPv4 address found in an ip= field of `pct config`.
# Prints an empty line when no address can be determined.
get_lxc_ip() {
  local ctid="$1"
  local ip line
  # Accept only a dotted-quad value for ip=. Non-address values such as
  # ip=dhcp or ip=manual must not be returned, because the caller would then
  # try to SSH to a host literally named "dhcp". The leading alternation keeps
  # keys that merely end in "ip=" from matching.
  local -r ipv4_field='(^|[[:space:],])ip=([0-9]{1,3}(\.[0-9]{1,3}){3})([/,]|$)'

  ip=$(lxc-info -n "$ctid" -iH 2>/dev/null | head -n 1)
  if [[ -z "$ip" ]]; then
    while IFS= read -r line; do
      if [[ "$line" =~ $ipv4_field ]]; then
        ip="${BASH_REMATCH[2]}"
        break
      fi
    done < <(pct config "$ctid" 2>/dev/null)
  fi
  echo "$ip"
}
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Inventory: running LXCs and VMs
|
||||
# Returns lines of "label ip"
|
||||
# ---------------------------------------------------------------------------
|
||||
# Print one "label ip" line per running guest that has a discoverable address.
# Containers are labelled lxc-<id>, VMs vm-<id>. Guests that are not running,
# or for which no address is found, produce no output.
collect_inventory() {
  local ip

  # Python filter for the JSON printed by `qm guest cmd ... network-get-interfaces`:
  # prints the first IPv4 address that does not start with 127. and stops.
  # Any parsing error is swallowed, leaving the output empty.
  local first_ipv4="
import sys, json
try:
    data = json.load(sys.stdin)
    for iface in data:
        for addr in iface.get('ip-addresses', []):
            if addr['ip-address-type'] == 'ipv4' and not addr['ip-address'].startswith('127.'):
                print(addr['ip-address'])
                raise SystemExit
except Exception:
    pass
"

  # LXCs: the first line of `pct list` is the column header.
  pct list 2>/dev/null | tail -n +2 | while read -r id state _rest; do
    if [[ "$state" == "running" ]]; then
      ip=$(get_lxc_ip "$id")
      [[ -n "$ip" ]] && echo "lxc-${id} $ip"
    fi
  done

  # VMs: the address comes from the guest agent only; a VM whose agent query
  # yields nothing is left out of the inventory.
  qm list 2>/dev/null | tail -n +2 | while read -r id _name state _mem _bootdisk _pid; do
    if [[ "$state" == "running" ]]; then
      ip=$(qm guest cmd "$id" network-get-interfaces 2>/dev/null |
        python3 -c "$first_ipv4" 2>/dev/null)
      [[ -n "$ip" ]] && echo "vm-${id} $ip"
    fi
  done
}
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Collect metrics from one host and record findings
|
||||
# ---------------------------------------------------------------------------
|
||||
# Collect metrics from one host and append threshold violations to
# FINDINGS_FILE.
#   $1  label used in findings and logs
#   $2  address passed to ssh_cmd
# COLLECTOR_SCRIPT is piped to a remote `bash -s`, with STUCK_PROC_CPU_WARN as
# its first argument. If that command fails, an SSH_FAILURE line is appended
# to SSH_FAILURES_LOG, a WARN finding is recorded, and the host is skipped.
parse_and_report() {
  local label="$1"
  local addr="$2"
  local raw line load mem zombies procs pct mnt
  # Metric values originate on the remote host, so they are validated before
  # use. Inside (( )) bash evaluates a variable's content as an expression
  # (including command substitutions in array subscripts), and a non-numeric
  # value such as "-" would be a syntax error. Values that fail validation are
  # ignored.
  local -r uint_re='^[0-9]+$'
  local -r load_re='^[0-9]+([.][0-9]+)?$'

  if ! raw=$(printf '%s\n' "$COLLECTOR_SCRIPT" |
    ssh_cmd "$addr" bash -s -- "$STUCK_PROC_CPU_WARN"); then
    echo "SSH_FAILURE $label $addr" >>"$SSH_FAILURES_LOG"
    echo "WARN $label: SSH connection failed" >>"$FINDINGS_FILE"
    return
  fi

  while IFS= read -r line; do
    case "$line" in
      CPU_LOAD=*)
        load="${line#CPU_LOAD=}"
        # Floating-point comparison is delegated to awk; the values are passed
        # with -v rather than spliced into the program text.
        if [[ "$load" =~ $load_re ]] &&
          awk -v load="$load" -v warn="$LOAD_WARN" \
            'BEGIN { exit !(load + 0 > warn + 0) }'; then
          echo "WARN $label: load average ${load} > ${LOAD_WARN}" >>"$FINDINGS_FILE"
        fi
        ;;
      MEM_PCT=*)
        mem="${line#MEM_PCT=}"
        # 10# forces base 10 so a leading zero is not read as octal.
        if [[ "$mem" =~ $uint_re ]] && ((10#$mem >= MEM_WARN)); then
          echo "WARN $label: memory ${mem}% >= ${MEM_WARN}%" >>"$FINDINGS_FILE"
        fi
        ;;
      ZOMBIES=*)
        zombies="${line#ZOMBIES=}"
        if [[ "$zombies" =~ $uint_re ]] && ((10#$zombies > 0)); then
          echo "WARN $label: ${zombies} zombie process(es)" >>"$FINDINGS_FILE"
        fi
        ;;
      STUCK_PROCS=*)
        procs="${line#STUCK_PROCS=}"
        if [[ -n "$procs" ]]; then
          echo "WARN $label: D-state procs with CPU>=${STUCK_PROC_CPU_WARN}%: ${procs}" >>"$FINDINGS_FILE"
        fi
        ;;
      DISK\ *)
        # Record format: "DISK <percent> <mountpoint>"; the mount point may
        # contain spaces and takes the rest of the line.
        read -r _ pct mnt <<<"$line"
        if [[ ! "$pct" =~ $uint_re ]]; then
          continue
        fi
        if ((10#$pct >= DISK_CRIT)); then
          echo "CRIT $label: disk ${mnt} at ${pct}% >= ${DISK_CRIT}%" >>"$FINDINGS_FILE"
        elif ((10#$pct >= DISK_WARN)); then
          echo "WARN $label: disk ${mnt} at ${pct}% >= ${DISK_WARN}%" >>"$FINDINGS_FILE"
        fi
        ;;
    esac
  done <<<"$raw"
}
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Summary — driven by actual findings in findings.txt and ssh-failures.log
|
||||
# ---------------------------------------------------------------------------
|
||||
# Print the end-of-run summary to stdout.
#   $1  number of hosts that were audited
# Counts come from the files themselves: lines starting with SSH_FAILURE in
# SSH_FAILURES_LOG, and lines starting with WARN / CRIT in FINDINGS_FILE. A
# missing file counts as zero. The sorted findings and the list of failed
# hosts are printed only when there is something to show.
generate_summary() {
  local host_count="$1"
  local ssh_failure_count=0 warn_count=0 crit_count=0
  local rule="=============================="

  # grep -c exits non-zero when it counts 0 matches, hence the "|| true".
  if [[ -f "$SSH_FAILURES_LOG" ]]; then
    ssh_failure_count=$(grep -c '^SSH_FAILURE' "$SSH_FAILURES_LOG" 2>/dev/null || true)
  fi
  if [[ -f "$FINDINGS_FILE" ]]; then
    warn_count=$(grep -c '^WARN' "$FINDINGS_FILE" 2>/dev/null || true)
    crit_count=$(grep -c '^CRIT' "$FINDINGS_FILE" 2>/dev/null || true)
  fi

  printf '\n%s\n%s\n%s\n' "$rule" " HOMELAB AUDIT SUMMARY" "$rule"
  printf " Hosts audited : %d\n" "$host_count"
  printf " SSH failures : %d\n" "$ssh_failure_count"
  printf " Warnings : %d\n" "$warn_count"
  printf " Critical : %d\n" "$crit_count"
  printf '%s\n' "$rule"

  if ((warn_count + crit_count > 0)); then
    printf '\nFindings:\n'
    sort "$FINDINGS_FILE"
  fi

  if ((ssh_failure_count > 0)); then
    printf '\nSSH failures (see %s for details):\n' "$SSH_FAILURES_LOG"
    # Fields of an SSH_FAILURE line: marker, label, address.
    awk '/^SSH_FAILURE/ {print " " $2 " (" $3 ")"}' "$SSH_FAILURES_LOG"
  fi

  printf '\nReports: %s\n' "$REPORT_DIR"
}
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Main
|
||||
# ---------------------------------------------------------------------------
|
||||
# Entry point: audit every host listed by collect_inventory, then print the
# summary. Arguments are ignored here; command-line options are handled
# before main is called.
main() {
  local host_count=0
  local label addr

  echo "Starting homelab audit — $(date)"
  echo "Report dir: $REPORT_DIR"
  echo "STUCK_PROC_CPU_WARN threshold: ${STUCK_PROC_CPU_WARN}%"
  echo ""

  # Start with both files empty. The summary counts lines in them, so when an
  # output directory is reused, entries left by an earlier run (findings or
  # SSH_FAILURE records) would otherwise be reported as if they were current.
  : >"$FINDINGS_FILE"
  : >"$SSH_FAILURES_LOG"

  # Process substitution keeps the loop in the current shell so host_count
  # is still set after the loop ends.
  while read -r label addr; do
    echo " Auditing $label ($addr)..."
    parse_and_report "$label" "$addr"
    host_count=$((host_count + 1))
  done < <(collect_inventory)

  generate_summary "$host_count"
}
|
||||
|
||||
main "$@"
|
||||
Loading…
Reference in New Issue
Block a user