#!/usr/bin/env bash
# homelab-audit.sh — SSH-based homelab health audit
#
# Runs on the Proxmox host. Discovers running LXCs and VMs, SSHes into each
# to collect system metrics, then generates a summary report.
#
# Usage:
#   homelab-audit.sh [--output-dir DIR]
#   homelab-audit.sh --help
#
# Environment overrides:
#   STUCK_PROC_CPU_WARN  CPU% at which a D-state process is flagged (default: 10)
#   REPORT_DIR           Output directory for per-host reports and logs
#                        (default: a fresh mktemp directory under $TMPDIR or /tmp)
#   SSH_USER             Remote user (default: root)

# -e omitted intentionally — unreachable hosts should not abort the full audit
set -uo pipefail

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
STUCK_PROC_CPU_WARN="${STUCK_PROC_CPU_WARN:-10}"
# Left empty here on purpose: the default directory is only created after
# argument parsing, so --output-dir does not leave an unused temp dir behind.
REPORT_DIR="${REPORT_DIR:-}"
SSH_USER="${SSH_USER:-root}"
# An array keeps every option a separate word without relying on word splitting.
SSH_OPTS=(
  -o StrictHostKeyChecking=accept-new
  -o ConnectTimeout=10
  -o BatchMode=yes
)
DISK_WARN=80
DISK_CRIT=90
LOAD_WARN=2.0
MEM_WARN=85

# Print the command-line synopsis to stdout.
usage() {
  printf 'Usage: %s [--output-dir DIR]\n' "${0##*/}"
}

while [[ $# -gt 0 ]]; do
  case "$1" in
    --output-dir)
      # Under `set -u` a bare "$2" would die with "unbound variable"; give
      # the user a proper message instead.
      if [[ $# -lt 2 || -z "$2" ]]; then
        echo "Option --output-dir requires a directory argument" >&2
        exit 1
      fi
      REPORT_DIR="$2"
      shift 2
      ;;
    --output-dir=*)
      REPORT_DIR="${1#--output-dir=}"
      shift
      ;;
    -h | --help)
      usage
      exit 0
      ;;
    *)
      echo "Unknown option: $1" >&2
      exit 1
      ;;
  esac
done

# The threshold ends up on a remote command line, so insist on a plain number.
if ! [[ "$STUCK_PROC_CPU_WARN" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
  echo "STUCK_PROC_CPU_WARN must be a non-negative number: $STUCK_PROC_CPU_WARN" >&2
  exit 1
fi

if [[ -z "$REPORT_DIR" ]]; then
  # mktemp gives an unpredictable, private directory instead of a guessable
  # /tmp/homelab-audit-<timestamp> path.
  REPORT_DIR=$(mktemp -d "${TMPDIR:-/tmp}/homelab-audit-$(date +%Y%m%d-%H%M%S)-XXXXXX") || {
    echo "Cannot create a report directory under ${TMPDIR:-/tmp}" >&2
    exit 1
  }
elif ! mkdir -p -- "$REPORT_DIR"; then
  echo "Cannot create report directory: $REPORT_DIR" >&2
  exit 1
fi

SSH_FAILURES_LOG="$REPORT_DIR/ssh-failures.log"
FINDINGS_FILE="$REPORT_DIR/findings.txt"

# ---------------------------------------------------------------------------
# Remote collector script
#
# Read from a quoted heredoc so nothing is expanded locally and the awk
# programs can use ordinary single quotes (their $1/$2/... must reach awk
# untouched by any shell).
# STUCK_PROC_CPU_WARN is passed as $1 when invoking the remote bash session,
# so the configurable threshold reaches the collector without escaping issues.
#
# Output protocol (one item per line):
#   CPU_LOAD=<1-minute load>   MEM_PCT=<integer>   ZOMBIES=<count>
#   STUCK_PROCS=<comma-separated command names>    DISK <pct> <mountpoint>
# ---------------------------------------------------------------------------
# `read -d ''` returns non-zero at end of input even though the variable was
# filled, hence the `|| true`.
IFS= read -r -d '' COLLECTOR_SCRIPT <<'COLLECTOR_EOF' || true
#!/usr/bin/env bash
STUCK_PROC_CPU_WARN="${1:-10}"

# 1-minute load average. LC_ALL=C pins the "load average:" label and the
# "." decimal separator regardless of the remote locale.
cpu_load() {
  LC_ALL=C uptime | awk -F'load average:' '{print $2}' | awk -F'[, ]+' '{print $2}'
}

# Used memory as an integer percentage of total.
mem_pct() {
  LC_ALL=C free | awk '/^Mem:/ && $2 > 0 {printf "%.0f", $3/$2*100}'
}

# "<pct> <mountpoint>" per filesystem, with the trailing % removed.
disk_usage() {
  df --output=pcent,target -x tmpfs -x devtmpfs 2>/dev/null | tail -n +2 |
    while read -r pct mnt; do
      echo "${pct%\%} $mnt"
    done
}

# Number of zombie processes (grep -c exits 1 on zero matches, hence || true).
zombie_count() {
  ps -eo stat= | grep -c '^Z' || true
}

# Comma-separated names of D-state processes at or above the CPU threshold.
stuck_procs() {
  ps -eo stat=,pcpu=,comm= |
    awk -v t="$STUCK_PROC_CPU_WARN" '$1 ~ /^D/ && $2+0 >= t+0 {print $3}' |
    paste -sd, -
}

echo "CPU_LOAD=$(cpu_load)"
echo "MEM_PCT=$(mem_pct)"
echo "ZOMBIES=$(zombie_count)"
echo "STUCK_PROCS=$(stuck_procs)"
disk_usage | while read -r pct mnt; do
  echo "DISK $pct $mnt"
done
COLLECTOR_EOF

# ---------------------------------------------------------------------------
# SSH helper — logs stderr to ssh-failures.log instead of silently discarding
#
# Arguments: $1 - host/address; remaining arguments - remote command
# Globals:   SSH_OPTS, SSH_USER, SSH_FAILURES_LOG (read)
# Returns:   exit status of ssh
# ---------------------------------------------------------------------------
ssh_cmd() {
  local host="$1"
  shift
  ssh "${SSH_OPTS[@]}" "${SSH_USER}@${host}" "$@" 2>>"$SSH_FAILURES_LOG"
}

# ---------------------------------------------------------------------------
# LXC IP discovery
#
# lxc-info only returns IPs for containers using Proxmox-managed DHCP bridges.
# Containers with static IPs defined inside the container (not via Proxmox
# network config) return nothing. Fall back to parsing `pct config` in that
# case to find the ip= field from the container's network interface config.
# ---------------------------------------------------------------------------
# Print the IPv4 address of a container (empty line if none can be found).
#
# Arguments: $1 - container ID
# Outputs:   the address on stdout
get_lxc_ip() {
  local ctid="$1"
  local ip

  ip=$(lxc-info -n "$ctid" -iH 2>/dev/null | head -n 1)
  if [[ -z "$ip" ]]; then
    # Accept only a literal IPv4 address: the ip= field can also hold
    # keywords such as "dhcp" or "manual", which are not usable as SSH targets.
    ip=$(pct config "$ctid" 2>/dev/null \
      | sed -nE 's/^.*[ ,]ip=([0-9]+(\.[0-9]+){3}).*$/\1/p' \
      | head -n 1)
  fi
  printf '%s\n' "$ip"
}

# ---------------------------------------------------------------------------
# Inventory: running LXCs and VMs
# Returns lines of "label ip" on stdout. Guests that are running but have no
# discoverable address are reported on stderr and left out.
# ---------------------------------------------------------------------------
collect_inventory() {
  local ctid vmid status ip _name _rest
  # Prints the first non-loopback IPv4 address from the guest agent's JSON
  # reply; prints nothing on malformed input.
  local -r first_ipv4_py='
import json, sys
try:
    for iface in json.load(sys.stdin):
        for addr in iface.get("ip-addresses", []):
            ip = addr.get("ip-address", "")
            if addr.get("ip-address-type") == "ipv4" and not ip.startswith("127."):
                print(ip)
                raise SystemExit
except Exception:
    pass
'

  # LXCs — first two columns of `pct list` are read as ID and status.
  while read -r ctid status _rest; do
    [[ "$status" == "running" ]] || continue
    # </dev/null so helper commands can never eat the rest of the list.
    ip=$(get_lxc_ip "$ctid" </dev/null)
    if [[ -n "$ip" ]]; then
      printf 'lxc-%s %s\n' "$ctid" "$ip"
    else
      printf 'WARN: no IP found for running LXC %s, skipping\n' "$ctid" >&2
    fi
  done < <(pct list 2>/dev/null | tail -n +2)

  # VMs — the address comes from the QEMU guest agent. There is no fallback:
  # a VM without a responding agent cannot be audited.
  while read -r vmid _name status _rest; do
    [[ "$status" == "running" ]] || continue
    ip=$(qm guest cmd "$vmid" network-get-interfaces 2>/dev/null </dev/null \
      | python3 -c "$first_ipv4_py" 2>/dev/null)
    if [[ -n "$ip" ]]; then
      printf 'vm-%s %s\n' "$vmid" "$ip"
    else
      printf 'WARN: no IP found for running VM %s (guest agent missing?), skipping\n' "$vmid" >&2
    fi
  done < <(qm list 2>/dev/null | tail -n +2)
}

# ---------------------------------------------------------------------------
# Collect metrics from one host and record findings
#
# Arguments: $1 - label used in findings; $2 - address to SSH to
# Globals:   COLLECTOR_SCRIPT, STUCK_PROC_CPU_WARN, LOAD_WARN, MEM_WARN,
#            DISK_WARN, DISK_CRIT (read); FINDINGS_FILE, SSH_FAILURES_LOG
#            (appended to)
#
# Everything the remote side prints is treated as untrusted: numeric fields
# are checked against a regex before they are compared, and are never spliced
# into awk program text or evaluated by (( )) unvalidated.
# ---------------------------------------------------------------------------
parse_and_report() {
  local label="$1"
  local addr="$2"
  local raw line load mem zombies procs pct mnt
  local -r uint_re='^[0-9]+$'
  local -r num_re='^[0-9]+([.][0-9]+)?$'

  if ! raw=$(printf '%s\n' "$COLLECTOR_SCRIPT" \
    | ssh_cmd "$addr" bash -s -- "$STUCK_PROC_CPU_WARN"); then
    echo "SSH_FAILURE $label $addr" >>"$SSH_FAILURES_LOG"
    echo "WARN $label: SSH connection failed" >>"$FINDINGS_FILE"
    return
  fi

  while IFS= read -r line; do
    case "$line" in
      CPU_LOAD=*)
        load="${line#CPU_LOAD=}"
        if [[ -z "$load" ]]; then
          continue
        elif ! [[ "$load" =~ $num_re ]]; then
          printf 'BAD_METRIC %s %s\n' "$label" "$line" >>"$SSH_FAILURES_LOG"
        elif awk -v l="$load" -v w="$LOAD_WARN" 'BEGIN { exit !(l + 0 > w + 0) }'; then
          echo "WARN $label: load average ${load} > ${LOAD_WARN}" >>"$FINDINGS_FILE"
        fi
        ;;
      MEM_PCT=*)
        mem="${line#MEM_PCT=}"
        if [[ -z "$mem" ]]; then
          continue
        elif ! [[ "$mem" =~ $uint_re ]]; then
          printf 'BAD_METRIC %s %s\n' "$label" "$line" >>"$SSH_FAILURES_LOG"
        elif ((10#$mem >= MEM_WARN)); then # 10# — never read "08" as octal
          echo "WARN $label: memory ${mem}% >= ${MEM_WARN}%" >>"$FINDINGS_FILE"
        fi
        ;;
      ZOMBIES=*)
        zombies="${line#ZOMBIES=}"
        if [[ -z "$zombies" ]]; then
          continue
        elif ! [[ "$zombies" =~ $uint_re ]]; then
          printf 'BAD_METRIC %s %s\n' "$label" "$line" >>"$SSH_FAILURES_LOG"
        elif ((10#$zombies > 0)); then
          echo "WARN $label: ${zombies} zombie process(es)" >>"$FINDINGS_FILE"
        fi
        ;;
      STUCK_PROCS=*)
        procs="${line#STUCK_PROCS=}"
        if [[ -n "$procs" ]]; then
          echo "WARN $label: D-state procs with CPU>=${STUCK_PROC_CPU_WARN}%: ${procs}" >>"$FINDINGS_FILE"
        fi
        ;;
      DISK\ *)
        read -r _ pct mnt <<<"$line"
        # Skip entries without a numeric percentage (e.g. "-").
        [[ "$pct" =~ $uint_re ]] || continue
        if ((10#$pct >= DISK_CRIT)); then
          echo "CRIT $label: disk ${mnt} at ${pct}% >= ${DISK_CRIT}%" >>"$FINDINGS_FILE"
        elif ((10#$pct >= DISK_WARN)); then
          echo "WARN $label: disk ${mnt} at ${pct}% >= ${DISK_WARN}%" >>"$FINDINGS_FILE"
        fi
        ;;
    esac
  done <<<"$raw"
}

# Print how many lines of file $2 start with the literal prefix $1
# (0 when the file does not exist or cannot be read).
_count_prefixed() {
  local prefix="$1"
  local file="$2"
  local n=0

  if [[ -f "$file" ]]; then
    # grep -c prints 0 but exits 1 when nothing matches.
    n=$(grep -c -- "^${prefix}" "$file" 2>/dev/null) || true
  fi
  printf '%d\n' "${n:-0}"
}

# ---------------------------------------------------------------------------
# Summary — driven by actual findings in findings.txt and ssh-failures.log
#
# Arguments: $1 - number of hosts that were audited
# Outputs:   the summary on stdout
# ---------------------------------------------------------------------------
generate_summary() {
  local host_count="$1"
  local ssh_failure_count warn_count crit_count

  ssh_failure_count=$(_count_prefixed 'SSH_FAILURE' "$SSH_FAILURES_LOG")
  warn_count=$(_count_prefixed 'WARN' "$FINDINGS_FILE")
  crit_count=$(_count_prefixed 'CRIT' "$FINDINGS_FILE")

  echo ""
  echo "=============================="
  echo " HOMELAB AUDIT SUMMARY"
  echo "=============================="
  printf " Hosts audited : %d\n" "$host_count"
  printf " SSH failures : %d\n" "$ssh_failure_count"
  printf " Warnings : %d\n" "$warn_count"
  printf " Critical : %d\n" "$crit_count"
  echo "=============================="

  if ((warn_count + crit_count > 0)); then
    echo ""
    echo "Findings:"
    sort "$FINDINGS_FILE"
  fi

  if ((ssh_failure_count > 0)); then
    echo ""
    echo "SSH failures (see $SSH_FAILURES_LOG for details):"
    awk '/^SSH_FAILURE/ {print " " $2 " (" $3 ")"}' "$SSH_FAILURES_LOG"
  fi

  echo ""
  echo "Reports: $REPORT_DIR"
}

# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
main() {
  local host_count=0
  local label addr

  echo "Starting homelab audit — $(date)"
  echo "Report dir: $REPORT_DIR"
  echo "STUCK_PROC_CPU_WARN threshold: ${STUCK_PROC_CPU_WARN}%"
  echo ""

  # Start both files empty: no stray blank line in the sorted findings, and
  # no stale SSH_FAILURE entries counted when an output directory is reused.
  printf '' >"$FINDINGS_FILE"
  printf '' >"$SSH_FAILURES_LOG"

  while read -r label addr; do
    [[ -n "$label" && -n "$addr" ]] || continue
    echo " Auditing $label ($addr)..."
    parse_and_report "$label" "$addr"
    host_count=$((host_count + 1))
  done < <(collect_inventory)

  if ((host_count == 0)); then
    echo "WARN: no running guests with a usable address were discovered" >&2
  fi

  generate_summary "$host_count"
}

main "$@"