#!/usr/bin/env bash
# homelab-audit.sh — SSH-based homelab health audit
#
# Repository path: monitoring/scripts/homelab-audit.sh (mode 100755)
#
# Runs on the Proxmox host. Discovers running LXCs and VMs, SSHes into each
# to collect system metrics, then generates a summary report.
#
# Usage:
#   homelab-audit.sh [--output-dir DIR]
#
# Environment overrides:
#   STUCK_PROC_CPU_WARN  CPU% at which a D-state process is flagged (default: 10)
#   REPORT_DIR           Output directory for per-host reports and logs
#   SSH_USER             Remote user (default: root)

# -e omitted intentionally — unreachable hosts should not abort the full audit
set -uo pipefail

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
STUCK_PROC_CPU_WARN="${STUCK_PROC_CPU_WARN:-10}"
REPORT_DIR="${REPORT_DIR:-/tmp/homelab-audit-$(date +%Y%m%d-%H%M%S)}"
SSH_USER="${SSH_USER:-root}"
# Kept as an array so every option stays a separate word without relying on
# unquoted word-splitting.
SSH_OPTS=(
  -o StrictHostKeyChecking=accept-new
  -o ConnectTimeout=10
  -o BatchMode=yes
)

# Thresholds: disk and memory are percentages, load is the 1-minute average.
DISK_WARN=80
DISK_CRIT=90
LOAD_WARN=2.0
MEM_WARN=85

while [[ $# -gt 0 ]]; do
  case "$1" in
    --output-dir)
      # Under `set -u` a missing value would otherwise die with an opaque
      # "unbound variable" message.
      if [[ $# -lt 2 || -z "$2" ]]; then
        echo "Option --output-dir requires a directory argument" >&2
        exit 1
      fi
      REPORT_DIR="$2"
      shift 2
      ;;
    *)
      echo "Unknown option: $1" >&2
      exit 1
      ;;
  esac
done

if ! mkdir -p "$REPORT_DIR"; then
  echo "Cannot create report directory: $REPORT_DIR" >&2
  exit 1
fi
SSH_FAILURES_LOG="$REPORT_DIR/ssh-failures.log"
FINDINGS_FILE="$REPORT_DIR/findings.txt"

# ---------------------------------------------------------------------------
# Remote collector script
#
# Stored via a quoted here-doc so nothing is expanded locally and the awk
# programs can be single-quoted. (With double-quoted awk programs the remote
# shell expands $1/$2/$3 itself and awk receives a broken program.)
# STUCK_PROC_CPU_WARN is passed as $1 when invoking the remote bash session,
# so the configurable threshold reaches the collector without escaping issues.
#
# Output protocol, one record per line:
#   CPU_LOAD=<1-minute load>    MEM_PCT=<integer percent>
#   ZOMBIES=<count>             STUCK_PROCS=<comma-separated command names>
#   DISK <percent> <mountpoint>
# ---------------------------------------------------------------------------
COLLECTOR_SCRIPT=$(
  cat <<'COLLECTOR_EOF'
#!/usr/bin/env bash
STUCK_PROC_CPU_WARN="${1:-10}"

cpu_load() {
  uptime | awk -F'load average:' '{print $2}' | awk -F'[, ]+' '{print $2}'
}

mem_pct() {
  # Skip the division when total is reported as 0.
  free | awk '/^Mem:/ && $2 > 0 {printf "%.0f", $3 / $2 * 100}'
}

disk_usage() {
  df --output=pcent,target -x tmpfs -x devtmpfs 2>/dev/null | tail -n +2 |
    while read -r pct mnt; do
      printf '%s %s\n' "${pct%\%}" "$mnt"
    done
}

zombie_count() {
  ps -eo stat= | grep -c "^Z" || true
}

stuck_procs() {
  ps -eo stat=,pcpu=,comm= |
    awk -v t="$STUCK_PROC_CPU_WARN" '$1 ~ /^D/ && $2 + 0 >= t + 0 {print $3}' |
    paste -sd, -
}

echo "CPU_LOAD=$(cpu_load)"
echo "MEM_PCT=$(mem_pct)"
echo "ZOMBIES=$(zombie_count)"
echo "STUCK_PROCS=$(stuck_procs)"
disk_usage | while read -r pct mnt; do
  echo "DISK $pct $mnt"
done
COLLECTOR_EOF
)

# ---------------------------------------------------------------------------
# SSH helper — logs stderr to ssh-failures.log instead of silently discarding
#
# Arguments: $1 - host/address; remaining args - remote command
# Returns:   exit status of ssh
# ---------------------------------------------------------------------------
ssh_cmd() {
  local host="$1"
  shift
  ssh "${SSH_OPTS[@]}" "${SSH_USER}@${host}" "$@" 2>>"$SSH_FAILURES_LOG"
}

# ---------------------------------------------------------------------------
# LXC IP discovery
#
# Ask lxc-info first. When it reports no address, fall back to the ip= field
# of the container's network config in `pct config`. Only a literal IPv4
# address is accepted there, because ip= may also hold keywords such as
# "dhcp" or "manual", which are not usable SSH targets.
#
# Arguments: $1 - container ID
# Outputs:   the address on stdout (empty line if none was found)
# ---------------------------------------------------------------------------
get_lxc_ip() {
  local ctid="$1"
  local ip
  ip=$(lxc-info -n "$ctid" -iH 2>/dev/null | head -1)
  if [[ -z "$ip" ]]; then
    ip=$(pct config "$ctid" 2>/dev/null |
      grep -oP '(?<=\bip=)[0-9]+(\.[0-9]+){3}' | head -1)
  fi
  echo "$ip"
}

# ---------------------------------------------------------------------------
# Inventory: running LXCs and VMs
# Returns lines of "label ip"
#
# Guests without a discoverable address are silently skipped.
# ---------------------------------------------------------------------------
collect_inventory() {
  # LXCs
  pct list 2>/dev/null | tail -n +2 | while read -r ctid status _name; do
    [[ "$status" != "running" ]] && continue
    local ip
    # </dev/null: keep helper commands from eating the loop's input.
    ip=$(get_lxc_ip "$ctid" </dev/null)
    [[ -n "$ip" ]] && echo "lxc-${ctid} $ip"
  done

  # VMs — the first non-loopback IPv4 reported by the guest agent is used.
  # There is no fallback: VMs without a responding agent are skipped.
  qm list 2>/dev/null | tail -n +2 |
    while read -r vmid _name status _mem _bootdisk _pid; do
      [[ "$status" != "running" ]] && continue
      local ip
      ip=$(qm guest cmd "$vmid" network-get-interfaces 2>/dev/null </dev/null |
        python3 -c "
import sys, json
try:
    data = json.load(sys.stdin)
    for iface in data:
        for addr in iface.get('ip-addresses', []):
            if addr['ip-address-type'] == 'ipv4' and not addr['ip-address'].startswith('127.'):
                print(addr['ip-address'])
                raise SystemExit
except Exception:
    pass
" 2>/dev/null)
      [[ -n "$ip" ]] && echo "vm-${vmid} $ip"
    done
}

# ---------------------------------------------------------------------------
# Collect metrics from one host and record findings
#
# Arguments: $1 - label used in findings; $2 - address to SSH to
# Globals:   appends to FINDINGS_FILE and SSH_FAILURES_LOG
#
# Values come from the remote side, so each one is validated as a plain
# number before use. Malformed values are ignored rather than being
# interpolated into awk source or evaluated by (( )).
# ---------------------------------------------------------------------------
parse_and_report() {
  local label="$1"
  local addr="$2"
  local raw line load mem zombies procs pct mnt
  local -r uint_re='^[0-9]+$'
  local -r num_re='^[0-9]+([.][0-9]+)?$'

  if ! raw=$(printf '%s\n' "$COLLECTOR_SCRIPT" |
    ssh_cmd "$addr" bash -s -- "$STUCK_PROC_CPU_WARN"); then
    echo "SSH_FAILURE $label $addr" >>"$SSH_FAILURES_LOG"
    echo "WARN $label: SSH connection failed" >>"$FINDINGS_FILE"
    return
  fi

  while IFS= read -r line; do
    case "$line" in
      CPU_LOAD=*)
        load="${line#CPU_LOAD=}"
        # Floating-point compare; values go in via -v, never as awk source.
        if [[ "$load" =~ $num_re ]] &&
          awk -v l="$load" -v w="$LOAD_WARN" 'BEGIN { exit !(l + 0 > w + 0) }'; then
          echo "WARN $label: load average ${load} > ${LOAD_WARN}" >>"$FINDINGS_FILE"
        fi
        ;;
      MEM_PCT=*)
        mem="${line#MEM_PCT=}"
        # 10# forces base 10 so a leading zero is not read as octal.
        if [[ "$mem" =~ $uint_re ]] && ((10#$mem >= MEM_WARN)); then
          echo "WARN $label: memory ${mem}% >= ${MEM_WARN}%" >>"$FINDINGS_FILE"
        fi
        ;;
      ZOMBIES=*)
        zombies="${line#ZOMBIES=}"
        if [[ "$zombies" =~ $uint_re ]] && ((10#$zombies > 0)); then
          echo "WARN $label: ${zombies} zombie process(es)" >>"$FINDINGS_FILE"
        fi
        ;;
      STUCK_PROCS=*)
        procs="${line#STUCK_PROCS=}"
        if [[ -n "$procs" ]]; then
          echo "WARN $label: D-state procs with CPU>=${STUCK_PROC_CPU_WARN}%: ${procs}" >>"$FINDINGS_FILE"
        fi
        ;;
      DISK\ *)
        read -r _ pct mnt <<<"$line"
        # df prints "-" when it has no usage figure for a filesystem.
        [[ "$pct" =~ $uint_re ]] || continue
        if ((10#$pct >= DISK_CRIT)); then
          echo "CRIT $label: disk ${mnt} at ${pct}% >= ${DISK_CRIT}%" >>"$FINDINGS_FILE"
        elif ((10#$pct >= DISK_WARN)); then
          echo "WARN $label: disk ${mnt} at ${pct}% >= ${DISK_WARN}%" >>"$FINDINGS_FILE"
        fi
        ;;
    esac
  done <<<"$raw"
}

# ---------------------------------------------------------------------------
# Summary — driven by actual findings in findings.txt and ssh-failures.log
#
# Arguments: $1 - number of hosts that were audited
# Outputs:   human-readable summary on stdout
# ---------------------------------------------------------------------------
generate_summary() {
  local host_count="$1"
  local ssh_failure_count=0
  local warn_count=0
  local crit_count=0

  # grep -c prints 0 but exits 1 when nothing matches; `|| true` only masks
  # that exit status.
  if [[ -f "$SSH_FAILURES_LOG" ]]; then
    ssh_failure_count=$(grep -c '^SSH_FAILURE' "$SSH_FAILURES_LOG" 2>/dev/null || true)
  fi
  if [[ -f "$FINDINGS_FILE" ]]; then
    warn_count=$(grep -c '^WARN' "$FINDINGS_FILE" 2>/dev/null || true)
    crit_count=$(grep -c '^CRIT' "$FINDINGS_FILE" 2>/dev/null || true)
  fi

  echo ""
  echo "=============================="
  echo "  HOMELAB AUDIT SUMMARY"
  echo "=============================="
  printf "  Hosts audited : %d\n" "$host_count"
  printf "  SSH failures  : %d\n" "$ssh_failure_count"
  printf "  Warnings      : %d\n" "$warn_count"
  printf "  Critical      : %d\n" "$crit_count"
  echo "=============================="

  if ((warn_count + crit_count > 0)); then
    echo ""
    echo "Findings:"
    sort "$FINDINGS_FILE"
  fi

  if ((ssh_failure_count > 0)); then
    echo ""
    echo "SSH failures (see $SSH_FAILURES_LOG for details):"
    grep '^SSH_FAILURE' "$SSH_FAILURES_LOG" | awk '{print "  " $2 " (" $3 ")"}'
  fi

  echo ""
  echo "Reports: $REPORT_DIR"
}

# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
main() {
  echo "Starting homelab audit — $(date)"
  echo "Report dir: $REPORT_DIR"
  echo "STUCK_PROC_CPU_WARN threshold: ${STUCK_PROC_CPU_WARN}%"
  echo ""

  # Start both result files empty so a reused --output-dir does not carry
  # findings or SSH failure counts over from a previous run.
  : >"$FINDINGS_FILE"
  : >"$SSH_FAILURES_LOG"

  local host_count=0
  local label addr
  while read -r label addr; do
    echo "  Auditing $label ($addr)..."
    parse_and_report "$label" "$addr"
    host_count=$((host_count + 1))
  done < <(collect_inventory)

  generate_summary "$host_count"
}

main "$@"