All checks were successful
Auto-merge docs-only PRs / auto-merge-docs (pull_request) Successful in 2s
Reduce VM 115 (docker-sba) from 16 vCPUs (2×8) to 8 vCPUs (1×8) to match actual workload (0.06 load/core). Add --hosts flag to homelab-audit.sh for targeted post-change audits. Closes #18 Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
434 lines
14 KiB
Bash
Executable File
434 lines
14 KiB
Bash
Executable File
#!/usr/bin/env bash
# homelab-audit.sh — SSH-based homelab health audit
#
# Runs on the Proxmox host. Discovers running LXCs and VMs, SSHes into each
# to collect system metrics (load, memory, disk, zombies, swap, OOM events,
# I/O wait), then generates a summary report with WARN/CRIT findings.
#
# Usage:
#   homelab-audit.sh [--output-dir DIR] [--hosts label:ip,label:ip,...]
#
# Environment overrides:
#   STUCK_PROC_CPU_WARN  CPU% at which a D-state process is flagged (default: 10)
#   REPORT_DIR           Output directory for per-host reports and logs
#   SSH_USER             Remote user (default: root)

# -e omitted intentionally — unreachable hosts should not abort the full audit
set -uo pipefail
|
|
|
|
# ---------------------------------------------------------------------------
# Configuration
#
# Every threshold is env-overridable (mirroring STUCK_PROC_CPU_WARN) so a
# one-off audit can tighten or relax limits without editing the script.
# Defaults are unchanged.
# ---------------------------------------------------------------------------
STUCK_PROC_CPU_WARN="${STUCK_PROC_CPU_WARN:-10}"
REPORT_DIR="${REPORT_DIR:-/tmp/homelab-audit-$(date +%Y%m%d-%H%M%S)}"
SSH_USER="${SSH_USER:-root}"
SSH_OPTS="-o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o BatchMode=yes"

DISK_WARN="${DISK_WARN:-80}"      # disk usage %, warning level
DISK_CRIT="${DISK_CRIT:-90}"      # disk usage %, critical level
LOAD_WARN="${LOAD_WARN:-2.0}"     # 1-minute load average warning level
MEM_WARN="${MEM_WARN:-85}"        # memory usage % warning level
ZOMBIE_WARN="${ZOMBIE_WARN:-1}"   # zombie process count warning level
SWAP_WARN="${SWAP_WARN:-512}"     # swap usage (MB) warning level
MANUAL_HOSTS=""                   # set by --hosts; bypasses auto-discovery
|
|
|
|
# Print CLI usage. Sent to stdout for --help, stderr for errors.
usage() {
  echo "Usage: ${0##*/} [--output-dir DIR] [--hosts label:ip,label:ip,...]"
}

# CLI argument parsing. --hosts bypasses Proxmox auto-discovery for targeted
# post-change audits.
while [[ $# -gt 0 ]]; do
  case "$1" in
    --output-dir)
      if [[ $# -lt 2 ]]; then
        echo "Error: --output-dir requires an argument" >&2
        exit 1
      fi
      REPORT_DIR="$2"
      shift 2
      ;;
    --hosts)
      if [[ $# -lt 2 ]]; then
        echo "Error: --hosts requires an argument (label:ip,label:ip,...)" >&2
        exit 1
      fi
      MANUAL_HOSTS="$2"
      shift 2
      ;;
    -h|--help)
      usage
      exit 0
      ;;
    *)
      echo "Unknown option: $1" >&2
      usage >&2
      exit 1
      ;;
  esac
done
|
|
|
|
# Create the report directory up front; everything below appends into it.
# REPORT_DIR is normally set in the configuration section — the fallback is
# purely defensive so a partial sourcing of this file cannot mkdir "".
REPORT_DIR="${REPORT_DIR:-/tmp/homelab-audit-$$}"
if ! mkdir -p "$REPORT_DIR"; then
  echo "Error: cannot create report directory: $REPORT_DIR" >&2
  exit 1
fi
SSH_FAILURES_LOG="$REPORT_DIR/ssh-failures.log"
FINDINGS_FILE="$REPORT_DIR/findings.txt"
|
|
|
|
# ---------------------------------------------------------------------------
# Remote collector script
#
# Kept single-quoted so no local variables are interpolated into the heredoc.
# STUCK_PROC_CPU_WARN is passed as $1 when invoking the remote bash session,
# so the configurable threshold reaches the collector without escaping issues.
#
# Output protocol (consumed line-by-line by parse_and_report):
#   KEY=VALUE lines for scalar metrics, plus one "DISK <pct> <mnt>" line per
#   mounted filesystem. The '\'' sequences below close/reopen the outer
#   single-quoted string so awk programs can use single quotes internally.
# ---------------------------------------------------------------------------
COLLECTOR_SCRIPT='#!/usr/bin/env bash
STUCK_PROC_CPU_WARN="${1:-10}"

cpu_load() {
  uptime | awk -F"load average:" "{print \$2}" | awk -F"[, ]+" "{print \$2}"
}

mem_pct() {
  free | awk "/^Mem:/ {printf \"%.0f\", \$3/\$2*100}"
}

disk_usage() {
  df --output=pcent,target -x tmpfs -x devtmpfs 2>/dev/null | tail -n +2 | \
    while read -r pct mnt; do echo "${pct%%%} $mnt"; done
}

zombie_count() {
  ps -eo stat= | grep -c "^Z" || true
}

stuck_procs() {
  ps -eo stat=,pcpu=,comm= | \
    awk -v t="$STUCK_PROC_CPU_WARN" '\''$1 ~ /^D/ && $2+0 >= t+0 {print $3}'\'' | \
    paste -sd,
}

zombie_parents() {
  ps -eo pid=,ppid=,stat= | awk '\''$3 ~ /^Z/ {print $2}'\'' | sort -u | \
    xargs -I{} ps -o comm= -p {} 2>/dev/null | paste -sd,
}

swap_mb() {
  free | awk '\''/^Swap:/ {printf "%.0f", $3/1024; found=1} END {if (!found) print "0"}'\''
}

oom_events() {
  local count
  count=$(journalctl -k --since "7 days ago" 2>/dev/null | grep -ci "out of memory") || true
  echo "${count:-0}"
}

io_wait_pct() {
  vmstat 1 2 2>/dev/null | tail -1 | awk '\''{print $16}'\''
}

echo "CPU_LOAD=$(cpu_load)"
echo "MEM_PCT=$(mem_pct)"
echo "ZOMBIES=$(zombie_count)"
echo "STUCK_PROCS=$(stuck_procs)"
echo "ZOMBIE_PARENTS=$(zombie_parents)"
echo "SWAP_MB=$(swap_mb)"
echo "OOM_EVENTS=$(oom_events)"
echo "IO_WAIT=$(io_wait_pct)"
disk_usage | while read -r pct mnt; do
  echo "DISK $pct $mnt"
done
'
|
|
|
|
# ---------------------------------------------------------------------------
# SSH helper — stderr from every ssh invocation is appended to the failures
# log instead of being discarded, so connection problems stay diagnosable.
# ---------------------------------------------------------------------------
ssh_cmd() {
  local target="$1"
  shift
  # SSH_OPTS is a flat option string by design; split it into words here
  # (equivalent to the usual unquoted-expansion idiom, minus globbing).
  local -a opts
  read -ra opts <<<"$SSH_OPTS"
  ssh "${opts[@]}" "${SSH_USER}@${target}" "$@" 2>>"$SSH_FAILURES_LOG"
}
|
|
|
|
# ---------------------------------------------------------------------------
# LXC IP discovery
#
# lxc-info only returns IPs for containers using Proxmox-managed DHCP bridges.
# Containers with static IPs defined inside the container (not via Proxmox
# network config) return nothing. Fall back to parsing `pct config` in that
# case to find the ip= field from the container's network interface config.
#
# Arguments: $1 - container ID
# Outputs:   the first discovered IPv4/IPv6 address on stdout, or "" if none.
# ---------------------------------------------------------------------------
get_lxc_ip() {
  local ctid="$1"
  local ip
  ip=$(lxc-info -n "$ctid" -iH 2>/dev/null | head -1)
  if [[ -z "$ip" ]]; then
    ip=$(pct config "$ctid" 2>/dev/null | grep -oP '(?<=ip=)[^/,]+' | head -1)
    # "ip=dhcp" / "ip=manual" are valid pct config values but not reachable
    # addresses — returning them would cause pointless SSH attempts.
    case "$ip" in
      dhcp|manual) ip="" ;;
    esac
  fi
  echo "$ip"
}
|
|
|
|
# ---------------------------------------------------------------------------
# Inventory: running LXCs and VMs
# Emits one "label ip" line per reachable guest (lxc-<ctid> / vm-<vmid>).
# Guests whose IP cannot be determined are silently skipped.
# ---------------------------------------------------------------------------
collect_inventory() {
  # LXCs — `pct list` columns: VMID Status Lock Name
  pct list 2>/dev/null | tail -n +2 | while read -r ctid status _name; do
    [[ "$status" != "running" ]] && continue
    local ip
    ip=$(get_lxc_ip "$ctid")
    [[ -n "$ip" ]] && echo "lxc-${ctid} $ip"
  done

  # VMs — IP comes from the QEMU guest agent; VMs without a responding agent
  # produce no output from `qm guest cmd` and are skipped (no config fallback).
  # The python helper prints the first non-loopback IPv4 and stops: SystemExit
  # is a BaseException, so the broad `except Exception` does not swallow it.
  qm list 2>/dev/null | tail -n +2 | while read -r vmid _name status _mem _bootdisk _pid; do
    [[ "$status" != "running" ]] && continue
    local ip
    ip=$(qm guest cmd "$vmid" network-get-interfaces 2>/dev/null |
      python3 -c "
import sys, json
try:
    data = json.load(sys.stdin)
    for iface in data:
        for addr in iface.get('ip-addresses', []):
            if addr['ip-address-type'] == 'ipv4' and not addr['ip-address'].startswith('127.'):
                print(addr['ip-address'])
                raise SystemExit
except Exception:
    pass
" 2>/dev/null)
    [[ -n "$ip" ]] && echo "vm-${vmid} $ip"
  done
}
|
|
|
|
# ---------------------------------------------------------------------------
# Collect metrics from one host and record findings
#
# Arguments: $1 - host label (used in finding messages)
#            $2 - host address to SSH to
# Writes WARN/CRIT/INFO lines to $FINDINGS_FILE; SSH failures are logged to
# $SSH_FAILURES_LOG. Numeric guards ensure malformed collector output cannot
# trigger bash arithmetic errors.
# ---------------------------------------------------------------------------
parse_and_report() {
  local label="$1"
  local addr="$2"
  # I/O-wait threshold (%) is env-overridable like the other *_WARN knobs.
  local io_wait_warn="${IO_WAIT_WARN:-20}"
  local raw

  # Stream the collector script to a remote bash; the threshold travels as $1
  # so it needs no interpolation into the script text.
  if ! raw=$(echo "$COLLECTOR_SCRIPT" | ssh_cmd "$addr" bash -s -- "$STUCK_PROC_CPU_WARN"); then
    echo "SSH_FAILURE $label $addr" >>"$SSH_FAILURES_LOG"
    echo "WARN $label: SSH connection failed" >>"$FINDINGS_FILE"
    return
  fi

  while IFS= read -r line; do
    case "$line" in
      CPU_LOAD=*)
        local load="${line#CPU_LOAD=}"
        # load is a float — compare via awk, not bash integer arithmetic
        if [[ "$load" =~ ^[0-9]+([.][0-9]+)?$ ]] && awk "BEGIN{exit !($load > $LOAD_WARN)}"; then
          echo "WARN $label: load average ${load} > ${LOAD_WARN}" >>"$FINDINGS_FILE"
        fi
        ;;
      MEM_PCT=*)
        local mem="${line#MEM_PCT=}"
        if [[ "$mem" =~ ^[0-9]+$ ]] && ((mem >= MEM_WARN)); then
          echo "WARN $label: memory ${mem}% >= ${MEM_WARN}%" >>"$FINDINGS_FILE"
        fi
        ;;
      ZOMBIES=*)
        local zombies="${line#ZOMBIES=}"
        if [[ "$zombies" =~ ^[0-9]+$ ]] && ((zombies >= ZOMBIE_WARN)); then
          echo "WARN $label: ${zombies} zombie process(es)" >>"$FINDINGS_FILE"
        fi
        ;;
      STUCK_PROCS=*)
        local procs="${line#STUCK_PROCS=}"
        if [[ -n "$procs" ]]; then
          echo "WARN $label: D-state procs with CPU>=${STUCK_PROC_CPU_WARN}%: ${procs}" >>"$FINDINGS_FILE"
        fi
        ;;
      ZOMBIE_PARENTS=*)
        local zparents="${line#ZOMBIE_PARENTS=}"
        if [[ -n "$zparents" ]]; then
          # INFO only — parents of zombies point at who is failing to reap
          echo "INFO $label: zombie parent process(es): ${zparents}" >>"$FINDINGS_FILE"
        fi
        ;;
      SWAP_MB=*)
        local swap="${line#SWAP_MB=}"
        if [[ "$swap" =~ ^[0-9]+$ ]] && ((swap >= SWAP_WARN)); then
          echo "WARN $label: swap usage ${swap} MB >= ${SWAP_WARN} MB" >>"$FINDINGS_FILE"
        fi
        ;;
      OOM_EVENTS=*)
        local ooms="${line#OOM_EVENTS=}"
        if [[ "$ooms" =~ ^[0-9]+$ ]] && ((ooms > 0)); then
          echo "WARN $label: ${ooms} OOM kill event(s) in last 7 days" >>"$FINDINGS_FILE"
        fi
        ;;
      IO_WAIT=*)
        local iowait="${line#IO_WAIT=}"
        if [[ "$iowait" =~ ^[0-9]+$ ]] && ((iowait > io_wait_warn)); then
          echo "WARN $label: I/O wait ${iowait}% > ${io_wait_warn}%" >>"$FINDINGS_FILE"
        fi
        ;;
      DISK\ *)
        local pct mnt
        read -r _ pct mnt <<<"$line"
        [[ "$pct" =~ ^[0-9]+$ ]] || continue
        if ((pct >= DISK_CRIT)); then
          echo "CRIT $label: disk ${mnt} at ${pct}% >= ${DISK_CRIT}%" >>"$FINDINGS_FILE"
        elif ((pct >= DISK_WARN)); then
          echo "WARN $label: disk ${mnt} at ${pct}% >= ${DISK_WARN}%" >>"$FINDINGS_FILE"
        fi
        ;;
    esac
  done <<<"$raw"
}
|
|
|
|
# ---------------------------------------------------------------------------
# Summary — driven by actual findings in findings.txt and ssh-failures.log
#
# Arguments: $1 - number of hosts audited
# Prints the report to stdout. Findings are shown whenever the findings file
# is non-empty, so INFO-only findings (e.g. zombie parents) are not hidden.
# ---------------------------------------------------------------------------
generate_summary() {
  local host_count="$1"
  local ssh_failure_count=0
  local warn_count=0
  local crit_count=0

  if [[ -f "$SSH_FAILURES_LOG" ]]; then
    ssh_failure_count=$(grep -c '^SSH_FAILURE' "$SSH_FAILURES_LOG" 2>/dev/null || true)
  fi
  if [[ -f "$FINDINGS_FILE" ]]; then
    warn_count=$(grep -c '^WARN' "$FINDINGS_FILE" 2>/dev/null || true)
    crit_count=$(grep -c '^CRIT' "$FINDINGS_FILE" 2>/dev/null || true)
  fi
  # grep -c prints 0 on no match, but guard against other failures leaving a
  # var empty — printf %d would then complain.
  ssh_failure_count=${ssh_failure_count:-0}
  warn_count=${warn_count:-0}
  crit_count=${crit_count:-0}

  echo ""
  echo "=============================="
  echo " HOMELAB AUDIT SUMMARY"
  echo "=============================="
  printf " Hosts audited : %d\n" "$host_count"
  printf " SSH failures : %d\n" "$ssh_failure_count"
  printf " Warnings : %d\n" "$warn_count"
  printf " Critical : %d\n" "$crit_count"
  echo "=============================="

  # -s (non-empty) rather than warn+crit>0, so INFO findings also surface
  if [[ -s "$FINDINGS_FILE" ]]; then
    echo ""
    echo "Findings:"
    sort "$FINDINGS_FILE"
  fi

  if ((ssh_failure_count > 0)); then
    echo ""
    echo "SSH failures (see $SSH_FAILURES_LOG for details):"
    grep '^SSH_FAILURE' "$SSH_FAILURES_LOG" | awk '{print " " $2 " (" $3 ")"}'
  fi

  echo ""
  echo "Reports: $REPORT_DIR"
}
|
|
|
|
# ---------------------------------------------------------------------------
# Proxmox backup recency — queries vzdump task history via pvesh (runs locally)
#
# Flags every running guest with no successful vzdump in the last 7 days
# (WARN if stale, CRIT if absent from the task history entirely).
# ---------------------------------------------------------------------------
check_backup_recency() {
  # Node name in the pvesh API path. Defaults to the historical hard-coded
  # "proxmox"; override via PVE_NODE if the node is named differently.
  local node="${PVE_NODE:-proxmox}"
  local tasks_json_file="$REPORT_DIR/vzdump-tasks.json"
  pvesh get "/nodes/${node}/tasks" --typefilter vzdump --limit 50 --output-format json \
    >"$tasks_json_file" 2>/dev/null || {
    echo "WARN proxmox: failed to query vzdump task history" >>"$FINDINGS_FILE"
    return
  }

  [[ ! -s "$tasks_json_file" ]] && return

  # Running guest IDs: pct list columns are (VMID Status ...), qm list columns
  # are (VMID Name Status ...).
  local running_ids=()
  while read -r ctid; do
    running_ids+=("$ctid")
  done < <(pct list 2>/dev/null | awk 'NR>1 && $2=="running"{print $1}')
  while read -r vmid; do
    running_ids+=("$vmid")
  done < <(qm list 2>/dev/null | awk 'NR>1 && $3=="running"{print $1}')

  [[ ${#running_ids[@]} -eq 0 ]] && return

  local week_ago
  week_ago=$(($(date +%s) - 7 * 86400))

  # Quoted heredoc delimiter: the python source is passed verbatim; shell
  # values travel as argv instead.
  python3 - "$tasks_json_file" "$week_ago" "${running_ids[@]}" <<'PYEOF' >>"$FINDINGS_FILE"
import sys, json, datetime

tasks_file, week_ago = sys.argv[1], int(sys.argv[2])
running_ids = set(sys.argv[3:])

try:
    tasks = json.load(open(tasks_file))
except Exception:
    sys.exit(0)

last_backup = {}
for task in tasks:
    if task.get("type") != "vzdump" or task.get("status") != "OK":
        continue
    vmid = str(task.get("id", ""))
    endtime = int(task.get("endtime", 0))
    if vmid and endtime and endtime > last_backup.get(vmid, 0):
        last_backup[vmid] = endtime

for vmid in sorted(running_ids):
    ts = last_backup.get(vmid)
    if ts and ts >= week_ago:
        pass
    elif ts:
        dt = datetime.datetime.fromtimestamp(ts).strftime("%Y-%m-%d")
        print(f"WARN proxmox/vm-{vmid}: last backup {dt} is older than 7 days")
    else:
        print(f"CRIT proxmox/vm-{vmid}: no backup found in task history")
PYEOF
}
|
|
|
|
# ---------------------------------------------------------------------------
# Certificate expiry check — runs from the audit host via openssl
#
# Arguments: $1 - host label, $2 - host address
# Probes ports 443 and 8443; CRIT at <=7 days to expiry, WARN at <=14.
# Unreachable ports / non-TLS services are skipped silently.
# ---------------------------------------------------------------------------
check_cert_expiry() {
  local label="$1"
  local addr="$2"
  local now
  now=$(date +%s)

  for port in 443 8443; do
    local enddate
    # -servername sends SNI so name-routing servers present the correct cert
    # (harmless when addr is a bare IP).
    enddate=$(echo | timeout 10 openssl s_client -connect "${addr}:${port}" \
        -servername "$addr" 2>/dev/null |
      openssl x509 -noout -enddate 2>/dev/null) || continue
    [[ -z "$enddate" ]] && continue

    local expiry_str="${enddate#notAfter=}"
    local expiry_epoch
    expiry_epoch=$(date -d "$expiry_str" +%s 2>/dev/null) || continue
    local days_left=$(((expiry_epoch - now) / 86400))

    if ((days_left <= 7)); then
      echo "CRIT $label: TLS cert on :${port} expires in ${days_left} days" >>"$FINDINGS_FILE"
    elif ((days_left <= 14)); then
      echo "WARN $label: TLS cert on :${port} expires in ${days_left} days" >>"$FINDINGS_FILE"
    fi
  done
}
|
|
|
|
# ---------------------------------------------------------------------------
# Main
#
# With --hosts, audits only the given label:ip pairs (malformed entries are
# skipped with a warning instead of being SSHed to as-is); otherwise audits
# everything collect_inventory discovers.
# ---------------------------------------------------------------------------
main() {
  echo "Starting homelab audit — $(date)"
  echo "Report dir: $REPORT_DIR"
  echo "STUCK_PROC_CPU_WARN threshold: ${STUCK_PROC_CPU_WARN}%"
  echo ""

  >"$FINDINGS_FILE"

  echo " Checking Proxmox backup recency..."
  check_backup_recency

  local host_count=0
  if [[ -n "$MANUAL_HOSTS" ]]; then
    # --hosts flag: parse comma-separated label:ip pairs
    IFS=',' read -ra host_entries <<<"$MANUAL_HOSTS"
    for entry in "${host_entries[@]}"; do
      [[ -z "$entry" ]] && continue
      # An entry without a colon would make label==addr==entry and trigger an
      # SSH attempt against a garbage hostname — reject it instead.
      if [[ "$entry" != *:* ]]; then
        echo "Warning: skipping malformed --hosts entry '$entry' (expected label:ip)" >&2
        continue
      fi
      local label="${entry%%:*}"
      local addr="${entry#*:}"
      echo " Auditing $label ($addr)..."
      parse_and_report "$label" "$addr"
      check_cert_expiry "$label" "$addr"
      ((host_count++)) || true
    done
  else
    while read -r label addr; do
      echo " Auditing $label ($addr)..."
      parse_and_report "$label" "$addr"
      check_cert_expiry "$label" "$addr"
      ((host_count++)) || true
    done < <(collect_inventory)
  fi

  generate_summary "$host_count"
}
|
|
|
|
# Entry point — forward all CLI arguments to main.
main "$@"
|