#!/usr/bin/env bash
# homelab-audit.sh — Comprehensive homelab infrastructure audit
# Collects resource allocation, utilization, stuck processes, and inefficiencies
# across all Proxmox VMs/CTs and physical hosts.
#
# Usage: ./homelab-audit.sh [--json] [--output FILE]
# Requires: SSH access to all hosts (via ~/.ssh/config aliases)

# -e omitted intentionally — unreachable hosts should not abort the full audit
set -uo pipefail

# ── Configuration ─────────────────────────────────────────────────────────────
PROXMOX_HOST="proxmox"
PHYSICAL_HOSTS=("manticore")
SSH_TIMEOUT=10
# TODO(review): --json sets this but no JSON renderer exists yet — output is
# always text. Kept so the flag remains accepted for forward compatibility.
OUTPUT_FORMAT="text"
OUTPUT_FILE=""
# mktemp -d instead of a predictable /tmp path (avoids symlink attacks);
# timestamp kept in the name for human readability of saved reports.
REPORT_DIR="$(mktemp -d "/tmp/homelab-audit-$(date +%Y%m%d-%H%M%S).XXXXXX")"

# Thresholds for flagging issues
LOAD_PER_CORE_WARN=0.7
LOAD_PER_CORE_CRIT=1.0
MEM_USED_PCT_WARN=80
MEM_USED_PCT_CRIT=95
DISK_USED_PCT_WARN=80
DISK_USED_PCT_CRIT=90
SWAP_USED_MB_WARN=500
UPTIME_DAYS_WARN=30
ZOMBIE_WARN=1
STUCK_PROC_CPU_WARN=10 # % CPU for a single process running >24h

# ── Argument parsing ─────────────────────────────────────────────────────────
while [[ $# -gt 0 ]]; do
  case $1 in
    --json)
      OUTPUT_FORMAT="json"
      shift
      ;;
    --output)
      # Guard: without this check, "--output" with no value makes `shift 2`
      # fail without consuming arguments, and (since -e is off) the loop
      # would spin forever.
      if [[ $# -lt 2 ]]; then
        echo "Error: --output requires a file argument" >&2
        exit 1
      fi
      OUTPUT_FILE="$2"
      shift 2
      ;;
    *)
      echo "Unknown option: $1" >&2
      exit 1
      ;;
  esac
done

# ── Helpers ──────────────────────────────────────────────────────────────────

# ssh_cmd HOST CMD... — run a remote command with stdin closed (-n) so it is
# safe inside `while read` loops fed from other ssh calls. Errors accumulate
# in the ssh-failures log for the summary.
ssh_cmd() {
  local host="$1"
  shift
  ssh -n -o ConnectTimeout="$SSH_TIMEOUT" -o BatchMode=yes -o StrictHostKeyChecking=accept-new "$host" "$@" 2>>"$REPORT_DIR/ssh-failures.log"
}

# ssh_stdin — like ssh_cmd but allows stdin (for heredocs/pipe input)
ssh_stdin() {
  local host="$1"
  shift
  ssh -o ConnectTimeout="$SSH_TIMEOUT" -o BatchMode=yes -o StrictHostKeyChecking=accept-new "$host" "$@" 2>>"$REPORT_DIR/ssh-failures.log"
}

log_section() { echo -e "\n━━━ $1 ━━━"; }
log_subsection() { echo -e "\n  ── $1 ──"; }
log_ok() { echo "    ✓ $1"; }
log_warn() { echo "    ⚠ $1"; }
log_crit() { echo "    ✖ $1"; }
log_info() { echo "      $1"; }

# ── Per-host collection script (runs remotely) ──────────────────────────────
# This script is sent to each host over SSH ("bash -s") and emits structured
# key=value data plus pipe-delimited records between START/END markers.
# NOTE: single-quoted on purpose — every $var and \" escape inside is expanded
# by the *remote* shell/awk, not locally. STUCK_PROC_CPU_WARN is supplied via
# the remote environment by the callers.
COLLECTOR_SCRIPT='
#!/bin/bash
echo "AUDIT_START"
echo "hostname=$(hostname)"
echo "uptime_seconds=$(cat /proc/uptime | cut -d" " -f1 | cut -d"." -f1)"
echo "uptime_days=$(( $(cat /proc/uptime | cut -d" " -f1 | cut -d"." -f1) / 86400 ))"

# CPU
echo "cpu_cores=$(nproc 2>/dev/null || grep -c ^processor /proc/cpuinfo)"
read load1 load5 load15 rest < /proc/loadavg
echo "load_1m=$load1"
echo "load_5m=$load5"
echo "load_15m=$load15"

# Memory (in MB)
mem_total=$(awk "/MemTotal/ {printf \"%.0f\", \$2/1024}" /proc/meminfo)
mem_avail=$(awk "/MemAvailable/ {printf \"%.0f\", \$2/1024}" /proc/meminfo)
mem_used=$((mem_total - mem_avail))
swap_total=$(awk "/SwapTotal/ {printf \"%.0f\", \$2/1024}" /proc/meminfo)
swap_used=$(awk "/SwapFree/ {printf \"%.0f\", ($swap_total*1024 - \$2)/1024}" /proc/meminfo 2>/dev/null || echo 0)
echo "mem_total_mb=$mem_total"
echo "mem_used_mb=$mem_used"
echo "mem_avail_mb=$mem_avail"
echo "mem_used_pct=$((mem_used * 100 / (mem_total > 0 ? mem_total : 1)))"
echo "swap_total_mb=$swap_total"
echo "swap_used_mb=$swap_used"

# Disk usage (non-tmpfs, non-overlay mounts)
echo "DISK_START"
df -h --output=target,size,used,avail,pcent -x tmpfs -x devtmpfs -x overlay -x squashfs 2>/dev/null | tail -n +2 | while read mount size used avail pct; do
  echo "disk|$mount|$size|$used|$avail|$pct"
done
echo "DISK_END"

# Top CPU processes (>5% CPU)
echo "PROCS_START"
ps aux --sort=-%cpu --no-headers 2>/dev/null | head -15 | while read user pid cpu mem vsz rss tty stat start time cmd; do
  echo "proc|$user|$pid|$cpu|$mem|$start|$time|$cmd"
done
echo "PROCS_END"

# Zombie processes
zombies=$(ps aux 2>/dev/null | awk "\$8 ~ /Z/" | wc -l)
echo "zombie_count=$zombies"

# Failed systemd units
echo "FAILED_UNITS_START"
systemctl --failed --no-legend --no-pager 2>/dev/null | while read unit load active sub desc; do
  echo "failed_unit|$unit"
done
echo "FAILED_UNITS_END"

# Docker containers (if docker is available)
if command -v docker &>/dev/null; then
  echo "DOCKER_START"
  docker stats --no-stream --format "docker|{{.Name}}|{{.CPUPerc}}|{{.MemUsage}}|{{.NetIO}}|{{.PIDs}}" 2>/dev/null || true
  echo "DOCKER_END"
  echo "DOCKER_CONTAINERS_START"
  docker ps -a --format "container|{{.Names}}|{{.Status}}|{{.Image}}|{{.Ports}}" 2>/dev/null || true
  echo "DOCKER_CONTAINERS_END"
fi

# Listening ports (services inventory)
echo "LISTENERS_START"
ss -tlnp 2>/dev/null | tail -n +2 | awk "{print \"listen|\" \$4 \"|\" \$6}" | head -30
echo "LISTENERS_END"

# Long-running high-CPU processes (potential stuck processes)
echo "STUCK_PROCS_START"
ps -eo pid,etimes,%cpu,comm --sort=-%cpu --no-headers 2>/dev/null | while read pid etime cpu comm; do
  # etimes = elapsed time in seconds; flag if >24h and >STUCK threshold
  cpu_int=${cpu%.*}
  if [[ "$etime" -gt 86400 && "${cpu_int:-0}" -gt ${STUCK_PROC_CPU_WARN:-10} ]]; then
    days=$((etime / 86400))
    echo "stuck|$pid|$cpu|${days}d|$comm"
  fi
done
echo "STUCK_PROCS_END"

echo "AUDIT_END"
'

# ── Proxmox-specific collection ─────────────────────────────────────────────
# Audits the Proxmox host itself, then inventories every VM (qm) and LXC
# container (pct) with allocated vCPU/RAM, and reports overcommit ratios.
# Globals: PROXMOX_HOST, COLLECTOR_SCRIPT, REPORT_DIR, STUCK_PROC_CPU_WARN (read)
collect_proxmox_inventory() {
  log_section "PROXMOX HOST INVENTORY ($PROXMOX_HOST)"

  # Get host-level info
  local pve_data
  pve_data=$(ssh_stdin "$PROXMOX_HOST" "STUCK_PROC_CPU_WARN=$STUCK_PROC_CPU_WARN bash -s" <<<"$COLLECTOR_SCRIPT")
  echo "$pve_data" >"$REPORT_DIR/proxmox-host.txt"
  parse_and_report "proxmox" "$pve_data"

  # VM inventory with resource allocations
  log_subsection "Virtual Machines"
  printf "    %-6s %-25s %-8s %6s %8s %10s\n" "VMID" "NAME" "STATUS" "vCPUs" "RAM(GB)" "DISK(GB)"
  printf "    %-6s %-25s %-8s %6s %8s %10s\n" "------" "-------------------------" "--------" "------" "--------" "----------"

  local total_vcpus=0 total_vm_ram=0

  while IFS= read -r line; do
    local vmid=$(echo "$line" | awk '{print $1}')
    local name=$(echo "$line" | awk '{print $2}')
    local status=$(echo "$line" | awk '{print $3}')
    local mem_mb=$(echo "$line" | awk '{print $4}')
    local disk_gb=$(echo "$line" | awk '{print $5}')

    # vCPUs = cores × sockets (sockets defaults to 1 when absent)
    local vcpus
    vcpus=$(ssh_cmd "$PROXMOX_HOST" "qm config $vmid 2>/dev/null | grep -E '^(cores|sockets)'" |
      awk -F: '/cores/{c=$2} /sockets/{s=$2} END{printf "%.0f", (c+0)*(s>0?s:1)}')
    [[ -z "$vcpus" ]] && vcpus=0

    local mem_gb=$(echo "scale=1; $mem_mb / 1024" | bc 2>/dev/null || echo "?")

    # Only running guests count toward the overcommit totals
    if [[ "$status" == "running" ]]; then
      total_vcpus=$((total_vcpus + vcpus))
      total_vm_ram=$((total_vm_ram + mem_mb))
    fi

    local flag=""
    [[ "$status" == "stopped" ]] && flag=" (wasting disk)"
    [[ "$status" == "running" && "$vcpus" -gt 8 ]] && flag=" (heavy)"

    printf "    %-6s %-25s %-8s %6s %8s %10s%s\n" "$vmid" "$name" "$status" "$vcpus" "$mem_gb" "$disk_gb" "$flag"
  done < <(ssh_cmd "$PROXMOX_HOST" "qm list 2>/dev/null" | tail -n +2 | awk 'NF{printf "%s %s %s %s %s\n", $1, $2, $3, $4, $5}')

  # CT inventory
  log_subsection "LXC Containers"
  printf "    %-6s %-30s %-8s %6s %8s\n" "CTID" "NAME" "STATUS" "vCPUs" "RAM(MB)"
  printf "    %-6s %-30s %-8s %6s %8s\n" "------" "------------------------------" "--------" "------" "--------"

  while IFS= read -r line; do
    local ctid=$(echo "$line" | awk '{print $1}')
    local status=$(echo "$line" | awk '{print $2}')
    local name=$(echo "$line" | awk '{print $NF}')

    # Get CT resource config; a CT without a "cores" key shares all host cores
    local ct_cores ct_mem
    ct_cores=$(ssh_cmd "$PROXMOX_HOST" "pct config $ctid 2>/dev/null | grep ^cores" | awk -F: '{print $2}' | tr -d ' ')
    ct_mem=$(ssh_cmd "$PROXMOX_HOST" "pct config $ctid 2>/dev/null | grep ^memory" | awk -F: '{print $2}' | tr -d ' ')
    [[ -z "$ct_cores" ]] && ct_cores="(host)"
    [[ -z "$ct_mem" ]] && ct_mem="?"

    if [[ "$status" == "running" && "$ct_cores" =~ ^[0-9]+$ ]]; then
      total_vcpus=$((total_vcpus + ct_cores))
    fi

    printf "    %-6s %-30s %-8s %6s %8s\n" "$ctid" "$name" "$status" "$ct_cores" "$ct_mem"
  done < <(ssh_cmd "$PROXMOX_HOST" "pct list 2>/dev/null" | tail -n +2)

  # Physical cores
  local phys_cores
  phys_cores=$(ssh_cmd "$PROXMOX_HOST" "nproc")

  log_subsection "Resource Summary"
  log_info "Physical cores:        $phys_cores"
  log_info "Total allocated vCPUs: $total_vcpus"
  local ratio=$(echo "scale=2; $total_vcpus / $phys_cores" | bc 2>/dev/null || echo "?")
  log_info "Overcommit ratio:      ${ratio}:1"
  local total_vm_ram_gb=$(echo "scale=1; $total_vm_ram / 1024" | bc 2>/dev/null || echo "?")
  local phys_ram
  phys_ram=$(ssh_cmd "$PROXMOX_HOST" "awk '/MemTotal/{printf \"%.1f\", \$2/1024/1024}' /proc/meminfo")
  log_info "Physical RAM:          ${phys_ram} GB"
  log_info "Total allocated VM RAM: ${total_vm_ram_gb} GB"

  if (($(echo "$ratio > 1.5" | bc -l 2>/dev/null || echo 0))); then
    log_warn "vCPU overcommit ratio ${ratio}:1 is high — may cause contention"
  fi
}

# ── Parse collector output and report findings ───────────────────────────────
# parse_and_report LABEL DATA — reads the collector's key=value / record output
# and prints flagged findings against the configured thresholds.
parse_and_report() {
  local label="$1"
  local data="$2"

  # Extract key values
  local hostname=$(echo "$data" | grep "^hostname=" | cut -d= -f2)
  local uptime_days=$(echo "$data" | grep "^uptime_days=" | cut -d= -f2)
  local cpu_cores=$(echo "$data" | grep "^cpu_cores=" | cut -d= -f2)
  local load_1m=$(echo "$data" | grep "^load_1m=" | cut -d= -f2)
  local load_5m=$(echo "$data" | grep "^load_5m=" | cut -d= -f2)
  local mem_total=$(echo "$data" | grep "^mem_total_mb=" | cut -d= -f2)
  local mem_used=$(echo "$data" | grep "^mem_used_mb=" | cut -d= -f2)
  local mem_used_pct=$(echo "$data" | grep "^mem_used_pct=" | cut -d= -f2)
  local swap_used=$(echo "$data" | grep "^swap_used_mb=" | cut -d= -f2)
  local zombie_count=$(echo "$data" | grep "^zombie_count=" | cut -d= -f2)

  log_subsection "System: $hostname"

  # Uptime
  if [[ "${uptime_days:-0}" -gt "$UPTIME_DAYS_WARN" ]]; then
    log_warn "Uptime: ${uptime_days} days — consider scheduling maintenance reboot"
  else
    log_ok "Uptime: ${uptime_days} days"
  fi

  # Load per core (5-minute average is the steadier signal)
  local load_per_core
  load_per_core=$(echo "scale=2; ${load_5m:-0} / ${cpu_cores:-1}" | bc 2>/dev/null || echo "0")
  if (($(echo "$load_per_core > $LOAD_PER_CORE_CRIT" | bc -l 2>/dev/null || echo 0))); then
    log_crit "Load: ${load_1m}/${load_5m} on ${cpu_cores} cores (${load_per_core}/core) — OVERLOADED"
  elif (($(echo "$load_per_core > $LOAD_PER_CORE_WARN" | bc -l 2>/dev/null || echo 0))); then
    log_warn "Load: ${load_1m}/${load_5m} on ${cpu_cores} cores (${load_per_core}/core) — elevated"
  else
    log_ok "Load: ${load_1m}/${load_5m} on ${cpu_cores} cores (${load_per_core}/core)"
  fi

  # Memory
  if [[ "${mem_used_pct:-0}" -gt "$MEM_USED_PCT_CRIT" ]]; then
    log_crit "Memory: ${mem_used}/${mem_total} MB (${mem_used_pct}%) — CRITICAL"
  elif [[ "${mem_used_pct:-0}" -gt "$MEM_USED_PCT_WARN" ]]; then
    log_warn "Memory: ${mem_used}/${mem_total} MB (${mem_used_pct}%)"
  else
    log_ok "Memory: ${mem_used}/${mem_total} MB (${mem_used_pct}%)"
  fi

  # Swap
  if [[ "${swap_used:-0}" -gt "$SWAP_USED_MB_WARN" ]]; then
    log_warn "Swap: ${swap_used} MB in use"
  fi

  # Disk
  echo "$data" | sed -n '/DISK_START/,/DISK_END/p' | grep "^disk|" | while IFS='|' read -r _ mount size used avail pct; do
    pct_num=${pct%\%} # strip trailing "%" for numeric comparison
    if [[ "${pct_num:-0}" -gt "$DISK_USED_PCT_CRIT" ]]; then
      log_crit "Disk $mount: ${used}/${size} (${pct}) — CRITICAL"
    elif [[ "${pct_num:-0}" -gt "$DISK_USED_PCT_WARN" ]]; then
      log_warn "Disk $mount: ${used}/${size} (${pct})"
    else
      log_ok "Disk $mount: ${used}/${size} (${pct})"
    fi
  done

  # Zombies
  if [[ "${zombie_count:-0}" -gt 0 ]]; then
    log_warn "Zombie processes: $zombie_count"
  fi

  # Stuck processes
  local stuck_procs
  stuck_procs=$(echo "$data" | sed -n '/STUCK_PROCS_START/,/STUCK_PROCS_END/p' | grep "^stuck|")
  if [[ -n "$stuck_procs" ]]; then
    log_warn "Stuck/runaway processes detected:"
    echo "$stuck_procs" | while IFS='|' read -r _ pid cpu age comm; do
      log_info "PID $pid: $comm at ${cpu}% CPU for $age"
    done
  fi

  # Failed systemd units
  local failed
  failed=$(echo "$data" | sed -n '/FAILED_UNITS_START/,/FAILED_UNITS_END/p' | grep "^failed_unit|")
  if [[ -n "$failed" ]]; then
    log_warn "Failed systemd units:"
    echo "$failed" | while IFS='|' read -r _ unit; do
      log_info "$unit"
    done
  fi

  # Docker containers
  local docker_data
  docker_data=$(echo "$data" | sed -n '/DOCKER_START/,/DOCKER_END/p' | grep "^docker|")
  if [[ -n "$docker_data" ]]; then
    echo ""
    log_info "Docker containers:"
    printf "      %-30s %8s %s\n" "CONTAINER" "CPU%" "MEMORY"
    echo "$docker_data" | while IFS='|' read -r _ name cpu mem net pids; do
      printf "      %-30s %8s %s\n" "$name" "$cpu" "$mem"
    done
  fi

  # Stopped docker containers
  local stopped_containers
  stopped_containers=$(echo "$data" | sed -n '/DOCKER_CONTAINERS_START/,/DOCKER_CONTAINERS_END/p' | grep "^container|" | grep -i "exited")
  if [[ -n "$stopped_containers" ]]; then
    log_warn "Stopped Docker containers (wasting disk):"
    echo "$stopped_containers" | while IFS='|' read -r _ name status image ports; do
      log_info "$name ($image) — $status"
    done
  fi
}

# ── Collect from individual VM/CT guests ─────────────────────────────────────
# collect_guest LABEL SSH_TARGET — runs the collector on one guest, saves the
# raw output, and reports findings. Unreachable guests log a warning only.
collect_guest() {
  local label="$1"
  local ssh_target="$2"

  local data
  data=$(ssh_stdin "$ssh_target" "STUCK_PROC_CPU_WARN=$STUCK_PROC_CPU_WARN bash -s" <<<"$COLLECTOR_SCRIPT") || {
    log_warn "Could not connect to $label ($ssh_target)"
    return
  }
  echo "$data" >"$REPORT_DIR/${label}.txt"
  parse_and_report "$label" "$data"
}

# ── Build SSH target map from Proxmox ────────────────────────────────────────
# build_guest_map MAPNAME — fills the named associative array (nameref, bash
# 4.3+) with "vm-ID-name" / "ct-ID-name" → IPv4 entries for running guests.
# VM IPs come from the QEMU guest agent; CT IPs from lxc-info or pct config.
build_guest_map() {
  local -n map_ref=$1

  # Get VM IPs via guest agent
  while IFS= read -r line; do
    local vmid=$(echo "$line" | awk '{print $1}')
    local name=$(echo "$line" | awk '{print $2}')
    local status=$(echo "$line" | awk '{print $3}')
    [[ "$status" != "running" ]] && continue

    local ip
    ip=$(ssh_cmd "$PROXMOX_HOST" "qm guest cmd $vmid network-get-interfaces 2>/dev/null" |
      python3 -c "
import sys, json
data = json.load(sys.stdin)
for iface in data:
    if iface.get('name') in ('lo',): continue
    for addr in iface.get('ip-addresses', []):
        if addr['ip-address-type'] == 'ipv4' and not addr['ip-address'].startswith('127.'):
            print(addr['ip-address'])
            sys.exit()
" 2>/dev/null) || true

    if [[ -n "$ip" ]]; then
      map_ref["vm-${vmid}-${name}"]="$ip"
    fi
  done < <(ssh_cmd "$PROXMOX_HOST" "qm list 2>/dev/null" | tail -n +2 | awk 'NF{print $1, $2, $3}')

  # Get CT IPs
  while IFS= read -r line; do
    local ctid=$(echo "$line" | awk '{print $1}')
    local status=$(echo "$line" | awk '{print $2}')
    local name=$(echo "$line" | awk '{print $NF}')
    [[ "$status" != "running" ]] && continue

    local ip
    ip=$(ssh_cmd "$PROXMOX_HOST" "lxc-info -n $ctid -iH 2>/dev/null | head -1") || true
    if [[ -z "$ip" ]]; then
      # Fall back to the statically configured address in the CT config
      ip=$(ssh_cmd "$PROXMOX_HOST" "pct config $ctid 2>/dev/null | grep -oP 'ip=\K[0-9.]+'") || true
    fi

    if [[ -n "$ip" ]]; then
      map_ref["ct-${ctid}-${name}"]="$ip"
    fi
  done < <(ssh_cmd "$PROXMOX_HOST" "pct list 2>/dev/null" | tail -n +2)
}

# ── Summary and recommendations ──────────────────────────────────────────────
# Prints the closing checklist and points at the raw report directory.
generate_summary() {
  log_section "AUDIT SUMMARY & RECOMMENDATIONS"

  echo ""
  echo "  Raw data saved to: $REPORT_DIR/"
  echo ""

  if [[ -s "$REPORT_DIR/ssh-failures.log" ]]; then
    local ssh_fail_count
    ssh_fail_count=$(wc -l <"$REPORT_DIR/ssh-failures.log")
    log_warn "SSH failures: $ssh_fail_count error(s) logged to $REPORT_DIR/ssh-failures.log"
    echo ""
  fi

  echo "  Review the flags above (⚠ warnings, ✖ critical) and consider:"
  echo "    1. Kill stuck/runaway processes immediately"
  echo "    2. Restart or remove failed systemd units"
  echo "    3. Clean up stopped Docker containers and unused images"
  echo "    4. Right-size VM/CT resource allocations based on actual usage"
  echo "    5. Shut down or decommission VMs marked as stopped/unused"
  echo "    6. Schedule maintenance reboots for long-uptime hosts"
  echo "    7. Adjust monitoring thresholds to use per-core load metrics"
  echo ""
}

# ── Main ─────────────────────────────────────────────────────────────────────
main() {
  echo "╔══════════════════════════════════════════════════════════════╗"
  echo "║          HOMELAB INFRASTRUCTURE AUDIT                        ║"
  echo "║          $(date '+%Y-%m-%d %H:%M:%S')                                 ║"
  echo "╚══════════════════════════════════════════════════════════════╝"

  # 1. Proxmox host + inventory
  collect_proxmox_inventory

  # 2. Discover and audit all guests (sorted for stable report ordering)
  log_section "GUEST AUDITS"
  declare -A guest_map
  build_guest_map guest_map

  if ((${#guest_map[@]} > 0)); then
    while IFS= read -r label; do
      collect_guest "$label" "${guest_map[$label]}"
    done < <(printf '%s\n' "${!guest_map[@]}" | sort)
  else
    log_warn "No reachable guests discovered"
  fi

  # 3. Physical hosts
  log_section "PHYSICAL HOSTS"
  for host in "${PHYSICAL_HOSTS[@]}"; do
    collect_guest "$host" "$host"
  done

  # 4. Summary
  generate_summary
}

# Redirect output if --output was specified
if [[ -n "$OUTPUT_FILE" ]]; then
  main 2>&1 | tee "$OUTPUT_FILE"
else
  main 2>&1
fi