fix: resolve variable interpolation and collector bugs in homelab-audit.sh
All checks were successful
Auto-merge docs-only PRs / auto-merge-docs (pull_request) Successful in 6s
All checks were successful
Auto-merge docs-only PRs / auto-merge-docs (pull_request) Successful in 6s
- Pass STUCK_PROC_CPU_WARN as an env var to remote bash sessions so the configurable threshold actually reaches the collector (it was hardcoded to 10).
- Add a `pct config` fallback for LXC IP discovery when `lxc-info` returns empty (static-IP containers).
- Log SSH failures to $REPORT_DIR/ssh-failures.log instead of /dev/null, with a count in the audit summary.
- Add a comment explaining the intentional omission of `set -e`.

Fixes #23

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
5ee48af40f
commit
daee881987
483
monitoring/scripts/homelab-audit.sh
Executable file
483
monitoring/scripts/homelab-audit.sh
Executable file
@ -0,0 +1,483 @@
|
||||
#!/usr/bin/env bash
|
||||
# homelab-audit.sh — Comprehensive homelab infrastructure audit
|
||||
# Collects resource allocation, utilization, stuck processes, and inefficiencies
|
||||
# across all Proxmox VMs/CTs and physical hosts.
|
||||
#
|
||||
# Usage: ./homelab-audit.sh [--json] [--output FILE]
|
||||
# Requires: SSH access to all hosts (via ~/.ssh/config aliases)
|
||||
|
||||
# -e omitted intentionally — unreachable hosts should not abort the full audit
set -uo pipefail

# ── Configuration ─────────────────────────────────────────────────────────────
PROXMOX_HOST="proxmox"       # SSH target used for all qm/pct queries
PHYSICAL_HOSTS=("manticore") # extra hosts audited with the same collector
SSH_TIMEOUT=10               # passed to ssh as ConnectTimeout
OUTPUT_FORMAT="text"         # set to "json" by --json
OUTPUT_FILE=""               # set by --output FILE

# Per-run directory for raw collector output and ssh-failures.log.
# mktemp -d creates it atomically with an unpredictable suffix, so another
# local user cannot pre-create the path in the shared temp directory. The old
# timestamp-only name is kept as a fallback if mktemp is unavailable.
REPORT_DIR=$(mktemp -d "${TMPDIR:-/tmp}/homelab-audit-$(date +%Y%m%d-%H%M%S).XXXXXX" 2>/dev/null) ||
  REPORT_DIR="/tmp/homelab-audit-$(date +%Y%m%d-%H%M%S)"

# Thresholds for flagging issues
LOAD_PER_CORE_WARN=0.7 # 5-minute load divided by core count
LOAD_PER_CORE_CRIT=1.0
MEM_USED_PCT_WARN=80
MEM_USED_PCT_CRIT=95
DISK_USED_PCT_WARN=80
DISK_USED_PCT_CRIT=90
SWAP_USED_MB_WARN=500
UPTIME_DAYS_WARN=30
ZOMBIE_WARN=1
STUCK_PROC_CPU_WARN=10 # % CPU for a single process running >24h
|
||||
|
||||
# ── Argument parsing ─────────────────────────────────────────────────────────
# parse_args — consume the command-line options.
# Globals:   OUTPUT_FORMAT (written by --json), OUTPUT_FILE (written by --output)
# Arguments: the script's positional parameters
# Exits:     1 with a message on stderr for an unknown option or when --output
#            has no value (previously the latter tripped `set -u` on "$2").
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --json)
        OUTPUT_FORMAT="json"
        shift
        ;;
      --output)
        if [[ $# -lt 2 || -z "$2" ]]; then
          echo "Option --output requires a FILE argument" >&2
          exit 1
        fi
        OUTPUT_FILE="$2"
        shift 2
        ;;
      *)
        echo "Unknown option: $1" >&2
        exit 1
        ;;
    esac
  done
}

parse_args "$@"
|
||||
|
||||
# SSH stderr and the raw collector output are both written under REPORT_DIR,
# so an unusable directory would make every later step fail — stop early.
mkdir -p -- "$REPORT_DIR" || {
  echo "Cannot create report directory: $REPORT_DIR" >&2
  exit 1
}
|
||||
|
||||
# ── Helpers ──────────────────────────────────────────────────────────────────
|
||||
|
||||
# ssh_cmd — run a command on a host without forwarding local stdin.
# Globals:   SSH_TIMEOUT (read), REPORT_DIR (read)
# Arguments: $1 - SSH target; remaining args - remote command
# Outputs:   remote stdout on stdout; stderr is appended to
#            $REPORT_DIR/ssh-failures.log
# Returns:   exit status of ssh
ssh_cmd() {
  local host="$1"
  shift
  # -n detaches stdin so calls inside `while read` loops do not eat the loop input.
  ssh -n \
    -o "ConnectTimeout=${SSH_TIMEOUT}" \
    -o BatchMode=yes \
    -o StrictHostKeyChecking=accept-new \
    "$host" "$@" 2>>"$REPORT_DIR/ssh-failures.log"
}
|
||||
|
||||
# ssh_stdin — like ssh_cmd but allows stdin (for heredocs/pipe input)
# Globals:   SSH_TIMEOUT (read), REPORT_DIR (read)
# Arguments: $1 - SSH target; remaining args - remote command
# Outputs:   remote stdout on stdout; stderr is appended to
#            $REPORT_DIR/ssh-failures.log
# Returns:   exit status of ssh
ssh_stdin() {
  local host="$1"
  shift
  ssh \
    -o "ConnectTimeout=${SSH_TIMEOUT}" \
    -o BatchMode=yes \
    -o StrictHostKeyChecking=accept-new \
    "$host" "$@" 2>>"$REPORT_DIR/ssh-failures.log"
}
|
||||
|
||||
# Report formatting helpers. Each takes the message as $1 and prints one line
# to stdout (section/subsection print a leading blank line first).
# printf '%s' is used so that backslashes in the message are printed verbatim;
# `echo -e` would have expanded sequences such as \n or \t inside it.
log_section() { printf '\n━━━ %s ━━━\n' "$1"; }
log_subsection() { printf '\n ── %s ──\n' "$1"; }
log_ok() { printf ' ✓ %s\n' "$1"; }
log_warn() { printf ' ⚠ %s\n' "$1"; }
log_crit() { printf ' ✖ %s\n' "$1"; }
log_info() { printf ' %s\n' "$1"; }
|
||||
|
||||
# ── Per-host collection script (runs remotely) ──────────────────────────────
# Fed to a remote `bash -s` on stdin. It prints key=value lines plus
# START/END-delimited, pipe-separated record sections.
# The body is single-quoted, so every expansion happens on the remote side;
# do not use apostrophes inside it. STUCK_PROC_CPU_WARN is read from the
# remote environment and defaults to 10 when absent.
COLLECTOR_SCRIPT='
#!/bin/bash
echo "AUDIT_START"
echo "hostname=$(hostname)"
read -r uptime_raw _ < /proc/uptime
uptime_seconds=${uptime_raw%%.*}
uptime_seconds=${uptime_seconds:-0}
echo "uptime_seconds=$uptime_seconds"
echo "uptime_days=$(( uptime_seconds / 86400 ))"

# CPU
echo "cpu_cores=$(nproc 2>/dev/null || grep -c ^processor /proc/cpuinfo)"
read -r load1 load5 load15 _ < /proc/loadavg
echo "load_1m=$load1"
echo "load_5m=$load5"
echo "load_15m=$load15"

# Memory (in MB)
mem_total=$(awk "/^MemTotal:/ {printf \"%.0f\", \$2/1024}" /proc/meminfo)
mem_avail=$(awk "/^MemAvailable:/ {printf \"%.0f\", \$2/1024}" /proc/meminfo)
mem_total=${mem_total:-0}
mem_avail=${mem_avail:-0}
mem_used=$((mem_total - mem_avail))
swap_total=$(awk "/^SwapTotal:/ {printf \"%.0f\", \$2/1024}" /proc/meminfo)
# Computed from the raw kB values and clamped at zero, so rounding of the MB
# total can no longer produce a "-0" result.
swap_used=$(awk "/^SwapTotal:/ {t=\$2} /^SwapFree:/ {f=\$2} END {u=(t-f)/1024; printf \"%.0f\", (u > 0 ? u : 0)}" /proc/meminfo 2>/dev/null || echo 0)
echo "mem_total_mb=$mem_total"
echo "mem_used_mb=$mem_used"
echo "mem_avail_mb=$mem_avail"
echo "mem_used_pct=$((mem_used * 100 / (mem_total > 0 ? mem_total : 1)))"
echo "swap_total_mb=$swap_total"
echo "swap_used_mb=$swap_used"

# Disk usage (non-tmpfs, non-overlay mounts)
echo "DISK_START"
df -h --output=target,size,used,avail,pcent -x tmpfs -x devtmpfs -x overlay -x squashfs 2>/dev/null | tail -n +2 | while read -r mount size used avail pct; do
  echo "disk|$mount|$size|$used|$avail|$pct"
done
echo "DISK_END"

# Top 15 processes by CPU usage
echo "PROCS_START"
ps aux --sort=-%cpu --no-headers 2>/dev/null | head -15 | while read -r user pid cpu mem vsz rss tty stat start time cmd; do
  echo "proc|$user|$pid|$cpu|$mem|$start|$time|$cmd"
done
echo "PROCS_END"

# Zombie processes
zombies=$(ps aux 2>/dev/null | awk "\$8 ~ /Z/" | wc -l)
echo "zombie_count=$zombies"

# Failed systemd units
echo "FAILED_UNITS_START"
systemctl --failed --no-legend --no-pager 2>/dev/null | while read -r unit load active sub desc; do
  echo "failed_unit|$unit"
done
echo "FAILED_UNITS_END"

# Docker containers (if docker is available)
if command -v docker &>/dev/null; then
  echo "DOCKER_START"
  docker stats --no-stream --format "docker|{{.Name}}|{{.CPUPerc}}|{{.MemUsage}}|{{.NetIO}}|{{.PIDs}}" 2>/dev/null || true
  echo "DOCKER_END"
  echo "DOCKER_CONTAINERS_START"
  docker ps -a --format "container|{{.Names}}|{{.Status}}|{{.Image}}|{{.Ports}}" 2>/dev/null || true
  echo "DOCKER_CONTAINERS_END"
fi

# Listening ports (services inventory)
echo "LISTENERS_START"
ss -tlnp 2>/dev/null | tail -n +2 | awk "{print \"listen|\" \$4 \"|\" \$6}" | head -30
echo "LISTENERS_END"

# Long-running high-CPU processes (potential stuck processes)
echo "STUCK_PROCS_START"
ps -eo pid,etimes,%cpu,comm --sort=-%cpu --no-headers 2>/dev/null | while read -r pid etime cpu comm; do
  # etimes = elapsed time in seconds; flag if >24h and >STUCK threshold
  cpu_int=${cpu%.*}
  if [[ "$etime" -gt 86400 && "${cpu_int:-0}" -gt ${STUCK_PROC_CPU_WARN:-10} ]]; then
    days=$((etime / 86400))
    echo "stuck|$pid|$cpu|${days}d|$comm"
  fi
done
echo "STUCK_PROCS_END"

echo "AUDIT_END"
'
|
||||
|
||||
# ── Proxmox-specific collection ─────────────────────────────────────────────
# collect_proxmox_inventory — audit the Proxmox host, then list VMs and CTs
# with their allocated resources and print an overcommit summary.
# Globals:   PROXMOX_HOST, STUCK_PROC_CPU_WARN, COLLECTOR_SCRIPT, REPORT_DIR (read)
# Outputs:   report text on stdout; raw host data in
#            $REPORT_DIR/proxmox-host.txt
# Returns:   1 if the host collector could not be run, 0 otherwise
collect_proxmox_inventory() {
  log_section "PROXMOX HOST INVENTORY ($PROXMOX_HOST)"

  # Get host-level info. If this fails, the queries below would fail the same
  # way, so report once and skip the inventory instead of printing empty tables.
  local pve_data
  if ! pve_data=$(ssh_stdin "$PROXMOX_HOST" "STUCK_PROC_CPU_WARN=$STUCK_PROC_CPU_WARN bash -s" <<<"$COLLECTOR_SCRIPT"); then
    log_crit "Could not collect host data from $PROXMOX_HOST — skipping Proxmox inventory"
    return 1
  fi
  echo "$pve_data" >"$REPORT_DIR/proxmox-host.txt"
  parse_and_report "proxmox" "$pve_data"

  # VM inventory with resource allocations
  log_subsection "Virtual Machines"
  printf " %-6s %-25s %-8s %6s %8s %10s\n" "VMID" "NAME" "STATUS" "vCPUs" "RAM(GB)" "DISK(GB)"
  printf " %-6s %-25s %-8s %6s %8s %10s\n" "------" "-------------------------" "--------" "------" "--------" "----------"

  local total_vcpus=0 total_vm_ram=0
  local vmid name status mem_mb disk_gb vcpus mem_gb flag

  # First five columns of `qm list`; rows without a numeric VMID are skipped.
  while read -r vmid name status mem_mb disk_gb _; do
    [[ "$vmid" =~ ^[0-9]+$ ]] || continue

    # Get vCPU count from config (cores x sockets, sockets defaulting to 1)
    vcpus=$(ssh_cmd "$PROXMOX_HOST" "qm config $vmid 2>/dev/null | grep -E '^(cores|sockets)'" |
      awk -F: '/cores/{c=$2} /sockets/{s=$2} END{printf "%.0f", (c+0)*(s>0?s:1)}')
    [[ "$vcpus" =~ ^[0-9]+$ ]] || vcpus=0

    mem_gb="?"
    if [[ "$mem_mb" =~ ^[0-9]+$ ]]; then
      mem_gb=$(LC_ALL=C awk -v mb="$mem_mb" 'BEGIN { printf "%.1f", mb / 1024 }')
    fi

    if [[ "$status" == "running" ]]; then
      total_vcpus=$((total_vcpus + vcpus))
      if [[ "$mem_mb" =~ ^[0-9]+$ ]]; then
        total_vm_ram=$((total_vm_ram + mem_mb))
      fi
    fi

    flag=""
    if [[ "$status" == "stopped" ]]; then
      flag=" (wasting disk)"
    elif [[ "$status" == "running" && "$vcpus" -gt 8 ]]; then
      flag=" (heavy)"
    fi

    printf " %-6s %-25s %-8s %6s %8s %10s%s\n" "$vmid" "$name" "$status" "$vcpus" "$mem_gb" "$disk_gb" "$flag"
  done < <(ssh_cmd "$PROXMOX_HOST" "qm list 2>/dev/null" | tail -n +2)

  # CT inventory
  log_subsection "LXC Containers"
  printf " %-6s %-30s %-8s %6s %8s\n" "CTID" "NAME" "STATUS" "vCPUs" "RAM(MB)"
  printf " %-6s %-30s %-8s %6s %8s\n" "------" "------------------------------" "--------" "------" "--------"

  local line ctid ct_cfg ct_cores ct_mem
  local -a fields
  while IFS= read -r line; do
    read -r -a fields <<<"$line"
    ((${#fields[@]} >= 2)) || continue
    ctid=${fields[0]}
    status=${fields[1]}
    name=${fields[-1]} # last column of the row is used as the name

    # Get CT resource config — one SSH round-trip, parsed locally
    ct_cfg=$(ssh_cmd "$PROXMOX_HOST" "pct config $ctid 2>/dev/null")
    ct_cores=$(awk -F: '$1 == "cores" { gsub(/ /, "", $2); print $2 }' <<<"$ct_cfg")
    ct_mem=$(awk -F: '$1 == "memory" { gsub(/ /, "", $2); print $2 }' <<<"$ct_cfg")
    [[ -z "$ct_cores" ]] && ct_cores="(host)"
    [[ -z "$ct_mem" ]] && ct_mem="?"

    if [[ "$status" == "running" && "$ct_cores" =~ ^[0-9]+$ ]]; then
      total_vcpus=$((total_vcpus + ct_cores))
    fi

    printf " %-6s %-30s %-8s %6s %8s\n" "$ctid" "$name" "$status" "$ct_cores" "$ct_mem"
  done < <(ssh_cmd "$PROXMOX_HOST" "pct list 2>/dev/null" | tail -n +2)

  # Physical cores
  local phys_cores
  phys_cores=$(ssh_cmd "$PROXMOX_HOST" "nproc")

  log_subsection "Resource Summary"
  # The ratio is only computed for a positive integer core count; an empty or
  # zero answer would otherwise print ":1" or divide by zero.
  local ratio=""
  if [[ "$phys_cores" =~ ^[1-9][0-9]*$ ]]; then
    ratio=$(LC_ALL=C awk -v v="$total_vcpus" -v c="$phys_cores" 'BEGIN { printf "%.2f", v / c }')
    log_info "Physical cores: $phys_cores"
    log_info "Total allocated vCPUs: $total_vcpus"
    log_info "Overcommit ratio: ${ratio}:1"
  else
    log_info "Physical cores: unknown"
    log_info "Total allocated vCPUs: $total_vcpus"
    log_info "Overcommit ratio: unknown"
  fi

  local total_vm_ram_gb phys_ram
  total_vm_ram_gb=$(LC_ALL=C awk -v mb="$total_vm_ram" 'BEGIN { printf "%.1f", mb / 1024 }')
  phys_ram=$(ssh_cmd "$PROXMOX_HOST" "awk '/MemTotal/{printf \"%.1f\", \$2/1024/1024}' /proc/meminfo")
  log_info "Physical RAM: ${phys_ram} GB"
  log_info "Total allocated VM RAM: ${total_vm_ram_gb} GB"

  if [[ -n "$ratio" ]] && LC_ALL=C awk -v r="$ratio" 'BEGIN { exit !(r + 0 > 1.5) }'; then
    log_warn "vCPU overcommit ratio ${ratio}:1 is high — may cause contention"
  fi
  return 0
}
|
||||
|
||||
# ── Parse collector output and report findings ───────────────────────────────
# _audit_field — print the value of the first "key=value" line whose key is $1
# in the collector output $2 (empty output if the key is absent).
_audit_field() {
  awk -F= -v key="$1" '$1 == key { print $2; exit }' <<<"$2"
}

# parse_and_report — turn one host's collector output into report lines.
# Globals:   UPTIME_DAYS_WARN, LOAD_PER_CORE_WARN/CRIT, MEM_USED_PCT_WARN/CRIT,
#            SWAP_USED_MB_WARN, DISK_USED_PCT_WARN/CRIT, ZOMBIE_WARN (read)
# Arguments: $1 - label of the host (used in the no-data warning)
#            $2 - raw collector output
# Outputs:   report lines on stdout via the log_* helpers
# Returns:   1 if $2 contains no AUDIT_START marker, 0 otherwise
parse_and_report() {
  local label="$1"
  local data="$2"

  # Without the start marker there is nothing to evaluate; reporting empty
  # values as healthy checks would be misleading.
  if ! grep -q '^AUDIT_START' <<<"$data"; then
    log_warn "No audit data received for $label — skipping report"
    return 1
  fi

  # Extract key values
  local hostname uptime_days cpu_cores load_1m load_5m
  local mem_total mem_used mem_used_pct swap_used zombie_count
  hostname=$(_audit_field hostname "$data")
  uptime_days=$(_audit_field uptime_days "$data")
  cpu_cores=$(_audit_field cpu_cores "$data")
  load_1m=$(_audit_field load_1m "$data")
  load_5m=$(_audit_field load_5m "$data")
  mem_total=$(_audit_field mem_total_mb "$data")
  mem_used=$(_audit_field mem_used_mb "$data")
  mem_used_pct=$(_audit_field mem_used_pct "$data")
  swap_used=$(_audit_field swap_used_mb "$data")
  zombie_count=$(_audit_field zombie_count "$data")

  log_subsection "System: $hostname"

  # Uptime
  if [[ "${uptime_days:-0}" -gt "$UPTIME_DAYS_WARN" ]]; then
    log_warn "Uptime: ${uptime_days} days — consider scheduling maintenance reboot"
  else
    log_ok "Uptime: ${uptime_days} days"
  fi

  # Load per core. awk does the float maths, so the check still works on
  # machines without bc (where it used to fall back to 0 and never fire).
  local load_per_core
  load_per_core=$(LC_ALL=C awk -v l="${load_5m:-0}" -v c="${cpu_cores:-1}" \
    'BEGIN { if (c + 0 <= 0) c = 1; printf "%.2f", l / c }')
  if LC_ALL=C awk -v a="$load_per_core" -v b="$LOAD_PER_CORE_CRIT" 'BEGIN { exit !(a + 0 > b + 0) }'; then
    log_crit "Load: ${load_1m}/${load_5m} on ${cpu_cores} cores (${load_per_core}/core) — OVERLOADED"
  elif LC_ALL=C awk -v a="$load_per_core" -v b="$LOAD_PER_CORE_WARN" 'BEGIN { exit !(a + 0 > b + 0) }'; then
    log_warn "Load: ${load_1m}/${load_5m} on ${cpu_cores} cores (${load_per_core}/core) — elevated"
  else
    log_ok "Load: ${load_1m}/${load_5m} on ${cpu_cores} cores (${load_per_core}/core)"
  fi

  # Memory
  if [[ "${mem_used_pct:-0}" -gt "$MEM_USED_PCT_CRIT" ]]; then
    log_crit "Memory: ${mem_used}/${mem_total} MB (${mem_used_pct}%) — CRITICAL"
  elif [[ "${mem_used_pct:-0}" -gt "$MEM_USED_PCT_WARN" ]]; then
    log_warn "Memory: ${mem_used}/${mem_total} MB (${mem_used_pct}%)"
  else
    log_ok "Memory: ${mem_used}/${mem_total} MB (${mem_used_pct}%)"
  fi

  # Swap
  if [[ "${swap_used:-0}" -gt "$SWAP_USED_MB_WARN" ]]; then
    log_warn "Swap: ${swap_used} MB in use"
  fi

  # Disk
  local mount size used avail pct pct_num
  echo "$data" | sed -n '/DISK_START/,/DISK_END/p' | grep "^disk|" | while IFS='|' read -r _ mount size used avail pct; do
    pct_num=${pct%\%}
    if [[ ! "$pct_num" =~ ^[0-9]+$ ]]; then
      # A non-numeric percentage cannot be compared against the thresholds.
      log_info "Disk $mount: ${used}/${size} (usage unknown)"
    elif [[ "$pct_num" -gt "$DISK_USED_PCT_CRIT" ]]; then
      log_crit "Disk $mount: ${used}/${size} (${pct}) — CRITICAL"
    elif [[ "$pct_num" -gt "$DISK_USED_PCT_WARN" ]]; then
      log_warn "Disk $mount: ${used}/${size} (${pct})"
    else
      log_ok "Disk $mount: ${used}/${size} (${pct})"
    fi
  done

  # Zombies — warn once the count reaches the configured ZOMBIE_WARN
  if [[ "${zombie_count:-0}" -gt 0 && "${zombie_count:-0}" -ge "${ZOMBIE_WARN:-1}" ]]; then
    log_warn "Zombie processes: $zombie_count"
  fi

  # Stuck processes
  local stuck_procs pid cpu age comm
  stuck_procs=$(echo "$data" | sed -n '/STUCK_PROCS_START/,/STUCK_PROCS_END/p' | grep "^stuck|")
  if [[ -n "$stuck_procs" ]]; then
    log_warn "Stuck/runaway processes detected:"
    echo "$stuck_procs" | while IFS='|' read -r _ pid cpu age comm; do
      log_info "PID $pid: $comm at ${cpu}% CPU for $age"
    done
  fi

  # Failed systemd units
  local failed unit
  failed=$(echo "$data" | sed -n '/FAILED_UNITS_START/,/FAILED_UNITS_END/p' | grep "^failed_unit|")
  if [[ -n "$failed" ]]; then
    log_warn "Failed systemd units:"
    echo "$failed" | while IFS='|' read -r _ unit; do
      log_info "$unit"
    done
  fi

  # Docker containers
  local docker_data name mem net pids
  docker_data=$(echo "$data" | sed -n '/DOCKER_START/,/DOCKER_END/p' | grep "^docker|")
  if [[ -n "$docker_data" ]]; then
    echo ""
    log_info "Docker containers:"
    printf " %-30s %8s %s\n" "CONTAINER" "CPU%" "MEMORY"
    echo "$docker_data" | while IFS='|' read -r _ name cpu mem net pids; do
      printf " %-30s %8s %s\n" "$name" "$cpu" "$mem"
    done
  fi

  # Stopped docker containers
  local stopped_containers status image ports
  stopped_containers=$(echo "$data" | sed -n '/DOCKER_CONTAINERS_START/,/DOCKER_CONTAINERS_END/p' | grep "^container|" | grep -i "exited")
  if [[ -n "$stopped_containers" ]]; then
    log_warn "Stopped Docker containers (wasting disk):"
    echo "$stopped_containers" | while IFS='|' read -r _ name status image ports; do
      log_info "$name ($image) — $status"
    done
  fi
  return 0
}
|
||||
|
||||
# ── Collect from individual VM/CT guests ─────────────────────────────────────
# collect_guest — run the collector on one host and report its findings.
# Arguments: $1 - label (also the basename of the raw file in REPORT_DIR)
#            $2 - SSH target
# On SSH failure a warning is logged and the host is skipped.
collect_guest() {
  local label="$1" ssh_target="$2"
  local collected

  if ! collected=$(ssh_stdin "$ssh_target" "STUCK_PROC_CPU_WARN=$STUCK_PROC_CPU_WARN bash -s" <<<"$COLLECTOR_SCRIPT"); then
    log_warn "Could not connect to $label ($ssh_target)"
    return
  fi

  # Keep the raw output next to the other per-host files, then summarise it.
  echo "$collected" >"$REPORT_DIR/${label}.txt"
  parse_and_report "$label" "$collected"
}
|
||||
|
||||
# ── Build SSH target map from Proxmox ────────────────────────────────────────
# build_guest_map — fill an associative array with label -> IPv4 address for
# every running VM and CT that an address can be found for.
# Globals:   PROXMOX_HOST (read)
# Arguments: $1 - NAME of the caller's associative array (bound as a nameref)
# Labels are "vm-<vmid>-<name>" and "ct-<ctid>-<name>". VM addresses come from
# the guest agent; CT addresses from lxc-info, falling back to the static ip=
# entry in `pct config`. Guests without a discoverable address are left out.
build_guest_map() {
  local -n map_ref=$1
  local vmid ctid name status ip line
  local -a fields

  # Get VM IPs via guest agent
  while read -r vmid name status _; do
    [[ "$vmid" =~ ^[0-9]+$ ]] || continue
    [[ "$status" != "running" ]] && continue

    # The Python filter prints the first non-loopback IPv4 address; errors
    # (stderr and exit status) are ignored and leave ip empty.
    ip=$(ssh_cmd "$PROXMOX_HOST" "qm guest cmd $vmid network-get-interfaces 2>/dev/null" |
      python3 -c "
import sys, json
data = json.load(sys.stdin)
for iface in data:
    if iface.get('name') in ('lo',): continue
    for addr in iface.get('ip-addresses', []):
        if addr['ip-address-type'] == 'ipv4' and not addr['ip-address'].startswith('127.'):
            print(addr['ip-address'])
            sys.exit()
" 2>/dev/null) || true

    if [[ -n "$ip" ]]; then
      map_ref["vm-${vmid}-${name}"]="$ip"
    fi
  done < <(ssh_cmd "$PROXMOX_HOST" "qm list 2>/dev/null" | tail -n +2)

  # Get CT IPs
  while IFS= read -r line; do
    read -r -a fields <<<"$line"
    ((${#fields[@]} >= 2)) || continue
    ctid=${fields[0]}
    status=${fields[1]}
    name=${fields[-1]}
    [[ "$status" != "running" ]] && continue

    ip=$(ssh_cmd "$PROXMOX_HOST" "lxc-info -n $ctid -iH 2>/dev/null | head -1") || true
    if [[ -z "$ip" ]]; then
      ip=$(ssh_cmd "$PROXMOX_HOST" "pct config $ctid 2>/dev/null | grep -oP 'ip=\K[0-9.]+'") || true
    fi
    # A container with several static NICs yields one address per line; keep
    # only the first so the SSH target is a single host.
    ip=${ip%%$'\n'*}

    if [[ -n "$ip" ]]; then
      map_ref["ct-${ctid}-${name}"]="$ip"
    fi
  done < <(ssh_cmd "$PROXMOX_HOST" "pct list 2>/dev/null" | tail -n +2)
}
|
||||
|
||||
# ── Summary and recommendations ──────────────────────────────────────────────
# generate_summary — print the closing section: where the raw data lives, how
# many lines of SSH errors were logged (if any), and the follow-up checklist.
generate_summary() {
  local failure_log="$REPORT_DIR/ssh-failures.log"

  log_section "AUDIT SUMMARY & RECOMMENDATIONS"

  printf '%s\n' \
    "" \
    " Raw data saved to: $REPORT_DIR/" \
    ""

  # Only mention SSH failures when the log exists and is non-empty.
  if [[ -s "$failure_log" ]]; then
    local ssh_fail_count
    ssh_fail_count=$(wc -l <"$REPORT_DIR/ssh-failures.log")
    log_warn "SSH failures: $ssh_fail_count error(s) logged to $REPORT_DIR/ssh-failures.log"
    echo ""
  fi

  printf '%s\n' \
    " Review the flags above (⚠ warnings, ✖ critical) and consider:" \
    " 1. Kill stuck/runaway processes immediately" \
    " 2. Restart or remove failed systemd units" \
    " 3. Clean up stopped Docker containers and unused images" \
    " 4. Right-size VM/CT resource allocations based on actual usage" \
    " 5. Shut down or decommission VMs marked as stopped/unused" \
    " 6. Schedule maintenance reboots for long-uptime hosts" \
    " 7. Adjust monitoring thresholds to use per-core load metrics" \
    ""
}
|
||||
|
||||
# ── Main ─────────────────────────────────────────────────────────────────────
# main — run the whole audit: Proxmox host and inventory, every discovered
# guest (in sorted label order), the physical hosts, then the summary.
# Globals: PHYSICAL_HOSTS (read)
main() {
  echo "╔══════════════════════════════════════════════════════════════╗"
  echo "║ HOMELAB INFRASTRUCTURE AUDIT ║"
  echo "║ $(date '+%Y-%m-%d %H:%M:%S') ║"
  echo "╚══════════════════════════════════════════════════════════════╝"

  # 1. Proxmox host + inventory
  collect_proxmox_inventory

  # 2. Discover and audit all guests
  log_section "GUEST AUDITS"
  local -A guest_map=()
  build_guest_map guest_map

  # Labels are read line by line into an array so they are never re-split or
  # glob-expanded. The size guard avoids iterating over the single empty line
  # printf would emit for an empty map.
  if ((${#guest_map[@]} > 0)); then
    local -a sorted_labels=()
    local label
    mapfile -t sorted_labels < <(printf '%s\n' "${!guest_map[@]}" | sort)
    for label in "${sorted_labels[@]}"; do
      collect_guest "$label" "${guest_map[$label]}"
    done
  fi

  # 3. Physical hosts
  log_section "PHYSICAL HOSTS"
  local host
  for host in "${PHYSICAL_HOSTS[@]}"; do
    collect_guest "$host" "$host"
  done

  # 4. Summary
  generate_summary
}
|
||||
|
||||
# Run the audit with stderr merged into stdout. When --output FILE was given,
# the combined stream is also copied to FILE via tee.
if [[ -z "$OUTPUT_FILE" ]]; then
  main 2>&1
else
  main 2>&1 | tee "$OUTPUT_FILE"
fi
|
||||
Loading…
Reference in New Issue
Block a user