diff --git a/monitoring/scripts/homelab-audit.sh b/monitoring/scripts/homelab-audit.sh
index 3e25aa3..da2cf7d 100755
--- a/monitoring/scripts/homelab-audit.sh
+++ b/monitoring/scripts/homelab-audit.sh
@@ -99,6 +99,12 @@ oom_events() {
   echo "${count:-0}"
 }
 
+# Print field 16 of the second (last) vmstat sample, used as the I/O-wait
+# percentage; presumably the "wa" column -- confirm on hosts with another layout.
+io_wait_pct() {
+  vmstat 1 2 2>/dev/null | tail -1 | awk '\''{print $16}'\''
+}
+
 echo "CPU_LOAD=$(cpu_load)"
 echo "MEM_PCT=$(mem_pct)"
 echo "ZOMBIES=$(zombie_count)"
@@ -106,6 +112,7 @@ echo "STUCK_PROCS=$(stuck_procs)"
 echo "ZOMBIE_PARENTS=$(zombie_parents)"
 echo "SWAP_MB=$(swap_mb)"
 echo "OOM_EVENTS=$(oom_events)"
+echo "IO_WAIT=$(io_wait_pct)"
 disk_usage | while read -r pct mnt; do
   echo "DISK $pct $mnt"
 done
@@ -231,6 +238,14 @@ parse_and_report() {
         echo "WARN $label: ${ooms} OOM kill event(s) in last 7 days" >>"$FINDINGS_FILE"
       fi
       ;;
+    IO_WAIT=*)
+      # The value is produced by the audited host: only hand it to (( )) once
+      # it is a plain integer, and force base 10 so "08"/"09" are not octal.
+      local iowait="${line#IO_WAIT=}"
+      if [[ "$iowait" =~ ^[0-9]+$ ]] && ((10#$iowait > 20)); then
+        echo "WARN $label: I/O wait ${iowait}% > 20%" >>"$FINDINGS_FILE"
+      fi
+      ;;
     DISK\ *)
       local pct mnt
       read -r _ pct mnt <<<"$line"
@@ -286,6 +301,132 @@ generate_summary() {
   echo "Reports: $REPORT_DIR"
 }
 
+# ---------------------------------------------------------------------------
+# Proxmox backup recency — queries vzdump task history via pvesh (runs locally)
+# ---------------------------------------------------------------------------
+# Appends one finding per running guest whose newest successful vzdump task is
+# missing (CRIT) or older than 7 days (WARN); guests backed up within the last
+# 7 days produce no output.
+# Globals:   REPORT_DIR (read), FINDINGS_FILE (appended), PVE_NODE (optional)
+# Arguments: none
+# NOTE(review): only the 50 most recent vzdump tasks are inspected, so on a
+# busy node an existing backup can fall outside that window and be reported
+# as "no backup found".
+check_backup_recency() {
+  local tasks_json_file="$REPORT_DIR/vzdump-tasks.json"
+  # Node name; defaults to the previously hard-coded "proxmox".
+  local node="${PVE_NODE:-proxmox}"
+  pvesh get "/nodes/${node}/tasks" --typefilter vzdump --limit 50 --output-format json \
+    >"$tasks_json_file" 2>/dev/null || {
+    echo "WARN proxmox: failed to query vzdump task history" >>"$FINDINGS_FILE"
+    return 0
+  }
+
+  # An empty reply cannot be analysed; report it instead of skipping silently.
+  if [[ ! -s "$tasks_json_file" ]]; then
+    echo "WARN proxmox: vzdump task history is empty" >>"$FINDINGS_FILE"
+    return 0
+  fi
+
+  # IDs of running containers (pct) and VMs (qm); NR>1 skips the header row.
+  local running_ids=()
+  mapfile -t running_ids < <(
+    pct list 2>/dev/null | awk 'NR>1 && $2=="running"{print $1}'
+    qm list 2>/dev/null | awk 'NR>1 && $3=="running"{print $1}'
+  )
+  ((${#running_ids[@]} > 0)) || return 0
+
+  local week_ago
+  week_ago=$(($(date +%s) - 7 * 86400))
+
+  # The embedded script prints findings on stdout (appended to FINDINGS_FILE);
+  # if it cannot run at all, that is recorded as a finding as well.
+  if ! python3 - "$tasks_json_file" "$week_ago" "${running_ids[@]}" <<'PYEOF' >>"$FINDINGS_FILE"; then
+import datetime
+import json
+import sys
+
+tasks_file, week_ago = sys.argv[1], int(sys.argv[2])
+running_ids = set(sys.argv[3:])
+
+try:
+    with open(tasks_file) as fh:
+        tasks = json.load(fh)
+    if not isinstance(tasks, list):
+        raise ValueError("task list is not a JSON array")
+except (OSError, ValueError) as exc:  # JSONDecodeError is a ValueError
+    print(f"WARN proxmox: cannot parse vzdump task history ({exc})")
+    sys.exit(0)
+
+# Newest successful vzdump end time per guest ID.
+last_backup = {}
+for task in tasks:
+    if not isinstance(task, dict):
+        continue
+    if task.get("type") != "vzdump" or task.get("status") != "OK":
+        continue
+    vmid = str(task.get("id", ""))
+    try:
+        endtime = int(task.get("endtime", 0))
+    except (TypeError, ValueError):
+        continue
+    if vmid and endtime > last_backup.get(vmid, 0):
+        last_backup[vmid] = endtime
+
+for vmid in sorted(running_ids):
+    ts = last_backup.get(vmid)
+    if ts is None:
+        print(f"CRIT proxmox/vm-{vmid}: no backup found in task history")
+    elif ts < week_ago:
+        dt = datetime.datetime.fromtimestamp(ts).strftime("%Y-%m-%d")
+        print(f"WARN proxmox/vm-{vmid}: last backup {dt} is older than 7 days")
+PYEOF
+    echo "WARN proxmox: backup recency analysis failed" >>"$FINDINGS_FILE"
+  fi
+}
+
+# ---------------------------------------------------------------------------
+# Certificate expiry check — runs from the audit host via openssl
+# ---------------------------------------------------------------------------
+# Probes ports 443 and 8443 on a host and appends a finding when the served
+# certificate has expired or expires within 7 days (CRIT) or within 14 days
+# (WARN). Ports that do not yield a parsable certificate are skipped.
+# Globals:   FINDINGS_FILE (appended)
+# Arguments: $1 - label used in findings, $2 - host name or address to probe
+check_cert_expiry() {
+  local label="$1"
+  local addr="$2"
+  local now port enddate expiry_str expiry_epoch days_left
+  now=$(date +%s)
+
+  [[ -n "$addr" ]] || return 0
+
+  for port in 443 8443; do
+    # Unreachable port, no TLS, or unparsable certificate: nothing to report.
+    enddate=$(timeout 10 openssl s_client -connect "${addr}:${port}" </dev/null 2>/dev/null |
+      openssl x509 -noout -enddate 2>/dev/null) || continue
+    [[ -n "$enddate" ]] || continue
+
+    expiry_str="${enddate#notAfter=}"
+    # date -d (GNU date) parses the notAfter timestamp into epoch seconds.
+    expiry_epoch=$(date -d "$expiry_str" +%s 2>/dev/null) || continue
+
+    # Report an already-expired certificate as such; the integer division
+    # below would otherwise print it as expiring in 0 or negative days.
+    if ((expiry_epoch <= now)); then
+      echo "CRIT $label: TLS cert on :${port} expired on ${expiry_str}" >>"$FINDINGS_FILE"
+      continue
+    fi
+
+    days_left=$(((expiry_epoch - now) / 86400))
+    if ((days_left <= 7)); then
+      echo "CRIT $label: TLS cert on :${port} expires in ${days_left} days" >>"$FINDINGS_FILE"
+    elif ((days_left <= 14)); then
+      echo "WARN $label: TLS cert on :${port} expires in ${days_left} days" >>"$FINDINGS_FILE"
+    fi
+  done
+}
+
 # ---------------------------------------------------------------------------
 # Main
 # ---------------------------------------------------------------------------
@@ -297,10 +438,14 @@ main() {
 
   >"$FINDINGS_FILE"
 
+  echo " Checking Proxmox backup recency..."
+  check_backup_recency
+
   local host_count=0
   while read -r label addr; do
     echo " Auditing $label ($addr)..."
     parse_and_report "$label" "$addr"
+    check_cert_expiry "$label" "$addr"
     ((host_count++)) || true
   done < <(collect_inventory)
 