feat: add backup recency, cert expiry, OOM, and I/O wait checks (#25)
All checks were successful
Auto-merge docs-only PRs / auto-merge-docs (pull_request) Successful in 2s

Closes #25

- check_backup_recency(): queries pvesh vzdump task history; flags running
  VMs/containers with no successful backup in the task history (CRIT) or
  none within the last 7 days (WARN)
- check_cert_expiry(): probes ports 443/8443 per host via openssl;
  flags certs expiring in ≤7 days (CRIT) or in ≤14 days (WARN)
- io_wait_pct() in COLLECTOR_SCRIPT: uses vmstat 1 2 to sample I/O
  wait; flagged as WARN when > 20%
- OOM kill history was already collected via journalctl; no changes needed

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Cal Corum 2026-04-02 21:06:44 -05:00
parent e58c5b8cc1
commit ae5da035f6

View File

@ -99,6 +99,10 @@ oom_events() {
echo "${count:-0}"
}
io_wait_pct() {
    # Sample vmstat twice at a one-second interval and keep only the last
    # output line, i.e. the second sample. Field 16 of that line is printed;
    # it is expected to be the "wa" (I/O wait) column of procps vmstat.
    # Prints nothing when vmstat is unavailable (its stderr is discarded).
    vmstat 1 2 2>/dev/null |
        tail -n 1 |
        awk '\''{ print $16 }'\''
}
echo "CPU_LOAD=$(cpu_load)"
echo "MEM_PCT=$(mem_pct)"
echo "ZOMBIES=$(zombie_count)"
@ -106,6 +110,7 @@ echo "STUCK_PROCS=$(stuck_procs)"
echo "ZOMBIE_PARENTS=$(zombie_parents)"
echo "SWAP_MB=$(swap_mb)"
echo "OOM_EVENTS=$(oom_events)"
echo "IO_WAIT=$(io_wait_pct)"
disk_usage | while read -r pct mnt; do
echo "DISK $pct $mnt"
done
@ -231,6 +236,12 @@ parse_and_report() {
echo "WARN $label: ${ooms} OOM kill event(s) in last 7 days" >>"$FINDINGS_FILE"
fi
;;
IO_WAIT=*)
local iowait="${line#IO_WAIT=}"
if [[ -n "$iowait" ]] && ((iowait > 20)); then
echo "WARN $label: I/O wait ${iowait}% > 20%" >>"$FINDINGS_FILE"
fi
;;
DISK\ *)
local pct mnt
read -r _ pct mnt <<<"$line"
@ -286,6 +297,92 @@ generate_summary() {
echo "Reports: $REPORT_DIR"
}
# ---------------------------------------------------------------------------
# Proxmox backup recency — queries vzdump task history via pvesh (runs locally)
# ---------------------------------------------------------------------------
# For every running container (pct list) and VM (qm list), appends to
# $FINDINGS_FILE:
#   CRIT  when no successful (status OK) vzdump task for the guest is present
#         in the fetched task history
#   WARN  when the newest successful vzdump task ended more than 7 days ago
#   WARN  when the task history cannot be queried, parsed or analysed
# Globals:
#   REPORT_DIR         (read)     raw task JSON is saved as vzdump-tasks.json
#   FINDINGS_FILE      (appended) one finding per line
#   PROXMOX_NODE       (read, optional) node name to query; default "proxmox"
#   VZDUMP_TASK_LIMIT  (read, optional) max tasks to fetch; default 50
# Arguments: none
# Returns:   0 in all handled cases (this check is best-effort)
# NOTE(review): guests are matched on each task's "id" field; a vzdump task
# that covers several guests may not carry a per-guest id and would then be
# reported as "no backup found" — confirm against real task data.
check_backup_recency() {
    local node="${PROXMOX_NODE:-proxmox}"
    local task_limit="${VZDUMP_TASK_LIMIT:-50}"
    local tasks_json_file="$REPORT_DIR/vzdump-tasks.json"

    if ! pvesh get "/nodes/${node}/tasks" --typefilter vzdump --limit "$task_limit" \
        --output-format json >"$tasks_json_file" 2>/dev/null; then
        echo "WARN proxmox: failed to query vzdump task history" >>"$FINDINGS_FILE"
        return 0
    fi
    if [[ ! -s "$tasks_json_file" ]]; then
        return 0
    fi

    # Collect the IDs of all running guests: containers first, then VMs.
    local -a running_ids=()
    local guest_id
    while read -r guest_id; do
        if [[ -n "$guest_id" ]]; then
            running_ids+=("$guest_id")
        fi
    done < <(
        pct list 2>/dev/null | awk 'NR>1 && $2=="running"{print $1}'
        qm list 2>/dev/null | awk 'NR>1 && $3=="running"{print $1}'
    )
    if ((${#running_ids[@]} == 0)); then
        return 0
    fi

    local week_ago
    week_ago=$(($(date +%s) - 7 * 86400))

    # The Python helper prints findings on stdout, which is appended to the
    # findings file. A non-zero exit (python3 missing, unexpected error) is
    # reported as a finding instead of being lost or aborting the audit.
    if ! python3 - "$tasks_json_file" "$week_ago" "${running_ids[@]}" <<'PYEOF' >>"$FINDINGS_FILE"; then
import datetime
import json
import sys

tasks_file, week_ago = sys.argv[1], int(sys.argv[2])
running_ids = set(sys.argv[3:])

try:
    with open(tasks_file) as fh:
        tasks = json.load(fh)
except (OSError, ValueError):
    tasks = None

if not isinstance(tasks, list):
    # Surface the problem instead of silently skipping the whole check.
    print("WARN proxmox: could not parse vzdump task history")
    sys.exit(0)

# Newest successful backup end time (epoch seconds) per guest id.
last_backup = {}
for task in tasks:
    if not isinstance(task, dict):
        continue
    if task.get("type") != "vzdump" or task.get("status") != "OK":
        continue
    vmid = str(task.get("id") or "")
    try:
        endtime = int(task.get("endtime") or 0)
    except (TypeError, ValueError):
        continue
    if vmid and endtime > last_backup.get(vmid, 0):
        last_backup[vmid] = endtime


def id_order(value):
    # Numeric ids sort numerically (9 before 100); anything else goes last.
    return (int(value) if value.isdigit() else float("inf"), value)


for vmid in sorted(running_ids, key=id_order):
    ts = last_backup.get(vmid)
    if ts and ts >= week_ago:
        continue
    if ts:
        dt = datetime.datetime.fromtimestamp(ts).strftime("%Y-%m-%d")
        print(f"WARN proxmox/vm-{vmid}: last backup {dt} is older than 7 days")
    else:
        print(f"CRIT proxmox/vm-{vmid}: no backup found in task history")
PYEOF
        echo "WARN proxmox: backup recency analysis failed" >>"$FINDINGS_FILE"
    fi
}
# ---------------------------------------------------------------------------
# Certificate expiry check — runs from the audit host via openssl
# ---------------------------------------------------------------------------
# Connects to each port of a host with openssl s_client, reads the served
# certificate's notAfter date and appends a finding to $FINDINGS_FILE:
#   CRIT  certificate already expired, or expires in <= 7 days
#   WARN  certificate expires in <= 14 days
# Ports that cannot be reached, serve no certificate, or yield an unparsable
# date are skipped silently (best-effort probe).
# Globals:
#   FINDINGS_FILE (appended)
# Arguments:
#   $1   label used as the finding prefix
#   $2   address (host name or IP) to connect to
#   $3+  optional list of ports to probe; defaults to 443 and 8443
check_cert_expiry() {
    local label="$1"
    local addr="$2"
    local -a ports=("${@:3}")
    if ((${#ports[@]} == 0)); then
        ports=(443 8443)
    fi

    local now port enddate expiry_str expiry_epoch days_left
    now=$(date +%s)

    for port in "${ports[@]}"; do
        # stdin comes from /dev/null so s_client exits after the handshake and
        # never consumes input belonging to the caller (e.g. a read loop).
        enddate=$(timeout 10 openssl s_client -connect "${addr}:${port}" </dev/null 2>/dev/null |
            openssl x509 -noout -enddate 2>/dev/null) || continue
        [[ -n "$enddate" ]] || continue

        expiry_str="${enddate#notAfter=}"
        expiry_epoch=$(date -d "$expiry_str" +%s 2>/dev/null) || continue

        # Integer division truncates toward zero, so a certificate that has
        # already expired would otherwise read "expires in 0 days" or a
        # negative number; report it explicitly instead.
        if ((expiry_epoch <= now)); then
            echo "CRIT $label: TLS cert on :${port} has expired (notAfter=${expiry_str})" >>"$FINDINGS_FILE"
            continue
        fi

        days_left=$(((expiry_epoch - now) / 86400))
        if ((days_left <= 7)); then
            echo "CRIT $label: TLS cert on :${port} expires in ${days_left} days" >>"$FINDINGS_FILE"
        elif ((days_left <= 14)); then
            echo "WARN $label: TLS cert on :${port} expires in ${days_left} days" >>"$FINDINGS_FILE"
        fi
    done
}
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
@ -297,10 +394,14 @@ main() {
>"$FINDINGS_FILE"
echo " Checking Proxmox backup recency..."
check_backup_recency
local host_count=0
while read -r label addr; do
echo " Auditing $label ($addr)..."
parse_and_report "$label" "$addr"
check_cert_expiry "$label" "$addr"
((host_count++)) || true
done < <(collect_inventory)