feat: add backup recency, cert expiry, OOM, and I/O wait checks (#25)
All checks were successful
Auto-merge docs-only PRs / auto-merge-docs (pull_request) Successful in 2s
All checks were successful
Auto-merge docs-only PRs / auto-merge-docs (pull_request) Successful in 2s
Closes #25

- check_backup_recency(): queries the vzdump task history via pvesh; flags running guests (containers and VMs) with no backup in the history (CRIT) or no backup in the last 7 days (WARN)
- check_cert_expiry(): probes ports 443/8443 per host via openssl; flags certs expiring in ≤7 days (CRIT) or ≤14 days (WARN)
- io_wait_pct() in COLLECTOR_SCRIPT: uses `vmstat 1 2` to sample I/O wait; flagged as WARN when > 20%
- OOM kill history was already collected via journalctl; no changes needed

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
e58c5b8cc1
commit
ae5da035f6
@ -99,6 +99,10 @@ oom_events() {
|
||||
echo "${count:-0}"
|
||||
}
|
||||
|
||||
io_wait_pct() {
|
||||
vmstat 1 2 2>/dev/null | tail -1 | awk '\''{print $16}'\''
|
||||
}
|
||||
|
||||
echo "CPU_LOAD=$(cpu_load)"
|
||||
echo "MEM_PCT=$(mem_pct)"
|
||||
echo "ZOMBIES=$(zombie_count)"
|
||||
@ -106,6 +110,7 @@ echo "STUCK_PROCS=$(stuck_procs)"
|
||||
echo "ZOMBIE_PARENTS=$(zombie_parents)"
|
||||
echo "SWAP_MB=$(swap_mb)"
|
||||
echo "OOM_EVENTS=$(oom_events)"
|
||||
echo "IO_WAIT=$(io_wait_pct)"
|
||||
disk_usage | while read -r pct mnt; do
|
||||
echo "DISK $pct $mnt"
|
||||
done
|
||||
@ -231,6 +236,12 @@ parse_and_report() {
|
||||
echo "WARN $label: ${ooms} OOM kill event(s) in last 7 days" >>"$FINDINGS_FILE"
|
||||
fi
|
||||
;;
|
||||
IO_WAIT=*)
|
||||
local iowait="${line#IO_WAIT=}"
|
||||
if [[ -n "$iowait" ]] && ((iowait > 20)); then
|
||||
echo "WARN $label: I/O wait ${iowait}% > 20%" >>"$FINDINGS_FILE"
|
||||
fi
|
||||
;;
|
||||
DISK\ *)
|
||||
local pct mnt
|
||||
read -r _ pct mnt <<<"$line"
|
||||
@ -286,6 +297,92 @@ generate_summary() {
|
||||
echo "Reports: $REPORT_DIR"
|
||||
}
|
||||
|
||||
# ---------------------------------------------------------------------------
# Proxmox backup recency — queries vzdump task history via pvesh (runs locally)
# ---------------------------------------------------------------------------
# Compares every running container (pct) and VM (qm) against the most recent
# successful vzdump task recorded for it.
# Globals:   REPORT_DIR (read; the raw task JSON is stored there)
#            FINDINGS_FILE (appended to)
# Arguments: none
# Outputs:   "CRIT proxmox/vm-<id>: ..." when no successful backup is in the
#            fetched history, "WARN proxmox/vm-<id>: ..." when the newest one
#            is older than 7 days, "WARN proxmox: ..." when the history cannot
#            be queried or parsed.
# Returns:   0 in all handled cases.
check_backup_recency() {
  local tasks_json_file="$REPORT_DIR/vzdump-tasks.json"

  # Only the 50 most recent vzdump tasks of the node named "proxmox" are
  # fetched; anything older than that window is invisible to this check.
  if ! pvesh get /nodes/proxmox/tasks --typefilter vzdump --limit 50 \
    --output-format json >"$tasks_json_file" 2>/dev/null; then
    echo "WARN proxmox: failed to query vzdump task history" >>"$FINDINGS_FILE"
    return 0
  fi

  # pvesh succeeded but wrote nothing: there is nothing to evaluate.
  if [[ ! -s "$tasks_json_file" ]]; then
    return 0
  fi

  # Collect the IDs of running guests; stopped guests are not checked.
  local running_ids=()
  local ctid vmid
  while read -r ctid; do
    running_ids+=("$ctid")
  done < <(pct list 2>/dev/null | awk 'NR>1 && $2=="running"{print $1}')
  while read -r vmid; do
    running_ids+=("$vmid")
  done < <(qm list 2>/dev/null | awk 'NR>1 && $3=="running"{print $1}')

  if [[ ${#running_ids[@]} -eq 0 ]]; then
    return 0
  fi

  local week_ago
  week_ago=$(($(date +%s) - 7 * 86400))

  # argv: <tasks json> <cutoff epoch> <running id>...; stdout lines are findings.
  python3 - "$tasks_json_file" "$week_ago" "${running_ids[@]}" <<'PYEOF' >>"$FINDINGS_FILE"
import sys, json, datetime

tasks_file, week_ago = sys.argv[1], int(sys.argv[2])
running_ids = set(sys.argv[3:])

# An unreadable or malformed history must surface as a finding; exiting
# silently here would make a broken check look like "all backups fine".
try:
    with open(tasks_file) as fh:
        tasks = json.load(fh)
    if not isinstance(tasks, list):
        raise ValueError("task history is not a JSON list")
except (OSError, ValueError):
    print("WARN proxmox: could not parse vzdump task history")
    sys.exit(0)

# Newest successful vzdump end time per guest id.
last_backup = {}
for task in tasks:
    if not isinstance(task, dict):
        continue
    if task.get("type") != "vzdump" or task.get("status") != "OK":
        continue
    vmid = str(task.get("id", ""))
    try:
        endtime = int(task.get("endtime") or 0)
    except (TypeError, ValueError):
        # A task without a usable end time cannot prove a backup happened.
        continue
    if vmid and endtime and endtime > last_backup.get(vmid, 0):
        last_backup[vmid] = endtime

for vmid in sorted(running_ids):
    ts = last_backup.get(vmid)
    if ts and ts >= week_ago:
        continue
    if ts:
        dt = datetime.datetime.fromtimestamp(ts).strftime("%Y-%m-%d")
        print(f"WARN proxmox/vm-{vmid}: last backup {dt} is older than 7 days")
    else:
        print(f"CRIT proxmox/vm-{vmid}: no backup found in task history")
PYEOF
}
|
||||
|
||||
# ---------------------------------------------------------------------------
# Certificate expiry check — runs from the audit host via openssl
# ---------------------------------------------------------------------------
# Probes TLS on ports 443 and 8443 of one host and records certificates that
# have expired or are close to expiry.
# Globals:   FINDINGS_FILE (appended to)
# Arguments: $1 - label used as the finding prefix
#            $2 - address passed to "openssl s_client -connect"
# Outputs:   CRIT when the cert has expired or has <= 7 days left,
#            WARN when it has <= 14 days left; nothing otherwise.
# Returns:   0 always; ports that do not answer are skipped silently.
check_cert_expiry() {
  local label="$1"
  local addr="$2"
  # Declared up front so that nothing (notably the loop variable) leaks into
  # the scope of the caller.
  local now port enddate expiry_str expiry_epoch days_left
  now=$(date +%s)

  for port in 443 8443; do
    # The 10 s timeout bounds the probe of a single port.
    enddate=$(echo | timeout 10 openssl s_client -connect "${addr}:${port}" 2>/dev/null |
      openssl x509 -noout -enddate 2>/dev/null) || continue
    if [[ -z "$enddate" ]]; then
      continue
    fi

    # openssl prints "notAfter=<date>"; date -d converts it to epoch seconds.
    expiry_str="${enddate#notAfter=}"
    expiry_epoch=$(date -d "$expiry_str" +%s 2>/dev/null) || continue

    # Report an expired cert as such rather than "expires in -N days".
    if ((expiry_epoch <= now)); then
      echo "CRIT $label: TLS cert on :${port} expired $(((now - expiry_epoch) / 86400)) days ago" >>"$FINDINGS_FILE"
      continue
    fi

    days_left=$(((expiry_epoch - now) / 86400))
    if ((days_left <= 7)); then
      echo "CRIT $label: TLS cert on :${port} expires in ${days_left} days" >>"$FINDINGS_FILE"
    elif ((days_left <= 14)); then
      echo "WARN $label: TLS cert on :${port} expires in ${days_left} days" >>"$FINDINGS_FILE"
    fi
  done

  return 0
}
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Main
|
||||
# ---------------------------------------------------------------------------
|
||||
@ -297,10 +394,14 @@ main() {
|
||||
|
||||
>"$FINDINGS_FILE"
|
||||
|
||||
echo " Checking Proxmox backup recency..."
|
||||
check_backup_recency
|
||||
|
||||
local host_count=0
|
||||
while read -r label addr; do
|
||||
echo " Auditing $label ($addr)..."
|
||||
parse_and_report "$label" "$addr"
|
||||
check_cert_expiry "$label" "$addr"
|
||||
((host_count++)) || true
|
||||
done < <(collect_inventory)
|
||||
|
||||
|
||||
Loading…
Reference in New Issue
Block a user