claude-home/monitoring/scripts/proxmox-backup-check.sh
Cal Corum 95bae33309
All checks were successful
Auto-merge docs-only PRs / auto-merge-docs (pull_request) Successful in 2s
feat: add weekly Proxmox backup verification and CT 302 self-health check (#27)
Closes #27

- proxmox-backup-check.sh: SSHes to Proxmox, queries pvesh task history,
  classifies each running VM/CT as green/yellow/red by backup recency,
  posts a Discord embed summary. Designed for weekly cron on CT 302.

- ct302-self-health.sh: Checks disk usage on CT 302 itself, silently
  exits when healthy, posts a Discord alert when any filesystem exceeds
  80% threshold. Closes the blind spot where the monitoring system
  cannot monitor itself externally.

- Updated monitoring/scripts/CONTEXT.md with full operational docs,
  install instructions, and cron schedules for both new scripts.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-04 06:07:57 -05:00

231 lines
7.4 KiB
Bash

#!/usr/bin/env bash
# proxmox-backup-check.sh — Weekly Proxmox backup verification → Discord
#
# SSHes to the Proxmox host and checks that every running VM/CT has a
# successful vzdump backup within the last 7 days. Posts a color-coded
# Discord summary with per-guest status.
#
# Usage:
# proxmox-backup-check.sh [--discord-webhook URL] [--days N] [--dry-run]
#
# Environment overrides:
# DISCORD_WEBHOOK Discord webhook URL (required unless --dry-run)
# PROXMOX_NODE Proxmox node name (default: proxmox)
# PROXMOX_SSH SSH alias or host for Proxmox (default: proxmox)
# WINDOW_DAYS Backup recency window in days (default: 7)
#
# Install on CT 302 (weekly, Monday 08:00 UTC):
# 0 8 * * 1 /root/scripts/proxmox-backup-check.sh >> /var/log/proxmox-backup-check.log 2>&1
# Shell options: referencing an unset variable is an error, and a pipeline
# fails when any stage fails. `-e` is not enabled, so a failing command does
# not abort the script on its own.
set -u -o pipefail

# Settings. A non-empty value already present in the environment is kept;
# otherwise the default is assigned. Command-line flags may override these.
: "${PROXMOX_NODE:=proxmox}"
: "${PROXMOX_SSH:=proxmox}"
: "${WINDOW_DAYS:=7}"
: "${DISCORD_WEBHOOK:=}"

# Set to 1 by --dry-run.
DRY_RUN=0
# Command-line flags override the environment-derived settings.
# A value flag without a value, or any unrecognised argument, exits with 1.
while (( $# > 0 )); do
  case "$1" in
    --discord-webhook | --days)
      # Both flags take exactly one value.
      if (( $# < 2 )); then
        echo "Error: $1 requires a value" >&2
        exit 1
      fi
      if [[ "$1" == "--days" ]]; then
        WINDOW_DAYS="$2"
      else
        DISCORD_WEBHOOK="$2"
      fi
      shift 2
      ;;
    --dry-run)
      DRY_RUN=1
      shift
      ;;
    *)
      echo "Unknown option: $1" >&2
      exit 1
      ;;
  esac
done
# ---------------------------------------------------------------------------
# Preflight checks — every failure here exits with status 1
# ---------------------------------------------------------------------------

# A webhook is only needed when the summary is actually posted.
if [[ "$DRY_RUN" -eq 0 && -z "$DISCORD_WEBHOOK" ]]; then
  echo "Error: DISCORD_WEBHOOK not set. Use --discord-webhook URL or set env var." >&2
  exit 1
fi

# WINDOW_DAYS is later handed to `date -d`. The script does not run with
# `set -e`, so a non-numeric value would make that command fail while the
# script carried on with empty results and posted an all-green summary.
# Reject anything that is not a positive whole number of days.
if [[ ! "$WINDOW_DAYS" =~ ^[0-9]+$ ]] || (( 10#$WINDOW_DAYS < 1 )); then
  echo "Error: backup window must be a positive integer number of days (got '${WINDOW_DAYS}')." >&2
  exit 1
fi
# Normalise (drops leading zeros) so the value is also a valid JSON number.
WINDOW_DAYS=$(( 10#$WINDOW_DAYS ))

# External tools the script shells out to. curl is only used for the real
# Discord post, so it is not required for --dry-run.
for required_cmd in jq ssh; do
  if ! command -v "$required_cmd" &>/dev/null; then
    echo "Error: ${required_cmd} is required but not installed." >&2
    exit 1
  fi
done
if [[ "$DRY_RUN" -eq 0 ]] && ! command -v curl &>/dev/null; then
  echo "Error: curl is required but not installed." >&2
  exit 1
fi
# log MESSAGE...
# Writes the message to stdout, prefixed with the local time in brackets.
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*"
}

# SSH options as one space-separated string; callers expand it unquoted so
# that it splits into separate arguments.
SSH_OPTS="-o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o BatchMode=yes"

# Epoch seconds: start of the recency window, and the current time.
CUTOFF=$(date -d "-${WINDOW_DAYS} days" '+%s')
NOW=$(date '+%s')
# ---------------------------------------------------------------------------
# Fetch data from Proxmox
# ---------------------------------------------------------------------------
# pvesh_json REMOTE_ARGS
# Runs `pvesh get REMOTE_ARGS --output-format json` on the Proxmox host over
# SSH and prints the reply on stdout.
# Returns non-zero when SSH/pvesh fails or the reply is not a JSON array.
# Remote stderr is deliberately not suppressed, so the reason for a failure
# shows up on this script's stderr.
pvesh_json() {
  local remote_args="$1"
  local reply
  local -a ssh_opts
  # SSH_OPTS is a space-separated option string; split it on purpose.
  read -r -a ssh_opts <<<"$SSH_OPTS"
  reply=$(ssh "${ssh_opts[@]}" "$PROXMOX_SSH" \
    "pvesh get ${remote_args} --output-format json") || return 1
  jq -e 'type == "array"' <<<"$reply" >/dev/null 2>&1 || return 1
  printf '%s\n' "$reply"
}

# fetch_failed WHAT
# Logs the failure, sends a best-effort Discord alert (skipped for --dry-run)
# and exits with status 1. Substituting an empty list instead would make the
# run report "all guests backed up" although nothing was checked.
fetch_failed() {
  local what="$1"
  log "Error: could not fetch ${what} from Proxmox host '${PROXMOX_SSH}' (node '${PROXMOX_NODE}')." >&2
  if [[ "$DRY_RUN" -eq 0 ]]; then
    # The result of this post is intentionally ignored; the exit status
    # below reports the failure either way.
    curl -s -o /dev/null --max-time 30 \
      -X POST "$DISCORD_WEBHOOK" \
      -H "Content-Type: application/json" \
      -d "$(jq -n \
        --arg content "🔴 Proxmox Backup Check failed — could not fetch ${what} from node '${PROXMOX_NODE}'." \
        '{content: $content}')"
  fi
  exit 1
}

log "Fetching VM and CT list from Proxmox node '${PROXMOX_NODE}'..."
VMS_JSON=$(pvesh_json "/nodes/${PROXMOX_NODE}/qemu") || fetch_failed "the VM list"
CTS_JSON=$(pvesh_json "/nodes/${PROXMOX_NODE}/lxc") || fetch_failed "the CT list"

log "Fetching recent vzdump task history (limit 200)..."
TASKS_JSON=$(pvesh_json "/nodes/${PROXMOX_NODE}/tasks --typefilter vzdump --limit 200") ||
  fetch_failed "the vzdump task history"
# ---------------------------------------------------------------------------
# Build per-guest backup status
# ---------------------------------------------------------------------------
# Combine the VM and CT listings into one array of running guests.
# Each element is {vmid (as a string), name, type}, where type is "VM" or
# "CT"; VMs come first, then CTs.
GUESTS_JSON=$(jq -n --argjson vms "$VMS_JSON" --argjson cts "$CTS_JSON" '
  def running_as($kind):
    [ .[]
      | select(.status == "running")
      | {vmid: (.vmid | tostring), name: .name, type: $kind}
    ];
  ($vms | running_as("VM")) + ($cts | running_as("CT"))
')

GUEST_COUNT=$(jq 'length' <<<"$GUESTS_JSON")
log "Found ${GUEST_COUNT} running guests."
# classify_guests GUESTS_JSON TASKS_JSON CUTOFF_TS NOW_TS
# For each guest, finds the start time of its most recent vzdump task whose
# status is "OK" and prints a JSON array sorted by numeric vmid:
#   {vmid, name, type, last_backup_ts, age_days, status}
#   last_backup_ts  epoch seconds of that task, or 0 when none was found
#   age_days        whole days since that task, or -1 when none was found
#   status          "green"  last OK backup at or after CUTOFF_TS
#                   "yellow" an OK backup exists but is older than CUTOFF_TS
#                   "red"    no OK backup among the supplied tasks
# A task is matched to a guest through its `vmid` field or, when that is
# absent, its `id` field.
# NOTE(review): the Proxmox task-list API documents the guest under `id`, not
# `vmid` — confirm against the PVE version in use. A task that carries
# neither field (presumably a job covering several guests) is not attributed
# to any guest.
classify_guests() {
  local guests_json="$1" tasks_json="$2" cutoff_ts="$3" now_ts="$4"
  jq -n \
    --argjson guests "$guests_json" \
    --argjson tasks "$tasks_json" \
    --argjson cutoff "$cutoff_ts" \
    --argjson now "$now_ts" '
    $guests | map(
      . as $g |
      # `max` of an empty list is null, hence the fallback to 0.
      ([ $tasks[]
         | select(((.vmid // .id // "") | tostring) == $g.vmid and .status == "OK")
         | .starttime
       ] | max // 0) as $last_ts |
      {
        vmid: $g.vmid,
        name: $g.name,
        type: $g.type,
        last_backup_ts: $last_ts,
        age_days: (if $last_ts > 0 then (($now - $last_ts) / 86400 | floor) else -1 end),
        status: (
          if $last_ts >= $cutoff then "green"
          elif $last_ts > 0 then "yellow"
          else "red"
          end
        )
      }
    ) | sort_by(.vmid | tonumber)
  '
}

RESULTS=$(classify_guests "$GUESTS_JSON" "$TASKS_JSON" "$CUTOFF" "$NOW")
# Split the classified guests into one JSON array per status...
GREEN_GUESTS=$(jq 'map(select(.status == "green"))' <<<"$RESULTS")
YELLOW_GUESTS=$(jq 'map(select(.status == "yellow"))' <<<"$RESULTS")
RED_GUESTS=$(jq 'map(select(.status == "red"))' <<<"$RESULTS")

# ...and count the members of each bucket.
GREEN_COUNT=$(jq 'length' <<<"$GREEN_GUESTS")
YELLOW_COUNT=$(jq 'length' <<<"$YELLOW_GUESTS")
RED_COUNT=$(jq 'length' <<<"$RED_GUESTS")

log "Results: ${GREEN_COUNT} green, ${YELLOW_COUNT} yellow, ${RED_COUNT} red"
# ---------------------------------------------------------------------------
# Build Discord payload
# ---------------------------------------------------------------------------
# choose_embed_status
# Picks the embed colour and headline for the summary.
# Reads:  GUEST_COUNT, GREEN_COUNT, YELLOW_COUNT, RED_COUNT, WINDOW_DAYS
# Sets:   EMBED_COLOR (decimal RGB), STATUS_LINE
# Green is only chosen when at least one guest was checked and every guest
# landed in exactly one bucket. Missing counts (an earlier jq step failed) or
# zero guests are reported as red, not as "all backed up".
choose_embed_status() {
  local total="${GUEST_COUNT:-}" green="${GREEN_COUNT:-}"
  local yellow="${YELLOW_COUNT:-}" red="${RED_COUNT:-}"
  local count counts_ok=1
  for count in "$total" "$green" "$yellow" "$red"; do
    [[ "$count" =~ ^[0-9]+$ ]] || counts_ok=0
  done

  if (( ! counts_ok )) || (( green + yellow + red != total )); then
    EMBED_COLOR=15548997 # 0xED4245 red
    STATUS_LINE="🔴 Backup check incomplete — guests could not be classified"
  elif (( total == 0 )); then
    EMBED_COLOR=15548997 # 0xED4245 red
    STATUS_LINE="🔴 No running guests found — check Proxmox connectivity and node name"
  elif (( red > 0 )); then
    EMBED_COLOR=15548997 # 0xED4245 red
    STATUS_LINE="🔴 Backup issues detected — action required"
  elif (( yellow > 0 )); then
    EMBED_COLOR=16705372 # 0xFEE75C yellow
    STATUS_LINE="🟡 Some backups are overdue (>${WINDOW_DAYS}d)"
  else
    EMBED_COLOR=5763719 # 0x57F287 green
    STATUS_LINE="🟢 All ${GUEST_COUNT} guests backed up within ${WINDOW_DAYS} days"
  fi
}

choose_embed_status
# format_guest PREFIX GUESTS_JSON
# Prints one line per element of the GUESTS_JSON array, in the form
#   "<PREFIX> <type> <vmid> (<name>)"      e.g. "❌ CT 302 (claude-runner)"
format_guest() {
  local prefix="$1"
  local guests="$2"
  local entry
  jq -r '.[] | "\(.type) \(.vmid) (\(.name))"' <<<"$guests" |
    while IFS= read -r entry; do
      printf '%s %s\n' "$prefix" "$entry"
    done
}
# format_guest_with_age PREFIX GUESTS_JSON
# Like format_guest, but each line also carries the guest's age_days value:
#   "<PREFIX> <type> <vmid> (<name>) — <age_days>d ago"
format_guest_with_age() {
  local prefix="$1"
  local guests="$2"
  local entry
  jq -r '.[] | "\(.type) \(.vmid) (\(.name)) — \(.age_days)d ago"' <<<"$guests" |
    while IFS= read -r entry; do
      printf '%s %s\n' "$prefix" "$entry"
    done
}
# append_field FIELDS_JSON NAME VALUE
# Prints FIELDS_JSON (a JSON array) with one more non-inline embed field
# {name, value, inline: false} appended.
# Discord rejects an embed whose field value is longer than 1024 characters,
# which a long guest list would exceed. Values over 1000 characters are cut
# back to whole lines and end with a "… (truncated)" marker, so the
# notification is still delivered.
append_field() {
  local current="$1" field_name="$2" field_value="$3"
  jq \
    --arg name "$field_name" \
    --arg value "$field_value" '
    def clip($max):
      if length <= $max then .
      else
        # Keep a margin for the marker, then drop the partial last line.
        (.[0:($max - 20)]
         | split("\n")
         | (if length > 1 then .[:-1] else . end)
         | join("\n")) + "\n… (truncated)"
      end;
    . + [{"name": $name, "value": ($value | clip(1000)), "inline": false}]
  ' <<<"$current"
}

# Build the fields array: one field per non-empty status bucket, in the
# order healthy, overdue, never backed up.
fields='[]'
if [[ "$GREEN_COUNT" -gt 0 ]]; then
  green_lines=$(format_guest_with_age "✅" "$GREEN_GUESTS")
  fields=$(append_field "$fields" "🟢 Healthy (${GREEN_COUNT})" "$green_lines")
fi
if [[ "$YELLOW_COUNT" -gt 0 ]]; then
  yellow_lines=$(format_guest_with_age "⚠️" "$YELLOW_GUESTS")
  fields=$(append_field "$fields" \
    "🟡 Overdue — last backup >${WINDOW_DAYS}d ago (${YELLOW_COUNT})" "$yellow_lines")
fi
if [[ "$RED_COUNT" -gt 0 ]]; then
  red_lines=$(format_guest "❌" "$RED_GUESTS")
  fields=$(append_field "$fields" "🔴 No Successful Backups Found (${RED_COUNT})" "$red_lines")
fi
# Footer text: current UTC time, number of guests checked, window length.
printf -v FOOTER '%s · %s guests · window: %sd' \
  "$(date -u '+%Y-%m-%d %H:%M UTC')" "$GUEST_COUNT" "$WINDOW_DAYS"

# Webhook body: a single embed carrying the headline, the status colour,
# the per-status fields and the footer.
PAYLOAD=$(jq -n \
  --arg title "Proxmox Backup Check — ${STATUS_LINE}" \
  --arg footer "$FOOTER" \
  --argjson color "$EMBED_COLOR" \
  --argjson fields "$fields" '
  {title: $title, color: $color, fields: $fields, footer: {text: $footer}} as $embed
  | {embeds: [$embed]}
')
# Dry run: print the payload that would have been posted, then stop with
# status 0 without contacting Discord.
if (( DRY_RUN == 1 )); then
  log "DRY RUN — Discord payload:"
  jq . <<<"$PAYLOAD"
  exit 0
fi
# Post the payload to the Discord webhook.
# The response body and the HTTP status are captured from one curl call:
# `-w` appends the status code on its own final line, so no temp file with a
# predictable name in /tmp is needed. `--max-time` keeps a stalled connection
# from hanging the cron job, and `-S` lets curl print transport errors to
# stderr despite `-s`.
log "Posting to Discord..."
CURL_OUTPUT=$(curl -sS --max-time 30 \
  -w '\n%{http_code}' \
  -X POST "$DISCORD_WEBHOOK" \
  -H "Content-Type: application/json" \
  -d "$PAYLOAD")

# The last line is the status code; everything before it is the body.
HTTP_STATUS="${CURL_OUTPUT##*$'\n'}"
RESPONSE_BODY="${CURL_OUTPUT%$'\n'*}"

# Anything other than a three-digit 2xx code counts as a failure; that
# includes "000", which curl reports when no HTTP response was received.
if [[ "$HTTP_STATUS" =~ ^[0-9]{3}$ ]] && (( 10#$HTTP_STATUS >= 200 && 10#$HTTP_STATUS < 300 )); then
  log "Discord notification sent (HTTP ${HTTP_STATUS})."
else
  log "Warning: Discord returned HTTP ${HTTP_STATUS}."
  if [[ -n "$RESPONSE_BODY" ]]; then
    printf '%s\n' "$RESPONSE_BODY" >&2
  fi
  exit 1
fi