diff --git a/monitoring/scripts/CONTEXT.md b/monitoring/scripts/CONTEXT.md
index 2990a3f..4f06f76 100644
--- a/monitoring/scripts/CONTEXT.md
+++ b/monitoring/scripts/CONTEXT.md
@@ -1,9 +1,9 @@
 ---
 title: "Monitoring Scripts Context"
-description: "Operational context for all monitoring scripts: Jellyfin GPU health monitor, NVIDIA driver update checker, Tdarr API/file monitors, and Windows reboot detection. Includes cron schedules, Discord integration patterns, and troubleshooting."
+description: "Operational context for all monitoring scripts: Proxmox backup checker, CT 302 self-health, Jellyfin GPU health monitor, NVIDIA driver update checker, Tdarr API/file monitors, and Windows reboot detection. Includes cron schedules, Discord integration patterns, and troubleshooting."
 type: context
 domain: monitoring
-tags: [jellyfin, gpu, nvidia, tdarr, discord, cron, python, windows, scripts]
+tags: [proxmox, backup, jellyfin, gpu, nvidia, tdarr, discord, cron, python, bash, windows, scripts]
 ---
 
 # Monitoring Scripts - Operational Context
@@ -13,6 +13,77 @@ This directory contains active operational scripts for system monitoring, health
 
 ## Core Monitoring Scripts
 
+### Proxmox Backup Verification
+**Script**: `proxmox-backup-check.sh`
+**Purpose**: Weekly check that every running VM/CT has a successful vzdump backup within 7 days. Posts a color-coded Discord embed with per-guest status.
+
+**Key Features**:
+- SSHes to Proxmox host and queries `pvesh` task history + guest lists via API
+- Categorizes each guest: 🟢 green (backed up), 🟡 yellow (overdue), 🔴 red (no backup)
+- Sorts output by VMID; only posts to Discord — no local side effects
+- `--dry-run` mode prints the Discord payload without sending
+- `--days N` overrides the default 7-day window
+
+**Schedule**: Weekly on Monday 08:00 UTC (CT 302 cron)
+```bash
+0 8 * * 1 DISCORD_WEBHOOK="" /root/scripts/proxmox-backup-check.sh >> /var/log/proxmox-backup-check.log 2>&1
+```
+
+**Usage**:
+```bash
+# Dry run (no Discord)
+proxmox-backup-check.sh --dry-run
+
+# Post to Discord
+DISCORD_WEBHOOK="https://discord.com/api/webhooks/..." proxmox-backup-check.sh
+
+# Custom window
+proxmox-backup-check.sh --days 14 --discord-webhook "https://..."
+```
+
+**Dependencies**: `jq`, `curl`, SSH access to Proxmox host alias `proxmox`
+
+**Install on CT 302**:
+```bash
+cp proxmox-backup-check.sh /root/scripts/
+chmod +x /root/scripts/proxmox-backup-check.sh
+```
+
+### CT 302 Self-Health Monitor
+**Script**: `ct302-self-health.sh`
+**Purpose**: Monitors disk usage on CT 302 (claude-runner) itself. Alerts to Discord when any filesystem exceeds the threshold (default 80%). Runs silently when healthy — no Discord spam on green.
+
+**Key Features**:
+- Checks all non-virtual filesystems (`df`, excludes tmpfs/devtmpfs/overlay)
+- Only sends a Discord alert when a filesystem is at or above threshold
+- `--always-post` flag forces a post even when healthy (useful for testing)
+- `--dry-run` mode prints payload without sending
+
+**Schedule**: Daily at 07:00 UTC (CT 302 cron)
+```bash
+0 7 * * * DISCORD_WEBHOOK="" /root/scripts/ct302-self-health.sh >> /var/log/ct302-self-health.log 2>&1
+```
+
+**Usage**:
+```bash
+# Check and alert if over 80%
+DISCORD_WEBHOOK="https://discord.com/api/webhooks/..." ct302-self-health.sh
+
+# Lower threshold test
+ct302-self-health.sh --threshold 50 --dry-run
+
+# Always post (weekly status report pattern)
+ct302-self-health.sh --always-post --discord-webhook "https://..."
+```
+
+**Dependencies**: `jq`, `curl`, `df`
+
+**Install on CT 302**:
+```bash
+cp ct302-self-health.sh /root/scripts/
+chmod +x /root/scripts/ct302-self-health.sh
+```
+
 ### Jellyfin GPU Health Monitor
 **Script**: `jellyfin_gpu_monitor.py`
 **Purpose**: Monitor Jellyfin container GPU access with Discord alerts and auto-restart capability
@@ -235,6 +306,17 @@ python3 tdarr_file_monitor.py >> /mnt/NV2/Development/claude-home/logs/tdarr-fil
 0 9 * * 1 /usr/bin/python3 /home/cal/scripts/nvidia_update_checker.py --check --discord-alerts >> /home/cal/logs/nvidia-update-checker.log 2>&1
 ```
 
+**Active Cron Jobs** (on CT 302 / claude-runner, root user):
+```bash
+# Proxmox backup verification - Weekly (Mondays at 8 AM UTC)
+0 8 * * 1 DISCORD_WEBHOOK="" /root/scripts/proxmox-backup-check.sh >> /var/log/proxmox-backup-check.log 2>&1
+
+# CT 302 self-health disk check - Daily at 7 AM UTC (alerts only when >=80%)
+0 7 * * * DISCORD_WEBHOOK="" /root/scripts/ct302-self-health.sh >> /var/log/ct302-self-health.log 2>&1
+```
+
+**Note**: Scripts must be installed manually on CT 302. Source of truth is `monitoring/scripts/` in this repo — copy to `/root/scripts/` on CT 302 to deploy.
+
 **Manual/On-Demand**:
 - `tdarr_monitor.py` - Run as needed for Tdarr health checks
 - `tdarr_file_monitor.py` - Can be scheduled if automatic backup needed
diff --git a/monitoring/scripts/ct302-self-health.sh b/monitoring/scripts/ct302-self-health.sh
new file mode 100644
index 0000000..e2adc2d
--- /dev/null
+++ b/monitoring/scripts/ct302-self-health.sh
@@ -0,0 +1,188 @@
+#!/usr/bin/env bash
+# ct302-self-health.sh — CT 302 (claude-runner) disk self-check → Discord
+#
+# Monitors disk usage on CT 302 itself and alerts to Discord when any
+# filesystem exceeds the threshold. Closes the blind spot where the
+# monitoring system cannot monitor itself via external health checks.
+#
+# Designed to run silently when healthy (no Discord spam on green).
+# Only posts when a filesystem is at or above THRESHOLD.
+#
+# Usage:
+#   ct302-self-health.sh [--discord-webhook URL] [--threshold N] [--dry-run] [--always-post]
+#
+# Environment overrides:
+#   DISCORD_WEBHOOK   Discord webhook URL (required unless --dry-run)
+#   DISK_THRESHOLD    Disk usage % alert threshold, integer 0-100 (default: 80)
+#
+# Install on CT 302 (daily, 07:00 UTC):
+#   0 7 * * * /root/scripts/ct302-self-health.sh >> /var/log/ct302-self-health.log 2>&1
+
+set -uo pipefail
+
+DISK_THRESHOLD="${DISK_THRESHOLD:-80}"
+DISCORD_WEBHOOK="${DISCORD_WEBHOOK:-}"
+DRY_RUN=0
+ALWAYS_POST=0
+# Discord rejects embed field values longer than 1024 characters.
+FIELD_VALUE_MAX=1024
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --discord-webhook)
+      if [[ $# -lt 2 ]]; then
+        echo "Error: --discord-webhook requires a value" >&2
+        exit 1
+      fi
+      DISCORD_WEBHOOK="$2"
+      shift 2
+      ;;
+    --threshold)
+      if [[ $# -lt 2 ]]; then
+        echo "Error: --threshold requires a value" >&2
+        exit 1
+      fi
+      DISK_THRESHOLD="$2"
+      shift 2
+      ;;
+    --dry-run)
+      DRY_RUN=1
+      shift
+      ;;
+    --always-post)
+      ALWAYS_POST=1
+      shift
+      ;;
+    *)
+      echo "Unknown option: $1" >&2
+      exit 1
+      ;;
+  esac
+done
+
+if [[ "$DRY_RUN" -eq 0 && -z "$DISCORD_WEBHOOK" ]]; then
+  echo "Error: DISCORD_WEBHOOK not set. Use --discord-webhook URL or set env var." >&2
+  exit 1
+fi
+
+# A non-numeric threshold would make every comparison below misbehave, so
+# reject it up front instead of silently never alerting.
+if ! [[ "$DISK_THRESHOLD" =~ ^[0-9]+$ ]] || ((10#$DISK_THRESHOLD > 100)); then
+  echo "Error: threshold must be an integer from 0 to 100 (got '${DISK_THRESHOLD}')" >&2
+  exit 1
+fi
+# Force base 10 so a value such as "08" is not treated as invalid octal.
+DISK_THRESHOLD=$((10#$DISK_THRESHOLD))
+
+for tool in jq df; do
+  if ! command -v "$tool" &>/dev/null; then
+    echo "Error: ${tool} is required but not installed." >&2
+    exit 1
+  fi
+done
+if [[ "$DRY_RUN" -eq 0 ]] && ! command -v curl &>/dev/null; then
+  echo "Error: curl is required but not installed." >&2
+  exit 1
+fi
+
+log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*"; }
+
+# ---------------------------------------------------------------------------
+# Check disk usage on all real filesystems
+# ---------------------------------------------------------------------------
+# df rows are "<source> <use%> <mount point>"; rows whose source is tmpfs,
+# devtmpfs, overlay or udev are dropped. The mount point is read last so it
+# may contain spaces.
+TRIGGERED=()
+ALL_FS=()
+
+while read -r fs pct mount; do
+  pct="${pct%\%}"
+  # Skip rows without a numeric percentage; they cannot be compared.
+  [[ "$pct" =~ ^[0-9]+$ ]] || continue
+  ALL_FS+=("${pct}% ${mount} (${fs})")
+  if ((pct >= DISK_THRESHOLD)); then
+    TRIGGERED+=("${pct}% used — ${mount} (${fs})")
+  fi
+done < <(df --output=source,pcent,target |
+  tail -n +2 |
+  awk '$1 !~ /^(tmpfs|devtmpfs|overlay|udev)/')
+
+# An empty list means df failed or printed nothing usable. Treat that as an
+# error: reporting "healthy" without data would hide a broken check.
+if [[ "${#ALL_FS[@]}" -eq 0 ]]; then
+  log "Error: df reported no filesystems; cannot assess disk usage." >&2
+  exit 1
+fi
+
+HOSTNAME=$(hostname -s)
+TRIGGERED_COUNT=${#TRIGGERED[@]}
+
+log "Disk check complete: ${TRIGGERED_COUNT} filesystem(s) above ${DISK_THRESHOLD}%"
+
+# Exit cleanly with no Discord post if everything is healthy
+if [[ "$TRIGGERED_COUNT" -eq 0 && "$ALWAYS_POST" -eq 0 && "$DRY_RUN" -eq 0 ]]; then
+  log "All filesystems healthy — no alert needed."
+  exit 0
+fi
+
+# ---------------------------------------------------------------------------
+# Build Discord payload
+# ---------------------------------------------------------------------------
+if [[ "$TRIGGERED_COUNT" -gt 0 ]]; then
+  EMBED_COLOR=15548997 # 0xED4245 red
+  TITLE="🔴 ${HOSTNAME}: Disk usage above ${DISK_THRESHOLD}%"
+  alert_lines=$(printf '⚠️ %s\n' "${TRIGGERED[@]}")
+  FIELDS=$(jq -n \
+    --arg name "Filesystems Over Threshold" \
+    --arg value "${alert_lines:0:FIELD_VALUE_MAX}" \
+    '[{"name": $name, "value": $value, "inline": false}]')
+else
+  EMBED_COLOR=5763719 # 0x57F287 green
+  TITLE="🟢 ${HOSTNAME}: All filesystems healthy"
+  FIELDS='[]'
+fi
+
+# Add summary of all filesystems
+all_lines=$(printf '%s\n' "${ALL_FS[@]}")
+FIELDS=$(echo "$FIELDS" | jq \
+  --arg name "All Filesystems" \
+  --arg value "${all_lines:0:FIELD_VALUE_MAX}" \
+  '. + [{"name": $name, "value": $value, "inline": false}]')
+
+FOOTER="$(date -u '+%Y-%m-%d %H:%M UTC') · CT 302 self-health · threshold: ${DISK_THRESHOLD}%"
+
+PAYLOAD=$(jq -n \
+  --arg title "$TITLE" \
+  --argjson color "$EMBED_COLOR" \
+  --argjson fields "$FIELDS" \
+  --arg footer "$FOOTER" \
+  '{
+    "embeds": [{
+      "title": $title,
+      "color": $color,
+      "fields": $fields,
+      "footer": {"text": $footer}
+    }]
+  }')
+
+if [[ "$DRY_RUN" -eq 1 ]]; then
+  log "DRY RUN — Discord payload:"
+  echo "$PAYLOAD" | jq .
+  exit 0
+fi
+
+log "Posting to Discord..."
+HTTP_STATUS=$(curl -s --max-time 30 -o /tmp/ct302-self-health-discord.out \
+  -w "%{http_code}" \
+  -X POST "$DISCORD_WEBHOOK" \
+  -H "Content-Type: application/json" \
+  -d "$PAYLOAD")
+
+if [[ "$HTTP_STATUS" -ge 200 && "$HTTP_STATUS" -lt 300 ]]; then
+  log "Discord notification sent (HTTP ${HTTP_STATUS})."
+else
+  log "Warning: Discord returned HTTP ${HTTP_STATUS}."
+  cat /tmp/ct302-self-health-discord.out >&2
+  exit 1
+fi
diff --git a/monitoring/scripts/proxmox-backup-check.sh b/monitoring/scripts/proxmox-backup-check.sh
new file mode 100644
index 0000000..fcc1186
--- /dev/null
+++ b/monitoring/scripts/proxmox-backup-check.sh
@@ -0,0 +1,295 @@
+#!/usr/bin/env bash
+# proxmox-backup-check.sh — Weekly Proxmox backup verification → Discord
+#
+# SSHes to the Proxmox host and checks that every running VM/CT has a
+# successful vzdump backup within the last 7 days. Posts a color-coded
+# Discord summary with per-guest status.
+#
+# If the guest list or task history cannot be fetched, a red "check failed"
+# alert is posted and the script exits 1; an unreachable host is never
+# reported as "all guests backed up".
+#
+# Usage:
+#   proxmox-backup-check.sh [--discord-webhook URL] [--days N] [--dry-run]
+#
+# Environment overrides:
+#   DISCORD_WEBHOOK   Discord webhook URL (required unless --dry-run)
+#   PROXMOX_NODE      Proxmox node name (default: proxmox)
+#   PROXMOX_SSH       SSH alias or host for Proxmox (default: proxmox)
+#   WINDOW_DAYS       Backup recency window in days (default: 7)
+#
+# Install on CT 302 (weekly, Monday 08:00 UTC):
+#   0 8 * * 1 /root/scripts/proxmox-backup-check.sh >> /var/log/proxmox-backup-check.log 2>&1
+
+set -uo pipefail
+
+PROXMOX_NODE="${PROXMOX_NODE:-proxmox}"
+PROXMOX_SSH="${PROXMOX_SSH:-proxmox}"
+WINDOW_DAYS="${WINDOW_DAYS:-7}"
+DISCORD_WEBHOOK="${DISCORD_WEBHOOK:-}"
+DRY_RUN=0
+# Discord rejects embed field values longer than 1024 characters.
+FIELD_VALUE_MAX=1024
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --discord-webhook)
+      if [[ $# -lt 2 ]]; then
+        echo "Error: --discord-webhook requires a value" >&2
+        exit 1
+      fi
+      DISCORD_WEBHOOK="$2"
+      shift 2
+      ;;
+    --days)
+      if [[ $# -lt 2 ]]; then
+        echo "Error: --days requires a value" >&2
+        exit 1
+      fi
+      WINDOW_DAYS="$2"
+      shift 2
+      ;;
+    --dry-run)
+      DRY_RUN=1
+      shift
+      ;;
+    *)
+      echo "Unknown option: $1" >&2
+      exit 1
+      ;;
+  esac
+done
+
+if [[ "$DRY_RUN" -eq 0 && -z "$DISCORD_WEBHOOK" ]]; then
+  echo "Error: DISCORD_WEBHOOK not set. Use --discord-webhook URL or set env var." >&2
+  exit 1
+fi
+
+# WINDOW_DAYS is spliced into a `date -d` expression below, so anything but a
+# plain integer is rejected here.
+if ! [[ "$WINDOW_DAYS" =~ ^[0-9]+$ ]]; then
+  echo "Error: --days must be a non-negative integer (got '${WINDOW_DAYS}')" >&2
+  exit 1
+fi
+WINDOW_DAYS=$((10#$WINDOW_DAYS))
+
+if ! command -v jq &>/dev/null; then
+  echo "Error: jq is required but not installed." >&2
+  exit 1
+fi
+if [[ "$DRY_RUN" -eq 0 ]] && ! command -v curl &>/dev/null; then
+  echo "Error: curl is required but not installed." >&2
+  exit 1
+fi
+
+# Kept as an array so each option reaches ssh as its own word.
+SSH_OPTS=(-o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o BatchMode=yes)
+CUTOFF=$(date -d "-${WINDOW_DAYS} days" +%s)
+NOW=$(date +%s)
+
+# Private scratch file for the Discord response body; removed on any exit.
+RESPONSE_FILE=$(mktemp) || {
+  echo "Error: mktemp failed" >&2
+  exit 1
+}
+trap 'rm -f -- "$RESPONSE_FILE"' EXIT
+
+log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*"; }
+
+# Sends a JSON payload to the Discord webhook, or prints it in dry-run mode.
+# Arguments: $1 - JSON payload
+# Returns:   0 when printed or accepted with a 2xx status, 1 otherwise
+post_to_discord() {
+  local payload="$1" http_status
+  if [[ "$DRY_RUN" -eq 1 ]]; then
+    log "DRY RUN — Discord payload:"
+    echo "$payload" | jq .
+    return 0
+  fi
+  log "Posting to Discord..."
+  http_status=$(curl -s --max-time 30 -o "$RESPONSE_FILE" \
+    -w "%{http_code}" \
+    -X POST "$DISCORD_WEBHOOK" \
+    -H "Content-Type: application/json" \
+    -d "$payload")
+  if [[ "$http_status" -ge 200 && "$http_status" -lt 300 ]]; then
+    log "Discord notification sent (HTTP ${http_status})."
+    return 0
+  fi
+  log "Warning: Discord returned HTTP ${http_status}."
+  cat "$RESPONSE_FILE" >&2
+  return 1
+}
+
+# Reports that the check itself could not run, then exits 1.
+# Arguments: $* - human-readable reason
+die_with_alert() {
+  local reason="$*" payload
+  log "Error: ${reason}" >&2
+  payload=$(jq -n \
+    --arg title "Proxmox Backup Check — 🔴 Check failed" \
+    --arg description "$reason" \
+    --arg footer "$(date -u '+%Y-%m-%d %H:%M UTC') · node: ${PROXMOX_NODE}" \
+    '{"embeds": [{"title": $title, "description": $description, "color": 15548997, "footer": {"text": $footer}}]}')
+  # Best effort: the exit status below is already a failure.
+  post_to_discord "$payload" || true
+  exit 1
+}
+
+# Runs one pvesh query on the Proxmox host and prints the JSON array it returns.
+# Arguments: $1 - API path below /nodes/<node>/, remaining - extra pvesh options
+# Returns:   non-zero when ssh/pvesh fails or the reply is not a JSON array
+fetch_json() {
+  local path="$1" reply
+  shift
+  reply=$(ssh "${SSH_OPTS[@]}" "$PROXMOX_SSH" \
+    "pvesh get /nodes/${PROXMOX_NODE}/${path} $* --output-format json") || return 1
+  jq -e 'type == "array"' <<<"$reply" >/dev/null 2>&1 || return 1
+  printf '%s\n' "$reply"
+}
+
+# ---------------------------------------------------------------------------
+# Fetch data from Proxmox
+# ---------------------------------------------------------------------------
+log "Fetching VM and CT list from Proxmox node '${PROXMOX_NODE}'..."
+VMS_JSON=$(fetch_json qemu) ||
+  die_with_alert "Could not fetch the VM list from '${PROXMOX_SSH}'"
+CTS_JSON=$(fetch_json lxc) ||
+  die_with_alert "Could not fetch the CT list from '${PROXMOX_SSH}'"
+
+log "Fetching recent vzdump task history (limit 200)..."
+TASKS_JSON=$(fetch_json tasks --typefilter vzdump --limit 200) ||
+  die_with_alert "Could not fetch vzdump task history from '${PROXMOX_SSH}'"
+
+# ---------------------------------------------------------------------------
+# Build per-guest backup status
+# ---------------------------------------------------------------------------
+# Merge VMs and CTs into one list: [{vmid, name, type}]
+GUESTS_JSON=$(jq -n \
+  --argjson vms "$VMS_JSON" \
+  --argjson cts "$CTS_JSON" '
+  ($vms | map(select(.status == "running") | {vmid: (.vmid | tostring), name: (.name // "unnamed"), type: "VM"})) +
+  ($cts | map(select(.status == "running") | {vmid: (.vmid | tostring), name: (.name // "unnamed"), type: "CT"}))
+')
+
+GUEST_COUNT=$(echo "$GUESTS_JSON" | jq 'length')
+log "Found ${GUEST_COUNT} running guests."
+
+# For each guest, find the most recent successful (status == "OK") vzdump task.
+# NOTE(review): the guest ID is read from the task's "id" field, falling back
+# to "vmid"; confirm the field name against this node's pvesh output.
+RESULTS=$(jq -n \
+  --argjson guests "$GUESTS_JSON" \
+  --argjson tasks "$TASKS_JSON" \
+  --argjson cutoff "$CUTOFF" \
+  --argjson now "$NOW" '
+  $guests | map(
+    . as $g |
+    ($tasks | map(
+      select(
+        ((.id // .vmid // "") | tostring) == $g.vmid
+        and .status == "OK"
+      ) | .starttime
+    ) | max // 0) as $last_ts |
+    {
+      vmid: $g.vmid,
+      name: $g.name,
+      type: $g.type,
+      last_backup_ts: $last_ts,
+      age_days: (if $last_ts > 0 then (($now - $last_ts) / 86400 | floor) else -1 end),
+      status: (
+        if $last_ts >= $cutoff then "green"
+        elif $last_ts > 0 then "yellow"
+        else "red"
+        end
+      )
+    }
+  ) | sort_by(.vmid | tonumber)
+')
+
+GREEN_GUESTS=$(echo "$RESULTS" | jq '[.[] | select(.status == "green")]')
+YELLOW_GUESTS=$(echo "$RESULTS" | jq '[.[] | select(.status == "yellow")]')
+RED_GUESTS=$(echo "$RESULTS" | jq '[.[] | select(.status == "red")]')
+
+GREEN_COUNT=$(echo "$GREEN_GUESTS" | jq 'length')
+YELLOW_COUNT=$(echo "$YELLOW_GUESTS" | jq 'length')
+RED_COUNT=$(echo "$RED_GUESTS" | jq 'length')
+
+log "Results: ${GREEN_COUNT} green, ${YELLOW_COUNT} yellow, ${RED_COUNT} red"
+
+# ---------------------------------------------------------------------------
+# Build Discord payload
+# ---------------------------------------------------------------------------
+if [[ "$RED_COUNT" -gt 0 ]]; then
+  EMBED_COLOR=15548997 # 0xED4245 red
+  STATUS_LINE="🔴 Backup issues detected — action required"
+elif [[ "$YELLOW_COUNT" -gt 0 ]]; then
+  EMBED_COLOR=16705372 # 0xFEE75C yellow
+  STATUS_LINE="🟡 Some backups are overdue (>${WINDOW_DAYS}d)"
+else
+  EMBED_COLOR=5763719 # 0x57F287 green
+  STATUS_LINE="🟢 All ${GUEST_COUNT} guests backed up within ${WINDOW_DAYS} days"
+fi
+
+# Prints one line per guest: "<prefix> VM 116 (plex)".
+# Arguments: $1 - line prefix, $2 - JSON array of guest results
+format_guest() {
+  local prefix="$1" guests="$2"
+  jq -r --arg prefix "$prefix" \
+    '.[] | "\($prefix) \(.type) \(.vmid) (\(.name))"' <<<"$guests"
+}
+
+# Prints one line per guest with its backup age: "<prefix> VM 116 (plex) — 2d ago".
+# Arguments: $1 - line prefix, $2 - JSON array of guest results
+format_guest_with_age() {
+  local prefix="$1" guests="$2"
+  jq -r --arg prefix "$prefix" \
+    '.[] | "\($prefix) \(.type) \(.vmid) (\(.name)) — \(.age_days)d ago"' <<<"$guests"
+}
+
+# Appends one embed field to the JSON array in $fields.
+# Globals:   fields (read and rewritten), FIELD_VALUE_MAX (read)
+# Arguments: $1 - field name, $2 - field value (truncated to the Discord limit)
+add_field() {
+  local name="$1" value="$2"
+  fields=$(jq \
+    --arg name "$name" \
+    --arg value "${value:0:FIELD_VALUE_MAX}" \
+    '. + [{"name": $name, "value": $value, "inline": false}]' <<<"$fields")
+}
+
+# Build fields array
+fields='[]'
+
+if [[ "$GREEN_COUNT" -gt 0 ]]; then
+  add_field "🟢 Healthy (${GREEN_COUNT})" \
+    "$(format_guest_with_age "✅" "$GREEN_GUESTS")"
+fi
+
+if [[ "$YELLOW_COUNT" -gt 0 ]]; then
+  add_field "🟡 Overdue — last backup >${WINDOW_DAYS}d ago (${YELLOW_COUNT})" \
+    "$(format_guest_with_age "⚠️" "$YELLOW_GUESTS")"
+fi
+
+if [[ "$RED_COUNT" -gt 0 ]]; then
+  add_field "🔴 No Successful Backups Found (${RED_COUNT})" \
+    "$(format_guest "❌" "$RED_GUESTS")"
+fi
+
+FOOTER="$(date -u '+%Y-%m-%d %H:%M UTC') · ${GUEST_COUNT} guests · window: ${WINDOW_DAYS}d"
+
+PAYLOAD=$(jq -n \
+  --arg title "Proxmox Backup Check — ${STATUS_LINE}" \
+  --argjson color "$EMBED_COLOR" \
+  --argjson fields "$fields" \
+  --arg footer "$FOOTER" \
+  '{
+    "embeds": [{
+      "title": $title,
+      "color": $color,
+      "fields": $fields,
+      "footer": {"text": $footer}
+    }]
+  }')
+
+post_to_discord "$PAYLOAD" || exit 1