All checks were successful
Auto-merge docs-only PRs / auto-merge-docs (pull_request) Successful in 2s
Closes #27

- proxmox-backup-check.sh: SSHes to Proxmox, queries pvesh task history, classifies each running VM/CT as green/yellow/red by backup recency, and posts a Discord embed summary. Designed for a weekly cron on CT 302.
- ct302-self-health.sh: Checks disk usage on CT 302 itself, silently exits when healthy, and posts a Discord alert when any filesystem reaches or exceeds the 80% threshold. Closes the blind spot where the monitoring system cannot monitor itself externally.
- Updated monitoring/scripts/CONTEXT.md with full operational docs, install instructions, and cron schedules for both new scripts.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
159 lines
4.6 KiB
Bash
159 lines
4.6 KiB
Bash
#!/usr/bin/env bash
# ct302-self-health.sh — CT 302 (claude-runner) disk self-check → Discord
#
# Monitors disk usage on CT 302 itself and alerts to Discord when any
# filesystem reaches or exceeds the threshold. Closes the blind spot where the
# monitoring system cannot monitor itself via external health checks.
#
# Designed to run silently when healthy (no Discord spam on green).
# Only posts when a filesystem is at or above THRESHOLD, unless --always-post
# is given.
#
# Usage:
#   ct302-self-health.sh [--discord-webhook URL] [--threshold N] [--dry-run] [--always-post]
#
# Environment overrides:
#   DISCORD_WEBHOOK   Discord webhook URL (required unless --dry-run)
#   DISK_THRESHOLD    Disk usage % alert threshold (default: 80)
#
# Install on CT 302 (daily, 07:00 UTC):
#   0 7 * * * /root/scripts/ct302-self-health.sh >> /var/log/ct302-self-health.log 2>&1
# Abort on unset variables and report a pipeline as failed if any stage fails.
# `-e` is not set, so a failing command does not by itself abort the script.
set -uo pipefail

# Defaults; the environment may pre-seed the threshold and the webhook, and
# command-line options override both.
DISK_THRESHOLD="${DISK_THRESHOLD:-80}"
DISCORD_WEBHOOK="${DISCORD_WEBHOOK:-}"
DRY_RUN=0
ALWAYS_POST=0

#######################################
# Parse command-line options into the global settings.
# Globals:   DISCORD_WEBHOOK, DISK_THRESHOLD, DRY_RUN, ALWAYS_POST (written)
# Arguments: the script's command-line arguments
# Outputs:   error message on stderr for a bad option
# Returns:   exits 1 on an unknown option or a missing option value
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --discord-webhook)
        if [[ $# -lt 2 ]]; then
          echo "Error: --discord-webhook requires a value" >&2
          exit 1
        fi
        DISCORD_WEBHOOK="$2"
        shift 2
        ;;
      --threshold)
        if [[ $# -lt 2 ]]; then
          echo "Error: --threshold requires a value" >&2
          exit 1
        fi
        DISK_THRESHOLD="$2"
        shift 2
        ;;
      --dry-run)
        DRY_RUN=1
        shift
        ;;
      --always-post)
        ALWAYS_POST=1
        shift
        ;;
      *)
        echo "Unknown option: $1" >&2
        exit 1
        ;;
    esac
  done
}

#######################################
# Ensure DISK_THRESHOLD is a plain non-negative integer.
# The value is later used in an arithmetic comparison, where a non-numeric
# string would be evaluated as an expression and a zero-padded one as octal.
# Globals:   DISK_THRESHOLD (read, rewritten in base 10)
# Outputs:   error message on stderr for an invalid value
# Returns:   exits 1 when the value is not made of digits only
#######################################
validate_threshold() {
  if [[ ! "$DISK_THRESHOLD" =~ ^[0-9]+$ ]]; then
    echo "Error: threshold must be a non-negative integer (got '${DISK_THRESHOLD}')" >&2
    exit 1
  fi
  # Force base 10 so that e.g. "090" becomes 90 instead of an octal error.
  DISK_THRESHOLD=$((10#$DISK_THRESHOLD))
}

parse_args "$@"
validate_threshold

# A real run cannot deliver anything without a webhook URL; only a dry run
# may proceed without one.
if (( DRY_RUN == 0 )) && [[ -z "$DISCORD_WEBHOOK" ]]; then
  printf '%s\n' "Error: DISCORD_WEBHOOK not set. Use --discord-webhook URL or set env var." >&2
  exit 1
fi

# log MESSAGE...
#   Print the arguments on stdout behind a "[YYYY-MM-DD HH:MM:SS]" timestamp.
log() {
  local stamp
  stamp=$(date '+%Y-%m-%d %H:%M:%S')
  printf '%s\n' "[${stamp}] $*"
}
# ---------------------------------------------------------------------------
# Check disk usage on all real filesystems
# ---------------------------------------------------------------------------
TRIGGERED=() # "<pct>% used — <mount> (<source>)" per filesystem at/above threshold
ALL_FS=()    # "<pct>% <mount> (<source>)" per filesystem examined

#######################################
# Classify filesystem usage rows read from stdin.
# Each row is "source pcent target" (pcent may carry a trailing "%"). The
# three fields are read directly; everything after the second field is kept
# as the mount point, so mount points containing spaces stay intact.
# Globals:   ALL_FS, TRIGGERED (appended to)
# Arguments: $1 - alert threshold in percent (default: 80)
# Returns:   0
#######################################
record_filesystems() {
  local threshold=${1:-80}
  local fs pct mount
  while read -r fs pct mount; do
    pct=${pct%\%}
    # Skip rows whose usage is not a plain number (df can print "-"); there
    # is nothing to compare against the threshold.
    [[ "$pct" =~ ^[0-9]+$ ]] || continue
    ALL_FS+=("${pct}% ${mount} (${fs})")
    if (( pct >= threshold )); then
      TRIGGERED+=("${pct}% used — ${mount} (${fs})")
    fi
  done
  return 0
}

# Ask df for exactly the three columns the loop consumes, drop the header
# row, and filter out tmpfs, devtmpfs, overlay and udev sources.
record_filesystems "${DISK_THRESHOLD:-}" < <(
  df --output=source,pcent,target |
    awk 'NR > 1 && $1 !~ /^(tmpfs|devtmpfs|overlay|udev)/'
)

# Short host name, used later in the embed title.
HOSTNAME=$(hostname -s)
# How many filesystems were recorded as at or above the threshold.
TRIGGERED_COUNT=${#TRIGGERED[@]}

log "Disk check complete: ${TRIGGERED_COUNT} filesystem(s) above ${DISK_THRESHOLD}%"

# Healthy run with neither --always-post nor --dry-run: stay quiet and stop
# before any payload is built.
if (( TRIGGERED_COUNT == 0 && ALWAYS_POST == 0 && DRY_RUN == 0 )); then
  log "All filesystems healthy — no alert needed."
  exit 0
fi

# ---------------------------------------------------------------------------
# Build Discord payload
# ---------------------------------------------------------------------------

#######################################
# Print a one-element JSON array holding a non-inline Discord embed field.
# Discord rejects an embed whose field value is empty or longer than 1024
# characters, so an empty value is replaced by a placeholder and an over-long
# value is cut to 1024 characters ending in "…".
# Arguments: $1 - field name
#            $2 - field value (may be empty)
# Outputs:   JSON array on stdout
#######################################
embed_field() {
  jq -n \
    --arg name "$1" \
    --arg value "$2" \
    --argjson max 1024 \
    '[{
      "name": $name,
      "value": (
        if $value == "" then "(none)"
        elif ($value | length) > $max then ($value | .[0:($max - 1)]) + "…"
        else $value
        end
      ),
      "inline": false
    }]'
}

if [[ "$TRIGGERED_COUNT" -gt 0 ]]; then
  EMBED_COLOR=15548997 # 0xED4245 red
  TITLE="🔴 ${HOSTNAME}: Disk usage above ${DISK_THRESHOLD}%"
  alert_lines=$(printf '⚠️ %s\n' "${TRIGGERED[@]}")
  FIELDS=$(embed_field "Filesystems Over Threshold" "$alert_lines")
else
  EMBED_COLOR=5763719 # 0x57F287 green
  TITLE="🟢 ${HOSTNAME}: All filesystems healthy"
  FIELDS='[]'
fi

# Add summary of all filesystems. The array is only expanded when it has
# elements: older Bash treats expanding an empty array as an unset variable
# under `set -u`.
all_lines=""
if [[ "${#ALL_FS[@]}" -gt 0 ]]; then
  all_lines=$(printf '%s\n' "${ALL_FS[@]}")
fi
summary_field=$(embed_field "All Filesystems" "$all_lines")
FIELDS=$(jq -n \
  --argjson head "$FIELDS" \
  --argjson tail "$summary_field" \
  '$head + $tail')

FOOTER="$(date -u '+%Y-%m-%d %H:%M UTC') · CT 302 self-health · threshold: ${DISK_THRESHOLD}%"

PAYLOAD=$(jq -n \
  --arg title "$TITLE" \
  --argjson color "$EMBED_COLOR" \
  --argjson fields "$FIELDS" \
  --arg footer "$FOOTER" \
  '{
    "embeds": [{
      "title": $title,
      "color": $color,
      "fields": $fields,
      "footer": {"text": $footer}
    }]
  }')

# Dry run: pretty-print the payload that was built and stop before anything
# is sent.
if (( DRY_RUN == 1 )); then
  log "DRY RUN — Discord payload:"
  jq . <<<"$PAYLOAD"
  exit 0
fi

# ---------------------------------------------------------------------------
# Post to Discord
# ---------------------------------------------------------------------------
# The response body is captured in a private mktemp file rather than a fixed
# /tmp path (a predictable name can be pre-created or symlinked by another
# user), and the file is removed on every exit path.
RESPONSE_FILE=$(mktemp) || {
  log "Error: could not create a temporary file for the Discord response."
  exit 1
}
trap 'rm -f -- "$RESPONSE_FILE"' EXIT

log "Posting to Discord..."
# --max-time bounds the whole request so a stalled connection cannot hang the
# run; curl then reports HTTP code 000, which is handled as a failure below.
HTTP_STATUS=$(curl -s --max-time 30 \
  -o "$RESPONSE_FILE" \
  -w "%{http_code}" \
  -X POST "$DISCORD_WEBHOOK" \
  -H "Content-Type: application/json" \
  -d "$PAYLOAD")

# Only a 2xx status counts as delivered; anything else (including an empty
# status) is reported together with the response body.
if [[ "$HTTP_STATUS" =~ ^2[0-9][0-9]$ ]]; then
  log "Discord notification sent (HTTP ${HTTP_STATUS})."
else
  log "Warning: Discord returned HTTP ${HTTP_STATUS}."
  cat -- "$RESPONSE_FILE" >&2
  exit 1
fi