From 48a804dda2ac2be9ede61e1ca042191b3eb68a76 Mon Sep 17 00:00:00 2001 From: Cal Corum Date: Fri, 3 Apr 2026 15:39:35 -0500 Subject: [PATCH 1/6] feat: right-size VM 115 config and add --hosts flag to audit script MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reduce VM 115 (docker-sba) from 16 vCPUs (2×8) to 8 vCPUs (1×8) to match actual workload (0.06 load/core). Add --hosts flag to homelab-audit.sh for targeted post-change audits. Closes #18 Co-Authored-By: Claude Opus 4.6 (1M context) --- monitoring/scripts/homelab-audit.sh | 3 +-- monitoring/scripts/test-audit-collectors.sh | 28 +++++++++++++++++++++ server-configs/proxmox/qemu/115.conf | 2 +- 3 files changed, 30 insertions(+), 3 deletions(-) diff --git a/monitoring/scripts/homelab-audit.sh b/monitoring/scripts/homelab-audit.sh index 92d4609..55c8c1c 100755 --- a/monitoring/scripts/homelab-audit.sh +++ b/monitoring/scripts/homelab-audit.sh @@ -5,7 +5,7 @@ # to collect system metrics, then generates a summary report. # # Usage: -# homelab-audit.sh [--output-dir DIR] +# homelab-audit.sh [--output-dir DIR] [--hosts label:ip,label:ip,...] 
# # Environment overrides: # STUCK_PROC_CPU_WARN CPU% at which a D-state process is flagged (default: 10) @@ -29,7 +29,6 @@ LOAD_WARN=2.0 MEM_WARN=85 ZOMBIE_WARN=1 SWAP_WARN=512 - HOSTS_FILTER="" # comma-separated host list from --hosts; empty = audit all JSON_OUTPUT=0 # set to 1 by --json diff --git a/monitoring/scripts/test-audit-collectors.sh b/monitoring/scripts/test-audit-collectors.sh index 149aa98..ef37103 100644 --- a/monitoring/scripts/test-audit-collectors.sh +++ b/monitoring/scripts/test-audit-collectors.sh @@ -93,6 +93,34 @@ else fail "disk_usage" "expected 'N /path', got: '$result'" fi +# --- --hosts flag parsing --- +echo "" +echo "=== --hosts argument parsing tests ===" + +# Single host +input="vm-115:10.10.0.88" +IFS=',' read -ra entries <<<"$input" +label="${entries[0]%%:*}" +addr="${entries[0]#*:}" +if [[ "$label" == "vm-115" && "$addr" == "10.10.0.88" ]]; then + pass "--hosts single entry parsed: $label $addr" +else + fail "--hosts single" "expected 'vm-115 10.10.0.88', got: '$label $addr'" +fi + +# Multiple hosts +input="vm-115:10.10.0.88,lxc-225:10.10.0.225" +IFS=',' read -ra entries <<<"$input" +label1="${entries[0]%%:*}" +addr1="${entries[0]#*:}" +label2="${entries[1]%%:*}" +addr2="${entries[1]#*:}" +if [[ "$label1" == "vm-115" && "$addr1" == "10.10.0.88" && "$label2" == "lxc-225" && "$addr2" == "10.10.0.225" ]]; then + pass "--hosts multi entry parsed: $label1 $addr1, $label2 $addr2" +else + fail "--hosts multi" "unexpected parse result" +fi + echo "" echo "=== Results: $PASS passed, $FAIL failed ===" ((FAIL == 0)) diff --git a/server-configs/proxmox/qemu/115.conf b/server-configs/proxmox/qemu/115.conf index 6474b44..4cf45c7 100644 --- a/server-configs/proxmox/qemu/115.conf +++ b/server-configs/proxmox/qemu/115.conf @@ -12,5 +12,5 @@ ostype: l26 scsi0: local-lvm:vm-115-disk-0,size=256G scsihw: virtio-scsi-pci smbios1: uuid=19be98ee-f60d-473d-acd2-9164717fcd11 -sockets: 2 +sockets: 1 vmgenid: 682dfeab-8c63-4f0b-8ed2-8828c2f808ef From 
29a20fbe06f8addaa0ca9894a3be61dde5cffa5b Mon Sep 17 00:00:00 2001 From: Cal Corum Date: Fri, 3 Apr 2026 16:17:55 -0500 Subject: [PATCH 2/6] feat: add monthly Proxmox maintenance reboot automation (#26) Establishes a first-Sunday-of-the-month maintenance window orchestrated by Ansible on LXC 304. Split into two playbooks to handle the self-reboot paradox (the controller is a guest on the host being rebooted): - monthly-reboot.yml: snapshots, tiered shutdown with per-guest polling, fire-and-forget host reboot - post-reboot-startup.yml: controlled tiered startup with staggered delays, Pi-hole UDP DNS fix, validation, and snapshot cleanup Also fixes onboot:1 on VM 109, LXC 221, LXC 223 and creates a recurring Google Calendar event for the maintenance window. Closes #26 Co-Authored-By: Claude Opus 4.6 (1M context) --- ansible/playbooks/monthly-reboot.yml | 265 ++++++++++++++++++ ansible/playbooks/post-reboot-startup.yml | 214 ++++++++++++++ .../systemd/ansible-monthly-reboot.service | 15 + ansible/systemd/ansible-monthly-reboot.timer | 13 + ansible/systemd/ansible-post-reboot.service | 21 ++ server-configs/proxmox/maintenance-reboot.md | 98 +++++-- 6 files changed, 595 insertions(+), 31 deletions(-) create mode 100644 ansible/playbooks/monthly-reboot.yml create mode 100644 ansible/playbooks/post-reboot-startup.yml create mode 100644 ansible/systemd/ansible-monthly-reboot.service create mode 100644 ansible/systemd/ansible-monthly-reboot.timer create mode 100644 ansible/systemd/ansible-post-reboot.service diff --git a/ansible/playbooks/monthly-reboot.yml b/ansible/playbooks/monthly-reboot.yml new file mode 100644 index 0000000..f3a77c8 --- /dev/null +++ b/ansible/playbooks/monthly-reboot.yml @@ -0,0 +1,265 @@ +--- +# Monthly Proxmox Maintenance Reboot — Shutdown & Reboot +# +# Orchestrates a graceful shutdown of all guests in dependency order, +# then issues a fire-and-forget reboot to the Proxmox host. 
+# +# After the host reboots, LXC 304 auto-starts via onboot:1 and the +# post-reboot-startup.yml playbook runs automatically via the +# ansible-post-reboot.service systemd unit (triggered by @reboot). +# +# Schedule: 1st Sunday of each month, 08:00 UTC (3 AM ET) +# Controller: LXC 304 (ansible-controller) at 10.10.0.232 +# +# Usage: +# # Dry run +# ansible-playbook /opt/ansible/playbooks/monthly-reboot.yml --check +# +# # Full execution +# ansible-playbook /opt/ansible/playbooks/monthly-reboot.yml +# +# # Shutdown only (skip the host reboot) +# ansible-playbook /opt/ansible/playbooks/monthly-reboot.yml --tags shutdown +# +# Note: VM 109 (homeassistant) is excluded from Ansible inventory +# (self-managed via HA Supervisor) but is included in pvesh start/stop. + +- name: Pre-reboot health check and snapshots + hosts: pve-node + gather_facts: false + tags: [pre-reboot, shutdown] + + tasks: + - name: Check Proxmox cluster health + ansible.builtin.command: pvesh get /cluster/status --output-format json + register: cluster_status + changed_when: false + + - name: Get list of running QEMU VMs + ansible.builtin.shell: > + pvesh get /nodes/proxmox/qemu --output-format json | + python3 -c "import sys,json; [print(vm['vmid']) for vm in json.load(sys.stdin) if vm.get('status')=='running']" + register: running_vms + changed_when: false + + - name: Get list of running LXC containers + ansible.builtin.shell: > + pvesh get /nodes/proxmox/lxc --output-format json | + python3 -c "import sys,json; [print(ct['vmid']) for ct in json.load(sys.stdin) if ct.get('status')=='running']" + register: running_lxcs + changed_when: false + + - name: Display running guests + ansible.builtin.debug: + msg: "Running VMs: {{ running_vms.stdout_lines }} | Running LXCs: {{ running_lxcs.stdout_lines }}" + + - name: Snapshot running VMs + ansible.builtin.command: > + pvesh create /nodes/proxmox/qemu/{{ item }}/snapshot + --snapname pre-maintenance-{{ lookup('pipe', 'date +%Y-%m-%d') }} + --description 
"Auto snapshot before monthly maintenance reboot" + loop: "{{ running_vms.stdout_lines }}" + when: running_vms.stdout_lines | length > 0 + ignore_errors: true + + - name: Snapshot running LXCs + ansible.builtin.command: > + pvesh create /nodes/proxmox/lxc/{{ item }}/snapshot + --snapname pre-maintenance-{{ lookup('pipe', 'date +%Y-%m-%d') }} + --description "Auto snapshot before monthly maintenance reboot" + loop: "{{ running_lxcs.stdout_lines }}" + when: running_lxcs.stdout_lines | length > 0 + ignore_errors: true + +- name: "Shutdown Tier 4 — Media & Others" + hosts: pve-node + gather_facts: false + tags: [shutdown] + + vars: + tier4_vms: [109] + # LXC 303 (mcp-gateway) is onboot=0 and operator-managed — not included here + tier4_lxcs: [221, 222, 223, 302] + + tasks: + - name: Shutdown Tier 4 VMs + ansible.builtin.command: pvesh create /nodes/proxmox/qemu/{{ item }}/status/shutdown + loop: "{{ tier4_vms }}" + ignore_errors: true + + - name: Shutdown Tier 4 LXCs + ansible.builtin.command: pvesh create /nodes/proxmox/lxc/{{ item }}/status/shutdown + loop: "{{ tier4_lxcs }}" + ignore_errors: true + + - name: Wait for Tier 4 VMs to stop + ansible.builtin.shell: > + pvesh get /nodes/proxmox/qemu/{{ item }}/status/current --output-format json | + python3 -c "import sys,json; print(json.load(sys.stdin).get('status','unknown'))" + register: t4_vm_status + until: t4_vm_status.stdout.strip() == "stopped" + retries: 12 + delay: 5 + loop: "{{ tier4_vms }}" + ignore_errors: true + + - name: Wait for Tier 4 LXCs to stop + ansible.builtin.shell: > + pvesh get /nodes/proxmox/lxc/{{ item }}/status/current --output-format json | + python3 -c "import sys,json; print(json.load(sys.stdin).get('status','unknown'))" + register: t4_lxc_status + until: t4_lxc_status.stdout.strip() == "stopped" + retries: 12 + delay: 5 + loop: "{{ tier4_lxcs }}" + ignore_errors: true + +- name: "Shutdown Tier 3 — Applications" + hosts: pve-node + gather_facts: false + tags: [shutdown] + + vars: + 
tier3_vms: [115, 110] + tier3_lxcs: [301] + + tasks: + - name: Shutdown Tier 3 VMs + ansible.builtin.command: pvesh create /nodes/proxmox/qemu/{{ item }}/status/shutdown + loop: "{{ tier3_vms }}" + ignore_errors: true + + - name: Shutdown Tier 3 LXCs + ansible.builtin.command: pvesh create /nodes/proxmox/lxc/{{ item }}/status/shutdown + loop: "{{ tier3_lxcs }}" + ignore_errors: true + + - name: Wait for Tier 3 VMs to stop + ansible.builtin.shell: > + pvesh get /nodes/proxmox/qemu/{{ item }}/status/current --output-format json | + python3 -c "import sys,json; print(json.load(sys.stdin).get('status','unknown'))" + register: t3_vm_status + until: t3_vm_status.stdout.strip() == "stopped" + retries: 12 + delay: 5 + loop: "{{ tier3_vms }}" + ignore_errors: true + + - name: Wait for Tier 3 LXCs to stop + ansible.builtin.shell: > + pvesh get /nodes/proxmox/lxc/{{ item }}/status/current --output-format json | + python3 -c "import sys,json; print(json.load(sys.stdin).get('status','unknown'))" + register: t3_lxc_status + until: t3_lxc_status.stdout.strip() == "stopped" + retries: 12 + delay: 5 + loop: "{{ tier3_lxcs }}" + ignore_errors: true + +- name: "Shutdown Tier 2 — Infrastructure" + hosts: pve-node + gather_facts: false + tags: [shutdown] + + vars: + tier2_vms: [106, 116] + tier2_lxcs: [225, 210, 227] + + tasks: + - name: Shutdown Tier 2 VMs + ansible.builtin.command: pvesh create /nodes/proxmox/qemu/{{ item }}/status/shutdown + loop: "{{ tier2_vms }}" + ignore_errors: true + + - name: Shutdown Tier 2 LXCs + ansible.builtin.command: pvesh create /nodes/proxmox/lxc/{{ item }}/status/shutdown + loop: "{{ tier2_lxcs }}" + ignore_errors: true + + - name: Wait for Tier 2 VMs to stop + ansible.builtin.shell: > + pvesh get /nodes/proxmox/qemu/{{ item }}/status/current --output-format json | + python3 -c "import sys,json; print(json.load(sys.stdin).get('status','unknown'))" + register: t2_vm_status + until: t2_vm_status.stdout.strip() == "stopped" + retries: 12 + delay: 5 + 
loop: "{{ tier2_vms }}" + ignore_errors: true + + - name: Wait for Tier 2 LXCs to stop + ansible.builtin.shell: > + pvesh get /nodes/proxmox/lxc/{{ item }}/status/current --output-format json | + python3 -c "import sys,json; print(json.load(sys.stdin).get('status','unknown'))" + register: t2_lxc_status + until: t2_lxc_status.stdout.strip() == "stopped" + retries: 12 + delay: 5 + loop: "{{ tier2_lxcs }}" + ignore_errors: true + +- name: "Shutdown Tier 1 — Databases" + hosts: pve-node + gather_facts: false + tags: [shutdown] + + vars: + tier1_vms: [112] + + tasks: + - name: Shutdown database VMs + ansible.builtin.command: pvesh create /nodes/proxmox/qemu/{{ item }}/status/shutdown + loop: "{{ tier1_vms }}" + ignore_errors: true + + - name: Wait for database VMs to stop (up to 90s) + ansible.builtin.shell: > + pvesh get /nodes/proxmox/qemu/{{ item }}/status/current --output-format json | + python3 -c "import sys,json; print(json.load(sys.stdin).get('status','unknown'))" + register: t1_vm_status + until: t1_vm_status.stdout.strip() == "stopped" + retries: 18 + delay: 5 + loop: "{{ tier1_vms }}" + ignore_errors: true + + - name: Force stop database VMs if still running + ansible.builtin.shell: > + status=$(pvesh get /nodes/proxmox/qemu/{{ item }}/status/current --output-format json | + python3 -c "import sys,json; print(json.load(sys.stdin).get('status','unknown'))"); + if [ "$status" = "running" ]; then + pvesh create /nodes/proxmox/qemu/{{ item }}/status/stop; + echo "Force stopped VM {{ item }}"; + else + echo "VM {{ item }} already stopped"; + fi + loop: "{{ tier1_vms }}" + register: force_stop_result + changed_when: force_stop_result.results | default([]) | selectattr('stdout', 'defined') | selectattr('stdout', 'search', 'Force stopped') | list | length > 0 + +- name: "Verify and reboot Proxmox host" + hosts: pve-node + gather_facts: false + tags: [reboot] + + tasks: + - name: Verify all guests are stopped (excluding LXC 304) + ansible.builtin.shell: > + 
running_vms=$(pvesh get /nodes/proxmox/qemu --output-format json | + python3 -c "import sys,json; vms=[v for v in json.load(sys.stdin) if v.get('status')=='running']; print(len(vms))"); + running_lxcs=$(pvesh get /nodes/proxmox/lxc --output-format json | + python3 -c "import sys,json; cts=[c for c in json.load(sys.stdin) if c.get('status')=='running' and c['vmid'] != 304]; print(len(cts))"); + echo "Running VMs: $running_vms, Running LXCs: $running_lxcs"; + if [ "$running_vms" != "0" ] || [ "$running_lxcs" != "0" ]; then exit 1; fi + register: verify_stopped + + - name: Issue fire-and-forget reboot (controller will be killed) + ansible.builtin.shell: > + nohup bash -c 'sleep 10 && reboot' &>/dev/null & + echo "Reboot scheduled in 10 seconds" + register: reboot_issued + when: not ansible_check_mode + + - name: Log reboot issued + ansible.builtin.debug: + msg: "{{ reboot_issued.stdout }} — Ansible process will terminate when host reboots. Post-reboot startup handled by ansible-post-reboot.service on LXC 304." diff --git a/ansible/playbooks/post-reboot-startup.yml b/ansible/playbooks/post-reboot-startup.yml new file mode 100644 index 0000000..d05c77c --- /dev/null +++ b/ansible/playbooks/post-reboot-startup.yml @@ -0,0 +1,214 @@ +--- +# Post-Reboot Startup — Controlled Guest Startup After Proxmox Reboot +# +# Starts all guests in dependency order with staggered delays to avoid +# I/O storms. Runs automatically via ansible-post-reboot.service on +# LXC 304 after the Proxmox host reboots. +# +# Can also be run manually: +# ansible-playbook /opt/ansible/playbooks/post-reboot-startup.yml +# +# Note: VM 109 (homeassistant) is excluded from Ansible inventory +# (self-managed via HA Supervisor) but is included in pvesh start/stop. 
+ +- name: Wait for Proxmox API to be ready + hosts: pve-node + gather_facts: false + tags: [startup] + + tasks: + - name: Wait for Proxmox API + ansible.builtin.command: pvesh get /version --output-format json + register: pve_version + until: pve_version.rc == 0 + retries: 30 + delay: 10 + changed_when: false + + - name: Display Proxmox version + ansible.builtin.debug: + msg: "Proxmox API ready: {{ pve_version.stdout | from_json | json_query('version') | default('unknown') }}" + +- name: "Startup Tier 1 — Databases" + hosts: pve-node + gather_facts: false + tags: [startup] + + tasks: + - name: Start database VM (112) + ansible.builtin.command: pvesh create /nodes/proxmox/qemu/112/status/start + ignore_errors: true + + - name: Wait for VM 112 to be running + ansible.builtin.shell: > + pvesh get /nodes/proxmox/qemu/112/status/current --output-format json | + python3 -c "import sys,json; print(json.load(sys.stdin).get('status','unknown'))" + register: db_status + until: db_status.stdout.strip() == "running" + retries: 12 + delay: 5 + changed_when: false + + - name: Wait for database services to initialize + ansible.builtin.pause: + seconds: 30 + +- name: "Startup Tier 2 — Infrastructure" + hosts: pve-node + gather_facts: false + tags: [startup] + + vars: + tier2_vms: [106, 116] + tier2_lxcs: [225, 210, 227] + + tasks: + - name: Start Tier 2 VMs + ansible.builtin.command: pvesh create /nodes/proxmox/qemu/{{ item }}/status/start + loop: "{{ tier2_vms }}" + ignore_errors: true + + - name: Start Tier 2 LXCs + ansible.builtin.command: pvesh create /nodes/proxmox/lxc/{{ item }}/status/start + loop: "{{ tier2_lxcs }}" + ignore_errors: true + + - name: Wait for infrastructure to come up + ansible.builtin.pause: + seconds: 30 + +- name: "Startup Tier 3 — Applications" + hosts: pve-node + gather_facts: false + tags: [startup] + + vars: + tier3_vms: [115, 110] + tier3_lxcs: [301] + + tasks: + - name: Start Tier 3 VMs + ansible.builtin.command: pvesh create 
/nodes/proxmox/qemu/{{ item }}/status/start + loop: "{{ tier3_vms }}" + ignore_errors: true + + - name: Start Tier 3 LXCs + ansible.builtin.command: pvesh create /nodes/proxmox/lxc/{{ item }}/status/start + loop: "{{ tier3_lxcs }}" + ignore_errors: true + + - name: Wait for applications to start + ansible.builtin.pause: + seconds: 30 + + - name: Restart Pi-hole container via SSH (UDP DNS fix) + ansible.builtin.command: ssh docker-home "docker restart pihole" + ignore_errors: true + + - name: Wait for Pi-hole to stabilize + ansible.builtin.pause: + seconds: 10 + +- name: "Startup Tier 4 — Media & Others" + hosts: pve-node + gather_facts: false + tags: [startup] + + vars: + tier4_vms: [109] + tier4_lxcs: [221, 222, 223, 302] + + tasks: + - name: Start Tier 4 VMs + ansible.builtin.command: pvesh create /nodes/proxmox/qemu/{{ item }}/status/start + loop: "{{ tier4_vms }}" + ignore_errors: true + + - name: Start Tier 4 LXCs + ansible.builtin.command: pvesh create /nodes/proxmox/lxc/{{ item }}/status/start + loop: "{{ tier4_lxcs }}" + ignore_errors: true + +- name: Post-reboot validation + hosts: pve-node + gather_facts: false + tags: [startup, validate] + + tasks: + - name: Wait for all services to initialize + ansible.builtin.pause: + seconds: 60 + + - name: Check all expected VMs are running + ansible.builtin.shell: > + pvesh get /nodes/proxmox/qemu --output-format json | + python3 -c " + import sys, json + vms = json.load(sys.stdin) + expected = {106, 109, 110, 112, 115, 116} + running = {v['vmid'] for v in vms if v.get('status') == 'running'} + missing = expected - running + if missing: + print(f'WARN: VMs not running: {missing}') + sys.exit(1) + print(f'All expected VMs running: {running & expected}') + " + register: vm_check + ignore_errors: true + + - name: Check all expected LXCs are running + ansible.builtin.shell: > + pvesh get /nodes/proxmox/lxc --output-format json | + python3 -c " + import sys, json + cts = json.load(sys.stdin) + # LXC 303 (mcp-gateway) 
intentionally excluded — onboot=0, operator-managed + expected = {210, 221, 222, 223, 225, 227, 301, 302, 304} + running = {c['vmid'] for c in cts if c.get('status') == 'running'} + missing = expected - running + if missing: + print(f'WARN: LXCs not running: {missing}') + sys.exit(1) + print(f'All expected LXCs running: {running & expected}') + " + register: lxc_check + ignore_errors: true + + - name: Clean up old maintenance snapshots (older than 7 days) + ansible.builtin.shell: > + cutoff=$(date -d '7 days ago' +%s); + for vmid in $(pvesh get /nodes/proxmox/qemu --output-format json | + python3 -c "import sys,json; [print(v['vmid']) for v in json.load(sys.stdin)]"); do + for snap in $(pvesh get /nodes/proxmox/qemu/$vmid/snapshot --output-format json | + python3 -c "import sys,json; [print(s['name']) for s in json.load(sys.stdin) if s['name'].startswith('pre-maintenance-')]" 2>/dev/null); do + snap_date=$(echo $snap | sed 's/pre-maintenance-//'); + snap_epoch=$(date -d "$snap_date" +%s 2>/dev/null); + if [ -z "$snap_epoch" ]; then + echo "WARN: could not parse date for snapshot $snap on VM $vmid"; + elif [ "$snap_epoch" -lt "$cutoff" ]; then + pvesh delete /nodes/proxmox/qemu/$vmid/snapshot/$snap && echo "Deleted $snap from VM $vmid"; + fi + done + done; + for ctid in $(pvesh get /nodes/proxmox/lxc --output-format json | + python3 -c "import sys,json; [print(c['vmid']) for c in json.load(sys.stdin)]"); do + for snap in $(pvesh get /nodes/proxmox/lxc/$ctid/snapshot --output-format json | + python3 -c "import sys,json; [print(s['name']) for s in json.load(sys.stdin) if s['name'].startswith('pre-maintenance-')]" 2>/dev/null); do + snap_date=$(echo $snap | sed 's/pre-maintenance-//'); + snap_epoch=$(date -d "$snap_date" +%s 2>/dev/null); + if [ -z "$snap_epoch" ]; then + echo "WARN: could not parse date for snapshot $snap on LXC $ctid"; + elif [ "$snap_epoch" -lt "$cutoff" ]; then + pvesh delete /nodes/proxmox/lxc/$ctid/snapshot/$snap && echo "Deleted $snap from LXC 
$ctid"; + fi + done + done; + echo "Snapshot cleanup complete" + ignore_errors: true + + - name: Display validation results + ansible.builtin.debug: + msg: + - "VM status: {{ vm_check.stdout }}" + - "LXC status: {{ lxc_check.stdout }}" + - "Maintenance reboot complete — post-reboot startup finished" diff --git a/ansible/systemd/ansible-monthly-reboot.service b/ansible/systemd/ansible-monthly-reboot.service new file mode 100644 index 0000000..02b2db2 --- /dev/null +++ b/ansible/systemd/ansible-monthly-reboot.service @@ -0,0 +1,15 @@ +[Unit] +Description=Monthly Proxmox maintenance reboot (Ansible) +After=network-online.target +Wants=network-online.target + +[Service] +Type=oneshot +User=cal +WorkingDirectory=/opt/ansible +ExecStart=/usr/bin/ansible-playbook /opt/ansible/playbooks/monthly-reboot.yml +StandardOutput=append:/opt/ansible/logs/monthly-reboot.log +StandardError=append:/opt/ansible/logs/monthly-reboot.log +TimeoutStartSec=900 + +# No [Install] section — this service is activated exclusively by ansible-monthly-reboot.timer diff --git a/ansible/systemd/ansible-monthly-reboot.timer b/ansible/systemd/ansible-monthly-reboot.timer new file mode 100644 index 0000000..5711dda --- /dev/null +++ b/ansible/systemd/ansible-monthly-reboot.timer @@ -0,0 +1,13 @@ +[Unit] +Description=Monthly Proxmox maintenance reboot timer +Documentation=https://git.manticorum.com/cal/claude-home/src/branch/main/server-configs/proxmox/maintenance-reboot.md + +[Timer] +# First Sunday of the month at 08:00 UTC (3:00 AM ET during EDT) +# Day range 01-07 ensures it's always the first occurrence of that weekday +OnCalendar=Sun *-*-01..07 08:00:00 +Persistent=true +RandomizedDelaySec=600 + +[Install] +WantedBy=timers.target diff --git a/ansible/systemd/ansible-post-reboot.service b/ansible/systemd/ansible-post-reboot.service new file mode 100644 index 0000000..132ac6b --- /dev/null +++ b/ansible/systemd/ansible-post-reboot.service @@ -0,0 +1,21 @@ +[Unit] +Description=Post-reboot controlled 
guest startup (Ansible) +After=network-online.target +Wants=network-online.target +# Only run after a fresh boot — not on service restart +ConditionUpTimeSec=600 + +[Service] +Type=oneshot +User=cal +WorkingDirectory=/opt/ansible +# Delay 120s to let Proxmox API stabilize and onboot guests settle +ExecStartPre=/bin/sleep 120 +ExecStart=/usr/bin/ansible-playbook /opt/ansible/playbooks/post-reboot-startup.yml +StandardOutput=append:/opt/ansible/logs/post-reboot-startup.log +StandardError=append:/opt/ansible/logs/post-reboot-startup.log +TimeoutStartSec=1800 + +[Install] +# Runs automatically on every boot of LXC 304 +WantedBy=multi-user.target diff --git a/server-configs/proxmox/maintenance-reboot.md b/server-configs/proxmox/maintenance-reboot.md index 0c72d5a..36e63da 100644 --- a/server-configs/proxmox/maintenance-reboot.md +++ b/server-configs/proxmox/maintenance-reboot.md @@ -14,7 +14,7 @@ tags: [proxmox, maintenance, reboot, ansible, operations, systemd] |--------|-------| | **Schedule** | 1st Sunday of every month, 3:00 AM ET (08:00 UTC) | | **Expected downtime** | ~15 minutes (host reboot + VM/LXC startup) | -| **Orchestration** | Ansible playbook on LXC 304 (ansible-controller) | +| **Orchestration** | Ansible on LXC 304 — shutdown playbook → host reboot → post-reboot startup playbook | | **Calendar** | Google Calendar recurring event: "Proxmox Monthly Maintenance Reboot" | | **HA DNS** | ubuntu-manticore (10.10.0.226) provides Pi-hole 2 during Proxmox downtime | @@ -24,16 +24,25 @@ tags: [proxmox, maintenance, reboot, ansible, operations, systemd] - Long uptimes allow memory leaks and process state drift (e.g., avahi busy-loops) - Validates that all VMs/LXCs auto-start cleanly with `onboot: 1` +## Architecture + +The reboot is split into two playbooks because LXC 304 (the Ansible controller) is itself a guest on the Proxmox host being rebooted: + +1. 
**`monthly-reboot.yml`** — Snapshots all guests, shuts them down in dependency order, issues a fire-and-forget `reboot` to the Proxmox host, then exits. LXC 304 is killed when the host reboots. +2. **`post-reboot-startup.yml`** — After the host reboots, LXC 304 auto-starts via `onboot: 1`. A systemd service (`ansible-post-reboot.service`) waits 120 seconds for the Proxmox API to stabilize, then starts all guests in dependency order with staggered delays. + +The `onboot: 1` flag on all production guests acts as a safety net — even if the post-reboot playbook fails, Proxmox will start everything (though without controlled ordering). + ## Prerequisites (Before Maintenance) - [ ] Verify no active Tdarr transcodes on ubuntu-manticore - [ ] Verify no running database backups -- [ ] Switch workstation DNS to `1.1.1.1` (Pi-hole 1 on VM 106 will be offline) +- [ ] Ensure workstation has Pi-hole 2 (10.10.0.226) as a fallback DNS server so it fails over automatically during downtime - [ ] Confirm ubuntu-manticore Pi-hole 2 is healthy: `ssh manticore "docker exec pihole pihole status"` ## `onboot` Audit -All production VMs and LXCs must have `onboot: 1` so they restart automatically if the playbook fails mid-sequence. +All production VMs and LXCs must have `onboot: 1` so they restart automatically as a safety net. 
**Check VMs:** ```bash @@ -55,18 +64,18 @@ done" **Audit results (2026-04-03):** -| ID | Name | Type | `onboot` | Action needed | -|----|------|------|----------|---------------| +| ID | Name | Type | `onboot` | Status | +|----|------|------|----------|--------| | 106 | docker-home | VM | 1 | OK | -| 109 | homeassistant | VM | NOT SET | **Add `onboot: 1`** | +| 109 | homeassistant | VM | 1 | OK (fixed 2026-04-03) | | 110 | discord-bots | VM | 1 | OK | | 112 | databases-bots | VM | 1 | OK | | 115 | docker-sba | VM | 1 | OK | | 116 | docker-home-servers | VM | 1 | OK | | 210 | docker-n8n-lxc | LXC | 1 | OK | -| 221 | arr-stack | LXC | NOT SET | **Add `onboot: 1`** | +| 221 | arr-stack | LXC | 1 | OK (fixed 2026-04-03) | | 222 | memos | LXC | 1 | OK | -| 223 | foundry-lxc | LXC | NOT SET | **Add `onboot: 1`** | +| 223 | foundry-lxc | LXC | 1 | OK (fixed 2026-04-03) | | 225 | gitea | LXC | 1 | OK | | 227 | uptime-kuma | LXC | 1 | OK | | 301 | claude-discord-coordinator | LXC | 1 | OK | @@ -74,16 +83,15 @@ done" | 303 | mcp-gateway | LXC | 0 | Intentional (on-demand) | | 304 | ansible-controller | LXC | 1 | OK | -**Fix missing `onboot`:** +**If any production guest is missing `onboot: 1`:** ```bash -ssh proxmox "qm set 109 --onboot 1" -ssh proxmox "pct set 221 --onboot 1" -ssh proxmox "pct set 223 --onboot 1" +ssh proxmox "qm set --onboot 1" # for VMs +ssh proxmox "pct set --onboot 1" # for LXCs ``` ## Shutdown Order (Dependency-Aware) -Reverse of the validated startup sequence. Stop consumers before their dependencies. +Reverse of the validated startup sequence. Stop consumers before their dependencies. Each tier polls per-guest status rather than using fixed waits. 
``` Tier 4 — Media & Others (no downstream dependents) @@ -92,7 +100,6 @@ Tier 4 — Media & Others (no downstream dependents) LXC 222 memos LXC 223 foundry-lxc LXC 302 claude-runner - LXC 303 mcp-gateway (if running) Tier 3 — Applications (depend on databases + infra) VM 115 docker-sba (Paper Dynasty, Major Domo) @@ -107,21 +114,19 @@ Tier 2 — Infrastructure + DNS (depend on databases) VM 116 docker-home-servers Tier 1 — Databases (no dependencies, shut down last) - VM 112 databases-bots + VM 112 databases-bots (force-stop after 90s if ACPI ignored) -Tier 0 — Ansible controller shuts itself down last - LXC 304 ansible-controller - -→ Proxmox host reboots +→ LXC 304 issues fire-and-forget reboot to Proxmox host, then is killed ``` **Known quirks:** -- VM 112 (databases-bots) may ignore ACPI shutdown — use `--forceStop` after timeout +- VM 112 (databases-bots) may ignore ACPI shutdown — playbook force-stops after 90s - VM 109 (homeassistant) is self-managed via HA Supervisor, excluded from Ansible inventory +- LXC 303 (mcp-gateway) has `onboot: 0` and is operator-managed — not included in shutdown/startup. If it was running before maintenance, bring it up manually afterward ## Startup Order (Staggered) -After the Proxmox host reboots, guests with `onboot: 1` will auto-start. 
The Ansible playbook overrides this with a controlled sequence: +After the Proxmox host reboots, LXC 304 auto-starts and the `ansible-post-reboot.service` waits 120s before running the controlled startup: ``` Tier 1 — Databases first @@ -142,8 +147,8 @@ Tier 3 — Applications LXC 301 claude-discord-coordinator → wait 30s -Pi-hole fix — restart container to clear UDP DNS bug - qm guest exec 106 -- docker restart pihole +Pi-hole fix — restart container via SSH to clear UDP DNS bug + ssh docker-home "docker restart pihole" → wait 10s Tier 4 — Media & Others @@ -151,6 +156,7 @@ Tier 4 — Media & Others LXC 221 arr-stack LXC 222 memos LXC 223 foundry-lxc + LXC 302 claude-runner ``` ## Post-Reboot Validation @@ -161,28 +167,35 @@ Tier 4 — Media & Others - [ ] Discord bots responding (check Discord) - [ ] Uptime Kuma dashboard green: `curl -sf http://10.10.0.227:3001/api/status-page/homelab` - [ ] Home Assistant running: `curl -sf http://10.10.0.109:8123/api/ -H 'Authorization: Bearer '` -- [ ] Switch workstation DNS back from `1.1.1.1` to Pi-hole +- [ ] Maintenance snapshots cleaned up (auto, 7-day retention) ## Automation -### Ansible Playbook +### Ansible Playbooks -Located at `/opt/ansible/playbooks/monthly-reboot.yml` on LXC 304. +Both located at `/opt/ansible/playbooks/` on LXC 304. 
```bash -# Dry run (check mode) +# Dry run — shutdown only ssh ansible "ansible-playbook /opt/ansible/playbooks/monthly-reboot.yml --check" -# Manual execution +# Manual full execution — shutdown + reboot ssh ansible "ansible-playbook /opt/ansible/playbooks/monthly-reboot.yml" -# Limit to shutdown only (skip reboot) +# Manual post-reboot startup (if automatic startup failed) +ssh ansible "ansible-playbook /opt/ansible/playbooks/post-reboot-startup.yml" + +# Shutdown only — skip the host reboot ssh ansible "ansible-playbook /opt/ansible/playbooks/monthly-reboot.yml --tags shutdown" ``` -### Systemd Timer +### Systemd Units (on LXC 304) -The playbook runs automatically via systemd timer on LXC 304: +| Unit | Purpose | Schedule | +|------|---------|----------| +| `ansible-monthly-reboot.timer` | Triggers shutdown + reboot playbook | 1st Sunday of month, 08:00 UTC | +| `ansible-monthly-reboot.service` | Runs `monthly-reboot.yml` | Activated by timer | +| `ansible-post-reboot.service` | Runs `post-reboot-startup.yml` | On boot (multi-user.target), only if uptime < 10 min | ```bash # Check timer status @@ -191,10 +204,32 @@ ssh ansible "systemctl status ansible-monthly-reboot.timer" # Next scheduled run ssh ansible "systemctl list-timers ansible-monthly-reboot.timer" +# Check post-reboot service status +ssh ansible "systemctl status ansible-post-reboot.service" + # Disable for a month (e.g., during an incident) ssh ansible "systemctl stop ansible-monthly-reboot.timer" ``` +### Deployment (one-time setup on LXC 304) + +```bash +# Copy playbooks +scp ansible/playbooks/monthly-reboot.yml ansible:/opt/ansible/playbooks/ +scp ansible/playbooks/post-reboot-startup.yml ansible:/opt/ansible/playbooks/ + +# Copy and enable systemd units +scp ansible/systemd/ansible-monthly-reboot.timer ansible:/etc/systemd/system/ +scp ansible/systemd/ansible-monthly-reboot.service ansible:/etc/systemd/system/ +scp ansible/systemd/ansible-post-reboot.service ansible:/etc/systemd/system/ +ssh 
ansible "sudo systemctl daemon-reload && \ + sudo systemctl enable --now ansible-monthly-reboot.timer && \ + sudo systemctl enable ansible-post-reboot.service" + +# Verify SSH key access from LXC 304 to docker-home (needed for Pi-hole restart) +ssh ansible "ssh -o BatchMode=yes docker-home 'echo ok'" +``` + ## Rollback If a guest fails to start after reboot: @@ -202,6 +237,7 @@ If a guest fails to start after reboot: 2. Review guest logs: `ssh proxmox "journalctl -u pve-guests -n 50"` 3. Manual start: `ssh proxmox "pvesh create /nodes/proxmox/qemu//status/start"` 4. If guest is corrupted, restore from the pre-reboot Proxmox snapshot +5. If post-reboot startup failed entirely, run manually: `ssh ansible "ansible-playbook /opt/ansible/playbooks/post-reboot-startup.yml"` ## Related Documentation From 95bae333090c1033e45d8393c4c82661460d8108 Mon Sep 17 00:00:00 2001 From: Cal Corum Date: Sat, 4 Apr 2026 06:07:57 -0500 Subject: [PATCH 3/6] feat: add weekly Proxmox backup verification and CT 302 self-health check (#27) Closes #27 - proxmox-backup-check.sh: SSHes to Proxmox, queries pvesh task history, classifies each running VM/CT as green/yellow/red by backup recency, posts a Discord embed summary. Designed for weekly cron on CT 302. - ct302-self-health.sh: Checks disk usage on CT 302 itself, silently exits when healthy, posts a Discord alert when any filesystem exceeds 80% threshold. Closes the blind spot where the monitoring system cannot monitor itself externally. - Updated monitoring/scripts/CONTEXT.md with full operational docs, install instructions, and cron schedules for both new scripts. 
Co-Authored-By: Claude Sonnet 4.6 --- monitoring/scripts/CONTEXT.md | 86 +++++++- monitoring/scripts/ct302-self-health.sh | 158 ++++++++++++++ monitoring/scripts/proxmox-backup-check.sh | 230 +++++++++++++++++++++ 3 files changed, 472 insertions(+), 2 deletions(-) create mode 100644 monitoring/scripts/ct302-self-health.sh create mode 100644 monitoring/scripts/proxmox-backup-check.sh diff --git a/monitoring/scripts/CONTEXT.md b/monitoring/scripts/CONTEXT.md index 2990a3f..4f06f76 100644 --- a/monitoring/scripts/CONTEXT.md +++ b/monitoring/scripts/CONTEXT.md @@ -1,9 +1,9 @@ --- title: "Monitoring Scripts Context" -description: "Operational context for all monitoring scripts: Jellyfin GPU health monitor, NVIDIA driver update checker, Tdarr API/file monitors, and Windows reboot detection. Includes cron schedules, Discord integration patterns, and troubleshooting." +description: "Operational context for all monitoring scripts: Proxmox backup checker, CT 302 self-health, Jellyfin GPU health monitor, NVIDIA driver update checker, Tdarr API/file monitors, and Windows reboot detection. Includes cron schedules, Discord integration patterns, and troubleshooting." type: context domain: monitoring -tags: [jellyfin, gpu, nvidia, tdarr, discord, cron, python, windows, scripts] +tags: [proxmox, backup, jellyfin, gpu, nvidia, tdarr, discord, cron, python, bash, windows, scripts] --- # Monitoring Scripts - Operational Context @@ -13,6 +13,77 @@ This directory contains active operational scripts for system monitoring, health ## Core Monitoring Scripts +### Proxmox Backup Verification +**Script**: `proxmox-backup-check.sh` +**Purpose**: Weekly check that every running VM/CT has a successful vzdump backup within 7 days. Posts a color-coded Discord embed with per-guest status. 
+ +**Key Features**: +- SSHes to Proxmox host and queries `pvesh` task history + guest lists via API +- Categorizes each guest: 🟢 green (backed up), 🟡 yellow (overdue), 🔴 red (no backup) +- Sorts output by VMID; only posts to Discord — no local side effects +- `--dry-run` mode prints the Discord payload without sending +- `--days N` overrides the default 7-day window + +**Schedule**: Weekly on Monday 08:00 UTC (CT 302 cron) +```bash +0 8 * * 1 DISCORD_WEBHOOK="" /root/scripts/proxmox-backup-check.sh >> /var/log/proxmox-backup-check.log 2>&1 +``` + +**Usage**: +```bash +# Dry run (no Discord) +proxmox-backup-check.sh --dry-run + +# Post to Discord +DISCORD_WEBHOOK="https://discord.com/api/webhooks/..." proxmox-backup-check.sh + +# Custom window +proxmox-backup-check.sh --days 14 --discord-webhook "https://..." +``` + +**Dependencies**: `jq`, `curl`, SSH access to Proxmox host alias `proxmox` + +**Install on CT 302**: +```bash +cp proxmox-backup-check.sh /root/scripts/ +chmod +x /root/scripts/proxmox-backup-check.sh +``` + +### CT 302 Self-Health Monitor +**Script**: `ct302-self-health.sh` +**Purpose**: Monitors disk usage on CT 302 (claude-runner) itself. Alerts to Discord when any filesystem exceeds the threshold (default 80%). Runs silently when healthy — no Discord spam on green. + +**Key Features**: +- Checks all non-virtual filesystems (`df`, excludes tmpfs/devtmpfs/overlay) +- Only sends a Discord alert when a filesystem is at or above threshold +- `--always-post` flag forces a post even when healthy (useful for testing) +- `--dry-run` mode prints payload without sending + +**Schedule**: Daily at 07:00 UTC (CT 302 cron) +```bash +0 7 * * * DISCORD_WEBHOOK="" /root/scripts/ct302-self-health.sh >> /var/log/ct302-self-health.log 2>&1 +``` + +**Usage**: +```bash +# Check and alert if over 80% +DISCORD_WEBHOOK="https://discord.com/api/webhooks/..." 
ct302-self-health.sh + +# Lower threshold test +ct302-self-health.sh --threshold 50 --dry-run + +# Always post (weekly status report pattern) +ct302-self-health.sh --always-post --discord-webhook "https://..." +``` + +**Dependencies**: `jq`, `curl`, `df` + +**Install on CT 302**: +```bash +cp ct302-self-health.sh /root/scripts/ +chmod +x /root/scripts/ct302-self-health.sh +``` + ### Jellyfin GPU Health Monitor **Script**: `jellyfin_gpu_monitor.py` **Purpose**: Monitor Jellyfin container GPU access with Discord alerts and auto-restart capability @@ -235,6 +306,17 @@ python3 tdarr_file_monitor.py >> /mnt/NV2/Development/claude-home/logs/tdarr-fil 0 9 * * 1 /usr/bin/python3 /home/cal/scripts/nvidia_update_checker.py --check --discord-alerts >> /home/cal/logs/nvidia-update-checker.log 2>&1 ``` +**Active Cron Jobs** (on CT 302 / claude-runner, root user): +```bash +# Proxmox backup verification - Weekly (Mondays at 8 AM UTC) +0 8 * * 1 DISCORD_WEBHOOK="" /root/scripts/proxmox-backup-check.sh >> /var/log/proxmox-backup-check.log 2>&1 + +# CT 302 self-health disk check - Daily at 7 AM UTC (alerts only when >80%) +0 7 * * * DISCORD_WEBHOOK="" /root/scripts/ct302-self-health.sh >> /var/log/ct302-self-health.log 2>&1 +``` + +**Note**: Scripts must be installed manually on CT 302. Source of truth is `monitoring/scripts/` in this repo — copy to `/root/scripts/` on CT 302 to deploy. + **Manual/On-Demand**: - `tdarr_monitor.py` - Run as needed for Tdarr health checks - `tdarr_file_monitor.py` - Can be scheduled if automatic backup needed diff --git a/monitoring/scripts/ct302-self-health.sh b/monitoring/scripts/ct302-self-health.sh new file mode 100644 index 0000000..e2adc2d --- /dev/null +++ b/monitoring/scripts/ct302-self-health.sh @@ -0,0 +1,158 @@ +#!/usr/bin/env bash +# ct302-self-health.sh — CT 302 (claude-runner) disk self-check → Discord +# +# Monitors disk usage on CT 302 itself and alerts to Discord when any +# filesystem exceeds the threshold. 
Closes the blind spot where the +# monitoring system cannot monitor itself via external health checks. +# +# Designed to run silently when healthy (no Discord spam on green). +# Only posts when a filesystem is at or above THRESHOLD. +# +# Usage: +# ct302-self-health.sh [--discord-webhook URL] [--threshold N] [--dry-run] [--always-post] +# +# Environment overrides: +# DISCORD_WEBHOOK Discord webhook URL (required unless --dry-run) +# DISK_THRESHOLD Disk usage % alert threshold (default: 80) +# +# Install on CT 302 (daily, 07:00 UTC): +# 0 7 * * * /root/scripts/ct302-self-health.sh >> /var/log/ct302-self-health.log 2>&1 + +set -uo pipefail + +DISK_THRESHOLD="${DISK_THRESHOLD:-80}" +DISCORD_WEBHOOK="${DISCORD_WEBHOOK:-}" +DRY_RUN=0 +ALWAYS_POST=0 + +while [[ $# -gt 0 ]]; do + case "$1" in + --discord-webhook) + if [[ $# -lt 2 ]]; then + echo "Error: --discord-webhook requires a value" >&2 + exit 1 + fi + DISCORD_WEBHOOK="$2" + shift 2 + ;; + --threshold) + if [[ $# -lt 2 ]]; then + echo "Error: --threshold requires a value" >&2 + exit 1 + fi + DISK_THRESHOLD="$2" + shift 2 + ;; + --dry-run) + DRY_RUN=1 + shift + ;; + --always-post) + ALWAYS_POST=1 + shift + ;; + *) + echo "Unknown option: $1" >&2 + exit 1 + ;; + esac +done + +if [[ "$DRY_RUN" -eq 0 && -z "$DISCORD_WEBHOOK" ]]; then + echo "Error: DISCORD_WEBHOOK not set. Use --discord-webhook URL or set env var." 
>&2
+  exit 1
+fi
+
+log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*"; }
+
+# ---------------------------------------------------------------------------
+# Check disk usage on all real filesystems
+# ---------------------------------------------------------------------------
+# df output: Filesystem Use% Mounted-on (skipping tmpfs, devtmpfs, overlay)
+TRIGGERED=()
+ALL_FS=()
+
+while IFS= read -r line; do
+  fs=$(echo "$line" | awk '{print $1}')
+  pct=$(echo "$line" | awk '{print $2}' | tr -d '%')
+  mount=$(echo "$line" | awk '{print $3}')
+  ALL_FS+=("${pct}% ${mount} (${fs})")
+  if [[ "$pct" -ge "$DISK_THRESHOLD" ]]; then
+    TRIGGERED+=("${pct}% used — ${mount} (${fs})")
+  fi
+done < <(df -h --output=source,size,used,avail,pcent,target |
+  tail -n +2 |
+  awk '$1 !~ /^(tmpfs|devtmpfs|overlay|udev)/' |
+  awk '{print $1, $5, $6}')
+
+HOSTNAME=$(hostname -s)
+TRIGGERED_COUNT=${#TRIGGERED[@]}
+
+log "Disk check complete: ${TRIGGERED_COUNT} filesystem(s) above ${DISK_THRESHOLD}%"
+
+# Exit cleanly with no Discord post if everything is healthy
+if [[ "$TRIGGERED_COUNT" -eq 0 && "$ALWAYS_POST" -eq 0 && "$DRY_RUN" -eq 0 ]]; then
+  log "All filesystems healthy — no alert needed."
+ exit 0 +fi + +# --------------------------------------------------------------------------- +# Build Discord payload +# --------------------------------------------------------------------------- +if [[ "$TRIGGERED_COUNT" -gt 0 ]]; then + EMBED_COLOR=15548997 # 0xED4245 red + TITLE="🔴 ${HOSTNAME}: Disk usage above ${DISK_THRESHOLD}%" + alert_lines=$(printf '⚠️ %s\n' "${TRIGGERED[@]}") + FIELDS=$(jq -n \ + --arg name "Filesystems Over Threshold" \ + --arg value "$alert_lines" \ + '[{"name": $name, "value": $value, "inline": false}]') +else + EMBED_COLOR=5763719 # 0x57F287 green + TITLE="🟢 ${HOSTNAME}: All filesystems healthy" + FIELDS='[]' +fi + +# Add summary of all filesystems +all_lines=$(printf '%s\n' "${ALL_FS[@]}") +FIELDS=$(echo "$FIELDS" | jq \ + --arg name "All Filesystems" \ + --arg value "$all_lines" \ + '. + [{"name": $name, "value": $value, "inline": false}]') + +FOOTER="$(date -u '+%Y-%m-%d %H:%M UTC') · CT 302 self-health · threshold: ${DISK_THRESHOLD}%" + +PAYLOAD=$(jq -n \ + --arg title "$TITLE" \ + --argjson color "$EMBED_COLOR" \ + --argjson fields "$FIELDS" \ + --arg footer "$FOOTER" \ + '{ + "embeds": [{ + "title": $title, + "color": $color, + "fields": $fields, + "footer": {"text": $footer} + }] + }') + +if [[ "$DRY_RUN" -eq 1 ]]; then + log "DRY RUN — Discord payload:" + echo "$PAYLOAD" | jq . + exit 0 +fi + +log "Posting to Discord..." +HTTP_STATUS=$(curl -s -o /tmp/ct302-self-health-discord.out \ + -w "%{http_code}" \ + -X POST "$DISCORD_WEBHOOK" \ + -H "Content-Type: application/json" \ + -d "$PAYLOAD") + +if [[ "$HTTP_STATUS" -ge 200 && "$HTTP_STATUS" -lt 300 ]]; then + log "Discord notification sent (HTTP ${HTTP_STATUS})." +else + log "Warning: Discord returned HTTP ${HTTP_STATUS}." 
+ cat /tmp/ct302-self-health-discord.out >&2 + exit 1 +fi diff --git a/monitoring/scripts/proxmox-backup-check.sh b/monitoring/scripts/proxmox-backup-check.sh new file mode 100644 index 0000000..fcc1186 --- /dev/null +++ b/monitoring/scripts/proxmox-backup-check.sh @@ -0,0 +1,230 @@ +#!/usr/bin/env bash +# proxmox-backup-check.sh — Weekly Proxmox backup verification → Discord +# +# SSHes to the Proxmox host and checks that every running VM/CT has a +# successful vzdump backup within the last 7 days. Posts a color-coded +# Discord summary with per-guest status. +# +# Usage: +# proxmox-backup-check.sh [--discord-webhook URL] [--days N] [--dry-run] +# +# Environment overrides: +# DISCORD_WEBHOOK Discord webhook URL (required unless --dry-run) +# PROXMOX_NODE Proxmox node name (default: proxmox) +# PROXMOX_SSH SSH alias or host for Proxmox (default: proxmox) +# WINDOW_DAYS Backup recency window in days (default: 7) +# +# Install on CT 302 (weekly, Monday 08:00 UTC): +# 0 8 * * 1 /root/scripts/proxmox-backup-check.sh >> /var/log/proxmox-backup-check.log 2>&1 + +set -uo pipefail + +PROXMOX_NODE="${PROXMOX_NODE:-proxmox}" +PROXMOX_SSH="${PROXMOX_SSH:-proxmox}" +WINDOW_DAYS="${WINDOW_DAYS:-7}" +DISCORD_WEBHOOK="${DISCORD_WEBHOOK:-}" +DRY_RUN=0 + +while [[ $# -gt 0 ]]; do + case "$1" in + --discord-webhook) + if [[ $# -lt 2 ]]; then + echo "Error: --discord-webhook requires a value" >&2 + exit 1 + fi + DISCORD_WEBHOOK="$2" + shift 2 + ;; + --days) + if [[ $# -lt 2 ]]; then + echo "Error: --days requires a value" >&2 + exit 1 + fi + WINDOW_DAYS="$2" + shift 2 + ;; + --dry-run) + DRY_RUN=1 + shift + ;; + *) + echo "Unknown option: $1" >&2 + exit 1 + ;; + esac +done + +if [[ "$DRY_RUN" -eq 0 && -z "$DISCORD_WEBHOOK" ]]; then + echo "Error: DISCORD_WEBHOOK not set. Use --discord-webhook URL or set env var." >&2 + exit 1 +fi + +if ! command -v jq &>/dev/null; then + echo "Error: jq is required but not installed." 
>&2 + exit 1 +fi + +SSH_OPTS="-o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o BatchMode=yes" +CUTOFF=$(date -d "-${WINDOW_DAYS} days" +%s) +NOW=$(date +%s) + +log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*"; } + +# --------------------------------------------------------------------------- +# Fetch data from Proxmox +# --------------------------------------------------------------------------- +log "Fetching VM and CT list from Proxmox node '${PROXMOX_NODE}'..." +VMS_JSON=$(ssh $SSH_OPTS "$PROXMOX_SSH" \ + "pvesh get /nodes/${PROXMOX_NODE}/qemu --output-format json 2>/dev/null" || echo "[]") +CTS_JSON=$(ssh $SSH_OPTS "$PROXMOX_SSH" \ + "pvesh get /nodes/${PROXMOX_NODE}/lxc --output-format json 2>/dev/null" || echo "[]") + +log "Fetching recent vzdump task history (limit 200)..." +TASKS_JSON=$(ssh $SSH_OPTS "$PROXMOX_SSH" \ + "pvesh get /nodes/${PROXMOX_NODE}/tasks --typefilter vzdump --limit 200 --output-format json 2>/dev/null" || echo "[]") + +# --------------------------------------------------------------------------- +# Build per-guest backup status +# --------------------------------------------------------------------------- +# Merge VMs and CTs into one list: [{vmid, name, type}] +GUESTS_JSON=$(jq -n \ + --argjson vms "$VMS_JSON" \ + --argjson cts "$CTS_JSON" ' + ($vms | map(select(.status == "running") | {vmid: (.vmid | tostring), name, type: "VM"})) + + ($cts | map(select(.status == "running") | {vmid: (.vmid | tostring), name, type: "CT"})) + ') + +GUEST_COUNT=$(echo "$GUESTS_JSON" | jq 'length') +log "Found ${GUEST_COUNT} running guests." + +# For each guest, find the most recent successful (status == "OK") vzdump task +RESULTS=$(jq -n \ + --argjson guests "$GUESTS_JSON" \ + --argjson tasks "$TASKS_JSON" \ + --argjson cutoff "$CUTOFF" \ + --argjson now "$NOW" \ + --argjson window "$WINDOW_DAYS" ' + $guests | map( + . 
as $g |
+    ($tasks | map(
+      select(
+        ((.vmid // .id) | tostring) == $g.vmid
+        and .status == "OK"
+      ) | .starttime
+    ) | max // 0) as $last_ts |
+    {
+      vmid: $g.vmid,
+      name: $g.name,
+      type: $g.type,
+      last_backup_ts: $last_ts,
+      age_days: (if $last_ts > 0 then (($now - $last_ts) / 86400 | floor) else -1 end),
+      status: (
+        if $last_ts >= $cutoff then "green"
+        elif $last_ts > 0 then "yellow"
+        else "red"
+        end
+      )
+    }
+  ) | sort_by(.vmid | tonumber)
+')
+
+GREEN_GUESTS=$(echo "$RESULTS" | jq '[.[] | select(.status == "green")]')
+YELLOW_GUESTS=$(echo "$RESULTS" | jq '[.[] | select(.status == "yellow")]')
+RED_GUESTS=$(echo "$RESULTS" | jq '[.[] | select(.status == "red")]')
+
+GREEN_COUNT=$(echo "$GREEN_GUESTS" | jq 'length')
+YELLOW_COUNT=$(echo "$YELLOW_GUESTS" | jq 'length')
+RED_COUNT=$(echo "$RED_GUESTS" | jq 'length')
+
+log "Results: ${GREEN_COUNT} green, ${YELLOW_COUNT} yellow, ${RED_COUNT} red"
+
+# ---------------------------------------------------------------------------
+# Build Discord payload
+# ---------------------------------------------------------------------------
+if [[ "$RED_COUNT" -gt 0 ]]; then
+  EMBED_COLOR=15548997 # 0xED4245 red
+  STATUS_LINE="🔴 Backup issues detected — action required"
+elif [[ "$YELLOW_COUNT" -gt 0 ]]; then
+  EMBED_COLOR=16705372 # 0xFEE75C yellow
+  STATUS_LINE="🟡 Some backups are overdue (>${WINDOW_DAYS}d)"
+else
+  EMBED_COLOR=5763719 # 0x57F287 green
+  STATUS_LINE="🟢 All ${GUEST_COUNT} guests backed up within ${WINDOW_DAYS} days"
+fi
+
+# Format guest lines: "VM 116 (plex) — 2d ago" or "CT 302 (claude-runner) — NO BACKUPS"
+format_guest() {
+  local prefix="$1" guests="$2"
+  echo "$guests" | jq -r '.[] | "\(.type) \(.vmid) (\(.name))"' |
+    while IFS= read -r line; do echo "${prefix} ${line}"; done
+}
+
+format_guest_with_age() {
+  local prefix="$1" guests="$2"
+  echo "$guests" | jq -r '.[] | "\(.type) \(.vmid) (\(.name)) — \(.age_days)d ago"' |
+    while IFS= read -r line; do echo "${prefix} ${line}"; done
+}
+
+# Build fields 
array +fields='[]' + +if [[ "$GREEN_COUNT" -gt 0 ]]; then + green_lines=$(format_guest_with_age "✅" "$GREEN_GUESTS") + fields=$(echo "$fields" | jq \ + --arg name "🟢 Healthy (${GREEN_COUNT})" \ + --arg value "$green_lines" \ + '. + [{"name": $name, "value": $value, "inline": false}]') +fi + +if [[ "$YELLOW_COUNT" -gt 0 ]]; then + yellow_lines=$(format_guest_with_age "⚠️" "$YELLOW_GUESTS") + fields=$(echo "$fields" | jq \ + --arg name "🟡 Overdue — last backup >${WINDOW_DAYS}d ago (${YELLOW_COUNT})" \ + --arg value "$yellow_lines" \ + '. + [{"name": $name, "value": $value, "inline": false}]') +fi + +if [[ "$RED_COUNT" -gt 0 ]]; then + red_lines=$(format_guest "❌" "$RED_GUESTS") + fields=$(echo "$fields" | jq \ + --arg name "🔴 No Successful Backups Found (${RED_COUNT})" \ + --arg value "$red_lines" \ + '. + [{"name": $name, "value": $value, "inline": false}]') +fi + +FOOTER="$(date -u '+%Y-%m-%d %H:%M UTC') · ${GUEST_COUNT} guests · window: ${WINDOW_DAYS}d" + +PAYLOAD=$(jq -n \ + --arg title "Proxmox Backup Check — ${STATUS_LINE}" \ + --argjson color "$EMBED_COLOR" \ + --argjson fields "$fields" \ + --arg footer "$FOOTER" \ + '{ + "embeds": [{ + "title": $title, + "color": $color, + "fields": $fields, + "footer": {"text": $footer} + }] + }') + +if [[ "$DRY_RUN" -eq 1 ]]; then + log "DRY RUN — Discord payload:" + echo "$PAYLOAD" | jq . + exit 0 +fi + +log "Posting to Discord..." +HTTP_STATUS=$(curl -s -o /tmp/proxmox-backup-check-discord.out \ + -w "%{http_code}" \ + -X POST "$DISCORD_WEBHOOK" \ + -H "Content-Type: application/json" \ + -d "$PAYLOAD") + +if [[ "$HTTP_STATUS" -ge 200 && "$HTTP_STATUS" -lt 300 ]]; then + log "Discord notification sent (HTTP ${HTTP_STATUS})." +else + log "Warning: Discord returned HTTP ${HTTP_STATUS}." 
+ cat /tmp/proxmox-backup-check-discord.out >&2 + exit 1 +fi From cacf4a9043c43732d32d1eb73cf464a0be29eeb0 Mon Sep 17 00:00:00 2001 From: Cal Corum Date: Sun, 5 Apr 2026 19:24:59 -0500 Subject: [PATCH 4/6] feat: add weekly Gitea disk cleanup Ansible playbook Gitea LXC 225 hit 100% disk from accumulated Docker buildx volumes, repo-archive cache, and journal logs. Adds automated weekly cleanup managed by systemd timer on the Ansible controller (Wed 04:00 UTC). Co-Authored-By: Claude Opus 4.6 (1M context) --- ansible/playbooks/gitea-cleanup.yml | 80 +++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) create mode 100644 ansible/playbooks/gitea-cleanup.yml diff --git a/ansible/playbooks/gitea-cleanup.yml b/ansible/playbooks/gitea-cleanup.yml new file mode 100644 index 0000000..83157c4 --- /dev/null +++ b/ansible/playbooks/gitea-cleanup.yml @@ -0,0 +1,80 @@ +--- +# gitea-cleanup.yml — Weekly cleanup of Gitea server disk space +# +# Removes stale Docker buildx volumes, unused images, Gitea repo-archive +# cache, and vacuums journal logs to prevent disk exhaustion on LXC 225. 
+# +# Schedule: Weekly via systemd timer on LXC 304 (ansible-controller) +# +# Usage: +# ansible-playbook /opt/ansible/playbooks/gitea-cleanup.yml # full run +# ansible-playbook /opt/ansible/playbooks/gitea-cleanup.yml --check # dry run + +- name: Gitea server disk cleanup + hosts: gitea + gather_facts: false + + tasks: + - name: Check current disk usage + ansible.builtin.shell: df --output=pcent / | tail -1 + register: disk_before + changed_when: false + + - name: Display current disk usage + ansible.builtin.debug: + msg: "Disk usage before cleanup: {{ disk_before.stdout | trim }}" + + - name: Clear Gitea repo-archive cache + ansible.builtin.find: + paths: /var/lib/gitea/data/repo-archive + file_type: any + register: repo_archive_files + + - name: Remove repo-archive files + ansible.builtin.file: + path: "{{ item.path }}" + state: absent + loop: "{{ repo_archive_files.files }}" + loop_control: + label: "{{ item.path | basename }}" + when: repo_archive_files.files | length > 0 + + - name: Remove orphaned Docker buildx volumes + ansible.builtin.shell: | + volumes=$(docker volume ls -q --filter name=buildx_buildkit) + if [ -n "$volumes" ]; then + echo "$volumes" | xargs docker volume rm 2>&1 + else + echo "No buildx volumes to remove" + fi + register: buildx_cleanup + changed_when: "'No buildx volumes' not in buildx_cleanup.stdout" + + - name: Prune unused Docker images + ansible.builtin.command: docker image prune -af + register: image_prune + changed_when: "'Total reclaimed space: 0B' not in image_prune.stdout" + + - name: Prune unused Docker volumes + ansible.builtin.command: docker volume prune -f + register: volume_prune + changed_when: "'Total reclaimed space: 0B' not in volume_prune.stdout" + + - name: Vacuum journal logs to 500M + ansible.builtin.command: journalctl --vacuum-size=500M + register: journal_vacuum + changed_when: "'freed 0B' not in journal_vacuum.stderr" + + - name: Check disk usage after cleanup + ansible.builtin.shell: df --output=pcent / | 
tail -1 + register: disk_after + changed_when: false + + - name: Display cleanup summary + ansible.builtin.debug: + msg: >- + Cleanup complete. + Disk: {{ disk_before.stdout | default('N/A') | trim }} → {{ disk_after.stdout | default('N/A') | trim }}. + Buildx: {{ (buildx_cleanup.stdout_lines | default(['N/A'])) | last }}. + Images: {{ (image_prune.stdout_lines | default(['N/A'])) | last }}. + Journal: {{ (journal_vacuum.stderr_lines | default(['N/A'])) | last }}. From acb8fef0843dd79201e94aae37f96a7ee43b1724 Mon Sep 17 00:00:00 2001 From: Cal Corum Date: Mon, 6 Apr 2026 00:00:03 -0500 Subject: [PATCH 5/6] =?UTF-8?q?docs:=20sync=20KB=20=E2=80=94=20database-de?= =?UTF-8?q?ployment-guide.md,refractor-in-app-test-plan.md?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- paper-dynasty/database-deployment-guide.md | 6 +- paper-dynasty/refractor-in-app-test-plan.md | 107 ++++++++++++++++++++ 2 files changed, 111 insertions(+), 2 deletions(-) create mode 100644 paper-dynasty/refractor-in-app-test-plan.md diff --git a/paper-dynasty/database-deployment-guide.md b/paper-dynasty/database-deployment-guide.md index bfb407c..014014f 100644 --- a/paper-dynasty/database-deployment-guide.md +++ b/paper-dynasty/database-deployment-guide.md @@ -178,7 +178,7 @@ When merging many PRs at once (e.g., batch pagination PRs), branch protection ru | `LOG_LEVEL` | Logging verbosity (default: INFO) | | `DATABASE_TYPE` | `postgresql` | | `POSTGRES_HOST` | Container name of PostgreSQL | -| `POSTGRES_DB` | Database name (`pd_master`) | +| `POSTGRES_DB` | Database name — `pd_master` (prod) / `paperdynasty_dev` (dev) | | `POSTGRES_USER` | DB username | | `POSTGRES_PASSWORD` | DB password | @@ -189,4 +189,6 @@ When merging many PRs at once (e.g., batch pagination PRs), branch protection ru | Database API (prod) | `ssh akamai` | `pd_api` | 815 | | Database API (dev) | `ssh pd-database` | `dev_pd_database` | 813 | | PostgreSQL (prod) | `ssh akamai` | 
`pd_postgres` | 5432 | -| PostgreSQL (dev) | `ssh pd-database` | `pd_postgres` | 5432 | +| PostgreSQL (dev) | `ssh pd-database` | `sba_postgres` | 5432 | + +**Dev database credentials:** container `sba_postgres`, database `paperdynasty_dev`, user `sba_admin`. Prod uses `pd_postgres`, database `pd_master`. diff --git a/paper-dynasty/refractor-in-app-test-plan.md b/paper-dynasty/refractor-in-app-test-plan.md new file mode 100644 index 0000000..1f250c7 --- /dev/null +++ b/paper-dynasty/refractor-in-app-test-plan.md @@ -0,0 +1,107 @@ +--- +title: "Refractor In-App Test Plan" +description: "Comprehensive manual test plan for the Refractor card evolution system — covers /refractor status, tier badges, post-game hooks, tier-up notifications, card art tiers, and known issues." +type: guide +domain: paper-dynasty +tags: [paper-dynasty, testing, refractor, discord, database] +--- + +# Refractor In-App Test Plan + +Manual test plan for the Refractor (card evolution) system. All testing targets **dev** environment (`pddev.manticorum.com` / dev Discord bot). 
+ +## Prerequisites + +- Dev bot running on `sba-bots` +- Dev API at `pddev.manticorum.com` (port 813) +- Team with seeded refractor data (team 31 from prior session) +- At least one game playable to trigger post-game hooks + +--- + +## REF-10: `/refractor status` — Basic Display + +| # | Test | Steps | Expected | +|---|---|---|---| +| 10 | No filters | `/refractor status` | Ephemeral embed with team branding, tier summary line, 10 cards sorted by tier DESC, pagination buttons if >10 cards | +| 11 | Card type filter | `/refractor status card_type:Batter` | Only batter cards shown, count matches | +| 12 | Tier filter | `/refractor status tier:T2—Refractor` | Only T2 cards, embed color changes to tier color | +| 13 | Progress filter | `/refractor status progress:Close to next tier` | Only cards >=80% to next threshold, fully evolved excluded | +| 14 | Combined filters | `/refractor status card_type:Batter tier:T1—Base Chrome` | Intersection of both filters | +| 15 | Empty result | `/refractor status tier:T4—Superfractor` (if none exist) | "No cards match your filters..." 
message with filter details | + +## REF-20: `/refractor status` — Pagination + +| # | Test | Steps | Expected | +|---|---|---|---| +| 20 | Page buttons appear | `/refractor status` with >10 cards | Prev/Next buttons visible | +| 21 | Next page | Click `Next >` | Page 2 shown, footer updates to "Page 2/N" | +| 22 | Prev page | From page 2, click `< Prev` | Back to page 1 | +| 23 | First page prev | On page 1, click `< Prev` | Nothing happens / stays on page 1 | +| 24 | Last page next | On last page, click `Next >` | Nothing happens / stays on last page | +| 25 | Button timeout | Wait 120s after command | Buttons become unresponsive | +| 26 | Wrong user clicks | Another user clicks buttons | Silently ignored | + +## REF-30: Tier Badges in Card Embeds + +| # | Test | Steps | Expected | +|---|---|---|---| +| 30 | T0 card display | View a T0 card via `/myteam` or `/roster` | No badge prefix, just player name | +| 31 | T1 badge | View a T1 card | Title shows `[BC] Player Name` | +| 32 | T2 badge | View a T2 card | Title shows `[R] Player Name` | +| 33 | T3 badge | View a T3 card | Title shows `[GR] Player Name` | +| 34 | T4 badge | View a T4 card (if exists) | Title shows `[SF] Player Name` | +| 35 | Badge in pack open | Open a pack with an evolved card | Badge appears in pack embed | +| 36 | API down gracefully | (hard to test) | Card displays normally with no badge, no error | + +## REF-50: Post-Game Hook & Tier-Up Notifications + +| # | Test | Steps | Expected | +|---|---|---|---| +| 50 | Game completes normally | Play a full game | No errors in bot logs; refractor evaluate-game fires after season-stats update | +| 51 | Tier-up notification | Play game where a card crosses a threshold | Embed in game channel: "Refractor Tier Up!", player name, tier name, correct color | +| 52 | No tier-up | Play game where no thresholds crossed | No refractor embed posted, game completes normally | +| 53 | Multiple tier-ups | Game where 2+ players tier up | One embed per tier-up, all 
posted | +| 54 | Auto-init new card | Play game with a card that has no RefractorCardState | State created automatically, player evaluated, no error | +| 55 | Superfractor notification | (may need forced data) | "SUPERFRACTOR!" title, teal color | + +## REF-60: Card Art with Tiers (API-level) + +| # | Test | Steps | Expected | +|---|---|---|---| +| 60 | T0 card image | `GET /api/v2/players/{id}/card-image?card_type=batting` | Base card, no tier styling | +| 61 | Tier override | `GET ...?card_type=batting&tier=2` | Refractor styling visible (border, diamond indicator) | +| 62 | Each tier visual | `?tier=1` through `?tier=4` | Correct border colors, diamond fill, header gradients per tier | +| 63 | Pitcher card | `?card_type=pitching&tier=2` | Tier styling applies correctly to pitcher layout | + +## REF-70: Known Issues to Verify + +| # | Issue | Check | Status | +|---|---|---|---| +| 70 | Superfractor embed says "Rating boosts coming in a future update!" | Verify — boosts ARE implemented now, text is stale | **Fix needed** | +| 71 | `on_timeout` doesn't edit message | Buttons stay visually active after 120s | **Known, low priority** | +| 72 | Card embed perf (1 API call per card) | Note latency on roster views with 10+ cards | **Monitor** | +| 73 | Season-stats failure kills refractor eval | Both in same try/except | **Known risk, verify logging** | + +--- + +## API Endpoints Under Test + +| Method | Endpoint | Used By | +|---|---|---| +| GET | `/api/v2/refractor/tracks` | Track listing | +| GET | `/api/v2/refractor/cards?team_id=X` | `/refractor status` command | +| GET | `/api/v2/refractor/cards/{card_id}` | Tier badge in card embeds | +| POST | `/api/v2/refractor/cards/{card_id}/evaluate` | Force re-evaluation | +| POST | `/api/v2/refractor/evaluate-game/{game_id}` | Post-game hook | +| GET | `/api/v2/teams/{team_id}/refractors` | Teams alias endpoint | +| GET | `/api/v2/players/{id}/card-image?tier=N` | Card art tier preview | + +## Notification Embed Colors + 
+| Tier | Name | Color | +|---|---|---| +| T1 | Base Chrome | Green (0x2ECC71) | +| T2 | Refractor | Gold (0xF1C40F) | +| T3 | Gold Refractor | Purple (0x9B59B6) | +| T4 | Superfractor | Teal (0x1ABC9C) | From dd7c68c13a8a4a242b20fc20ef01ba0c4a6218b0 Mon Sep 17 00:00:00 2001 From: Cal Corum Date: Mon, 6 Apr 2026 02:00:38 -0500 Subject: [PATCH 6/6] =?UTF-8?q?docs:=20sync=20KB=20=E2=80=94=20discord-bro?= =?UTF-8?q?wser-testing-workflow.md?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../discord-browser-testing-workflow.md | 170 ++++++++++++++++++ 1 file changed, 170 insertions(+) create mode 100644 paper-dynasty/discord-browser-testing-workflow.md diff --git a/paper-dynasty/discord-browser-testing-workflow.md b/paper-dynasty/discord-browser-testing-workflow.md new file mode 100644 index 0000000..cff469f --- /dev/null +++ b/paper-dynasty/discord-browser-testing-workflow.md @@ -0,0 +1,170 @@ +--- +title: "Discord Bot Browser Testing via Playwright + CDP" +description: "Step-by-step workflow for automated Discord bot testing using Playwright connected to Brave browser via Chrome DevTools Protocol. Covers setup, slash command execution, and screenshot capture." +type: runbook +domain: paper-dynasty +tags: [paper-dynasty, discord, testing, playwright, automation] +--- + +# Discord Bot Browser Testing via Playwright + CDP + +Automated testing of Paper Dynasty Discord bot commands by connecting Playwright to a running Brave browser instance with Discord open. + +## Prerequisites + +- Brave browser installed (`brave-browser-stable`) +- Playwright installed (`pip install playwright && playwright install chromium`) +- Discord logged in via browser (not desktop app) +- Discord bot running (locally via docker-compose or on remote host) +- Bot's `API_TOKEN` must match the target API environment + +## Setup + +### 1. Launch Brave with CDP enabled + +Brave must be started with `--remote-debugging-port`. 
If Brave is already running, **kill it first** — otherwise the flag is ignored and the new process merges into the existing one. + +```bash +killall brave && sleep 2 && brave-browser-stable --remote-debugging-port=9222 & +``` + +### 2. Verify CDP is responding + +```bash +curl -s http://localhost:9222/json/version | python3 -m json.tool +``` + +Should return JSON with `Browser`, `webSocketDebuggerUrl`, etc. + +### 3. Open Discord in browser + +Navigate to `https://discord.com/channels//` in Brave. + +**Paper Dynasty test server:** +- Server: Cals Test Server (`669356687294988350`) +- Channel: #pd-game-test (`982850262903451658`) +- URL: `https://discord.com/channels/669356687294988350/982850262903451658` + +### 4. Verify bot is running with correct API token + +```bash +# Check docker-compose.yml has the right API_TOKEN for the target environment +grep API_TOKEN /mnt/NV2/Development/paper-dynasty/discord-app/docker-compose.yml + +# Dev API token lives on the dev host: +ssh pd-database "docker exec sba_postgres psql -U sba_admin -d paperdynasty_dev -c \"SELECT 1;\"" + +# Restart bot if token was changed: +cd /mnt/NV2/Development/paper-dynasty/discord-app && docker compose up -d +``` + +## Running Commands + +### Find the Discord tab + +```python +from playwright.sync_api import sync_playwright +import time + +with sync_playwright() as p: + browser = p.chromium.connect_over_cdp('http://localhost:9222') + for ctx in browser.contexts: + for page in ctx.pages: + if 'discord' in page.url.lower(): + print(f'Found: {page.url}') + break + browser.close() +``` + +### Execute a slash command and capture result + +```python +from playwright.sync_api import sync_playwright +import time + +def run_slash_command(command: str, wait_seconds: int = 5, screenshot_path: str = '/tmp/discord_result.png'): + """ + Type a slash command in Discord, select the top autocomplete option, + submit it, wait for the bot response, and take a screenshot. 
+ """ + with sync_playwright() as p: + browser = p.chromium.connect_over_cdp('http://localhost:9222') + for ctx in browser.contexts: + for page in ctx.pages: + if 'discord' in page.url.lower(): + msg_box = page.locator('[role="textbox"][data-slate-editor="true"]') + msg_box.click() + time.sleep(0.3) + + # Type the command (delay simulates human typing for autocomplete) + msg_box.type(command, delay=80) + time.sleep(2) + + # Tab selects the top autocomplete option + page.keyboard.press('Tab') + time.sleep(1) + + # Enter submits the command + page.keyboard.press('Enter') + time.sleep(wait_seconds) + + page.screenshot(path=screenshot_path) + print(f'Screenshot saved to {screenshot_path}') + break + browser.close() + +# Example usage: +run_slash_command('/refractor status') +``` + +### Commands with parameters + +After pressing Tab to select the command, Discord shows an options panel. To fill parameters: + +1. The first parameter input is auto-focused after Tab +2. Type the value, then Tab to move to the next parameter +3. Press Enter when ready to submit + +```python +# Example: /refractor status with tier filter +msg_box.type('/refractor status', delay=80) +time.sleep(2) +page.keyboard.press('Tab') # Select command from autocomplete +time.sleep(1) +# Now fill parameters if needed, or just submit +page.keyboard.press('Enter') +``` + +## Key Selectors + +| Element | Selector | +|---------|----------| +| Message input box | `[role="textbox"][data-slate-editor="true"]` | +| Autocomplete popup | `[class*="autocomplete"]` | + +## Gotchas + +- **Brave must be killed before relaunch** — if an instance is already running, `--remote-debugging-port` is silently ignored +- **Bot token mismatch** — the bot's `API_TOKEN` in `docker-compose.yml` must match the target API (dev or prod). Symptoms: `{"detail":"Unauthorized"}` in bot logs +- **Viewport is None** — when connecting via CDP, `page.viewport_size` returns None. 
Use `page.evaluate('() => ({w: window.innerWidth, h: window.innerHeight})')` instead
+- **Autocomplete timing** — typing too fast may not trigger Discord's autocomplete. The `delay=80` on `msg_box.type()` simulates human speed
+- **Multiple bots** — if multiple bots register the same slash command (e.g. MantiTestBot and PucklTestBot), Tab selects the top option. Verify the correct bot name in the autocomplete popup before proceeding
+
+## Test Plan Reference
+
+The Refractor integration test plan is at:
+`discord-app/tests/refractor-integration-test-plan.md`
+
+Key test case groups:
+- REF-01 to REF-06: Tier badges and display
+- REF-10 to REF-15: Progress bars and filtering
+- REF-40 to REF-42: Cross-command badges (card, roster)
+- REF-70 to REF-73: Known issues to verify (the current priority)
+
+## Verified On
+
+- **Date:** 2026-04-06
+- **Browser:** Brave 146.0.7680.178 (Chromium-based)
+- **Playwright:** Node.js driver via Python sync API
+- **Bot:** MantiTestBot on Cals Test Server, #pd-game-test channel
+- **API:** pddev.manticorum.com (dev environment)