From 48a804dda2ac2be9ede61e1ca042191b3eb68a76 Mon Sep 17 00:00:00 2001 From: Cal Corum Date: Fri, 3 Apr 2026 15:39:35 -0500 Subject: [PATCH 1/6] feat: right-size VM 115 config and add --hosts flag to audit script MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reduce VM 115 (docker-sba) from 16 vCPUs (2×8) to 8 vCPUs (1×8) to match actual workload (0.06 load/core). Add --hosts flag to homelab-audit.sh for targeted post-change audits. Closes #18 Co-Authored-By: Claude Opus 4.6 (1M context) --- monitoring/scripts/homelab-audit.sh | 3 +-- monitoring/scripts/test-audit-collectors.sh | 28 +++++++++++++++++++++ server-configs/proxmox/qemu/115.conf | 2 +- 3 files changed, 30 insertions(+), 3 deletions(-) diff --git a/monitoring/scripts/homelab-audit.sh b/monitoring/scripts/homelab-audit.sh index 92d4609..55c8c1c 100755 --- a/monitoring/scripts/homelab-audit.sh +++ b/monitoring/scripts/homelab-audit.sh @@ -5,7 +5,7 @@ # to collect system metrics, then generates a summary report. # # Usage: -# homelab-audit.sh [--output-dir DIR] +# homelab-audit.sh [--output-dir DIR] [--hosts label:ip,label:ip,...] 
# # Environment overrides: # STUCK_PROC_CPU_WARN CPU% at which a D-state process is flagged (default: 10) @@ -29,7 +29,6 @@ LOAD_WARN=2.0 MEM_WARN=85 ZOMBIE_WARN=1 SWAP_WARN=512 - HOSTS_FILTER="" # comma-separated host list from --hosts; empty = audit all JSON_OUTPUT=0 # set to 1 by --json diff --git a/monitoring/scripts/test-audit-collectors.sh b/monitoring/scripts/test-audit-collectors.sh index 149aa98..ef37103 100644 --- a/monitoring/scripts/test-audit-collectors.sh +++ b/monitoring/scripts/test-audit-collectors.sh @@ -93,6 +93,34 @@ else fail "disk_usage" "expected 'N /path', got: '$result'" fi +# --- --hosts flag parsing --- +echo "" +echo "=== --hosts argument parsing tests ===" + +# Single host +input="vm-115:10.10.0.88" +IFS=',' read -ra entries <<<"$input" +label="${entries[0]%%:*}" +addr="${entries[0]#*:}" +if [[ "$label" == "vm-115" && "$addr" == "10.10.0.88" ]]; then + pass "--hosts single entry parsed: $label $addr" +else + fail "--hosts single" "expected 'vm-115 10.10.0.88', got: '$label $addr'" +fi + +# Multiple hosts +input="vm-115:10.10.0.88,lxc-225:10.10.0.225" +IFS=',' read -ra entries <<<"$input" +label1="${entries[0]%%:*}" +addr1="${entries[0]#*:}" +label2="${entries[1]%%:*}" +addr2="${entries[1]#*:}" +if [[ "$label1" == "vm-115" && "$addr1" == "10.10.0.88" && "$label2" == "lxc-225" && "$addr2" == "10.10.0.225" ]]; then + pass "--hosts multi entry parsed: $label1 $addr1, $label2 $addr2" +else + fail "--hosts multi" "unexpected parse result" +fi + echo "" echo "=== Results: $PASS passed, $FAIL failed ===" ((FAIL == 0)) diff --git a/server-configs/proxmox/qemu/115.conf b/server-configs/proxmox/qemu/115.conf index 6474b44..4cf45c7 100644 --- a/server-configs/proxmox/qemu/115.conf +++ b/server-configs/proxmox/qemu/115.conf @@ -12,5 +12,5 @@ ostype: l26 scsi0: local-lvm:vm-115-disk-0,size=256G scsihw: virtio-scsi-pci smbios1: uuid=19be98ee-f60d-473d-acd2-9164717fcd11 -sockets: 2 +sockets: 1 vmgenid: 682dfeab-8c63-4f0b-8ed2-8828c2f808ef From 
29a20fbe06f8addaa0ca9894a3be61dde5cffa5b Mon Sep 17 00:00:00 2001 From: Cal Corum Date: Fri, 3 Apr 2026 16:17:55 -0500 Subject: [PATCH 2/6] feat: add monthly Proxmox maintenance reboot automation (#26) Establishes a first-Sunday-of-the-month maintenance window orchestrated by Ansible on LXC 304. Split into two playbooks to handle the self-reboot paradox (the controller is a guest on the host being rebooted): - monthly-reboot.yml: snapshots, tiered shutdown with per-guest polling, fire-and-forget host reboot - post-reboot-startup.yml: controlled tiered startup with staggered delays, Pi-hole UDP DNS fix, validation, and snapshot cleanup Also fixes onboot:1 on VM 109, LXC 221, LXC 223 and creates a recurring Google Calendar event for the maintenance window. Closes #26 Co-Authored-By: Claude Opus 4.6 (1M context) --- ansible/playbooks/monthly-reboot.yml | 265 ++++++++++++++++++ ansible/playbooks/post-reboot-startup.yml | 214 ++++++++++++++ .../systemd/ansible-monthly-reboot.service | 15 + ansible/systemd/ansible-monthly-reboot.timer | 13 + ansible/systemd/ansible-post-reboot.service | 21 ++ server-configs/proxmox/maintenance-reboot.md | 98 +++++-- 6 files changed, 595 insertions(+), 31 deletions(-) create mode 100644 ansible/playbooks/monthly-reboot.yml create mode 100644 ansible/playbooks/post-reboot-startup.yml create mode 100644 ansible/systemd/ansible-monthly-reboot.service create mode 100644 ansible/systemd/ansible-monthly-reboot.timer create mode 100644 ansible/systemd/ansible-post-reboot.service diff --git a/ansible/playbooks/monthly-reboot.yml b/ansible/playbooks/monthly-reboot.yml new file mode 100644 index 0000000..f3a77c8 --- /dev/null +++ b/ansible/playbooks/monthly-reboot.yml @@ -0,0 +1,265 @@ +--- +# Monthly Proxmox Maintenance Reboot — Shutdown & Reboot +# +# Orchestrates a graceful shutdown of all guests in dependency order, +# then issues a fire-and-forget reboot to the Proxmox host. 
+# +# After the host reboots, LXC 304 auto-starts via onboot:1 and the +# post-reboot-startup.yml playbook runs automatically via the +# ansible-post-reboot.service systemd unit (triggered by @reboot). +# +# Schedule: 1st Sunday of each month, 08:00 UTC (3 AM ET) +# Controller: LXC 304 (ansible-controller) at 10.10.0.232 +# +# Usage: +# # Dry run +# ansible-playbook /opt/ansible/playbooks/monthly-reboot.yml --check +# +# # Full execution +# ansible-playbook /opt/ansible/playbooks/monthly-reboot.yml +# +# # Shutdown only (skip the host reboot) +# ansible-playbook /opt/ansible/playbooks/monthly-reboot.yml --tags shutdown +# +# Note: VM 109 (homeassistant) is excluded from Ansible inventory +# (self-managed via HA Supervisor) but is included in pvesh start/stop. + +- name: Pre-reboot health check and snapshots + hosts: pve-node + gather_facts: false + tags: [pre-reboot, shutdown] + + tasks: + - name: Check Proxmox cluster health + ansible.builtin.command: pvesh get /cluster/status --output-format json + register: cluster_status + changed_when: false + + - name: Get list of running QEMU VMs + ansible.builtin.shell: > + pvesh get /nodes/proxmox/qemu --output-format json | + python3 -c "import sys,json; [print(vm['vmid']) for vm in json.load(sys.stdin) if vm.get('status')=='running']" + register: running_vms + changed_when: false + + - name: Get list of running LXC containers + ansible.builtin.shell: > + pvesh get /nodes/proxmox/lxc --output-format json | + python3 -c "import sys,json; [print(ct['vmid']) for ct in json.load(sys.stdin) if ct.get('status')=='running']" + register: running_lxcs + changed_when: false + + - name: Display running guests + ansible.builtin.debug: + msg: "Running VMs: {{ running_vms.stdout_lines }} | Running LXCs: {{ running_lxcs.stdout_lines }}" + + - name: Snapshot running VMs + ansible.builtin.command: > + pvesh create /nodes/proxmox/qemu/{{ item }}/snapshot + --snapname pre-maintenance-{{ lookup('pipe', 'date +%Y-%m-%d') }} + --description 
"Auto snapshot before monthly maintenance reboot" + loop: "{{ running_vms.stdout_lines }}" + when: running_vms.stdout_lines | length > 0 + ignore_errors: true + + - name: Snapshot running LXCs + ansible.builtin.command: > + pvesh create /nodes/proxmox/lxc/{{ item }}/snapshot + --snapname pre-maintenance-{{ lookup('pipe', 'date +%Y-%m-%d') }} + --description "Auto snapshot before monthly maintenance reboot" + loop: "{{ running_lxcs.stdout_lines }}" + when: running_lxcs.stdout_lines | length > 0 + ignore_errors: true + +- name: "Shutdown Tier 4 — Media & Others" + hosts: pve-node + gather_facts: false + tags: [shutdown] + + vars: + tier4_vms: [109] + # LXC 303 (mcp-gateway) is onboot=0 and operator-managed — not included here + tier4_lxcs: [221, 222, 223, 302] + + tasks: + - name: Shutdown Tier 4 VMs + ansible.builtin.command: pvesh create /nodes/proxmox/qemu/{{ item }}/status/shutdown + loop: "{{ tier4_vms }}" + ignore_errors: true + + - name: Shutdown Tier 4 LXCs + ansible.builtin.command: pvesh create /nodes/proxmox/lxc/{{ item }}/status/shutdown + loop: "{{ tier4_lxcs }}" + ignore_errors: true + + - name: Wait for Tier 4 VMs to stop + ansible.builtin.shell: > + pvesh get /nodes/proxmox/qemu/{{ item }}/status/current --output-format json | + python3 -c "import sys,json; print(json.load(sys.stdin).get('status','unknown'))" + register: t4_vm_status + until: t4_vm_status.stdout.strip() == "stopped" + retries: 12 + delay: 5 + loop: "{{ tier4_vms }}" + ignore_errors: true + + - name: Wait for Tier 4 LXCs to stop + ansible.builtin.shell: > + pvesh get /nodes/proxmox/lxc/{{ item }}/status/current --output-format json | + python3 -c "import sys,json; print(json.load(sys.stdin).get('status','unknown'))" + register: t4_lxc_status + until: t4_lxc_status.stdout.strip() == "stopped" + retries: 12 + delay: 5 + loop: "{{ tier4_lxcs }}" + ignore_errors: true + +- name: "Shutdown Tier 3 — Applications" + hosts: pve-node + gather_facts: false + tags: [shutdown] + + vars: + 
tier3_vms: [115, 110] + tier3_lxcs: [301] + + tasks: + - name: Shutdown Tier 3 VMs + ansible.builtin.command: pvesh create /nodes/proxmox/qemu/{{ item }}/status/shutdown + loop: "{{ tier3_vms }}" + ignore_errors: true + + - name: Shutdown Tier 3 LXCs + ansible.builtin.command: pvesh create /nodes/proxmox/lxc/{{ item }}/status/shutdown + loop: "{{ tier3_lxcs }}" + ignore_errors: true + + - name: Wait for Tier 3 VMs to stop + ansible.builtin.shell: > + pvesh get /nodes/proxmox/qemu/{{ item }}/status/current --output-format json | + python3 -c "import sys,json; print(json.load(sys.stdin).get('status','unknown'))" + register: t3_vm_status + until: t3_vm_status.stdout.strip() == "stopped" + retries: 12 + delay: 5 + loop: "{{ tier3_vms }}" + ignore_errors: true + + - name: Wait for Tier 3 LXCs to stop + ansible.builtin.shell: > + pvesh get /nodes/proxmox/lxc/{{ item }}/status/current --output-format json | + python3 -c "import sys,json; print(json.load(sys.stdin).get('status','unknown'))" + register: t3_lxc_status + until: t3_lxc_status.stdout.strip() == "stopped" + retries: 12 + delay: 5 + loop: "{{ tier3_lxcs }}" + ignore_errors: true + +- name: "Shutdown Tier 2 — Infrastructure" + hosts: pve-node + gather_facts: false + tags: [shutdown] + + vars: + tier2_vms: [106, 116] + tier2_lxcs: [225, 210, 227] + + tasks: + - name: Shutdown Tier 2 VMs + ansible.builtin.command: pvesh create /nodes/proxmox/qemu/{{ item }}/status/shutdown + loop: "{{ tier2_vms }}" + ignore_errors: true + + - name: Shutdown Tier 2 LXCs + ansible.builtin.command: pvesh create /nodes/proxmox/lxc/{{ item }}/status/shutdown + loop: "{{ tier2_lxcs }}" + ignore_errors: true + + - name: Wait for Tier 2 VMs to stop + ansible.builtin.shell: > + pvesh get /nodes/proxmox/qemu/{{ item }}/status/current --output-format json | + python3 -c "import sys,json; print(json.load(sys.stdin).get('status','unknown'))" + register: t2_vm_status + until: t2_vm_status.stdout.strip() == "stopped" + retries: 12 + delay: 5 + 
loop: "{{ tier2_vms }}" + ignore_errors: true + + - name: Wait for Tier 2 LXCs to stop + ansible.builtin.shell: > + pvesh get /nodes/proxmox/lxc/{{ item }}/status/current --output-format json | + python3 -c "import sys,json; print(json.load(sys.stdin).get('status','unknown'))" + register: t2_lxc_status + until: t2_lxc_status.stdout.strip() == "stopped" + retries: 12 + delay: 5 + loop: "{{ tier2_lxcs }}" + ignore_errors: true + +- name: "Shutdown Tier 1 — Databases" + hosts: pve-node + gather_facts: false + tags: [shutdown] + + vars: + tier1_vms: [112] + + tasks: + - name: Shutdown database VMs + ansible.builtin.command: pvesh create /nodes/proxmox/qemu/{{ item }}/status/shutdown + loop: "{{ tier1_vms }}" + ignore_errors: true + + - name: Wait for database VMs to stop (up to 90s) + ansible.builtin.shell: > + pvesh get /nodes/proxmox/qemu/{{ item }}/status/current --output-format json | + python3 -c "import sys,json; print(json.load(sys.stdin).get('status','unknown'))" + register: t1_vm_status + until: t1_vm_status.stdout.strip() == "stopped" + retries: 18 + delay: 5 + loop: "{{ tier1_vms }}" + ignore_errors: true + + - name: Force stop database VMs if still running + ansible.builtin.shell: > + status=$(pvesh get /nodes/proxmox/qemu/{{ item }}/status/current --output-format json | + python3 -c "import sys,json; print(json.load(sys.stdin).get('status','unknown'))"); + if [ "$status" = "running" ]; then + pvesh create /nodes/proxmox/qemu/{{ item }}/status/stop; + echo "Force stopped VM {{ item }}"; + else + echo "VM {{ item }} already stopped"; + fi + loop: "{{ tier1_vms }}" + register: force_stop_result + changed_when: force_stop_result.results | default([]) | selectattr('stdout', 'defined') | selectattr('stdout', 'search', 'Force stopped') | list | length > 0 + +- name: "Verify and reboot Proxmox host" + hosts: pve-node + gather_facts: false + tags: [reboot] + + tasks: + - name: Verify all guests are stopped (excluding LXC 304) + ansible.builtin.shell: > + 
running_vms=$(pvesh get /nodes/proxmox/qemu --output-format json | + python3 -c "import sys,json; vms=[v for v in json.load(sys.stdin) if v.get('status')=='running']; print(len(vms))"); + running_lxcs=$(pvesh get /nodes/proxmox/lxc --output-format json | + python3 -c "import sys,json; cts=[c for c in json.load(sys.stdin) if c.get('status')=='running' and c['vmid'] != 304]; print(len(cts))"); + echo "Running VMs: $running_vms, Running LXCs: $running_lxcs"; + if [ "$running_vms" != "0" ] || [ "$running_lxcs" != "0" ]; then exit 1; fi + register: verify_stopped + + - name: Issue fire-and-forget reboot (controller will be killed) + ansible.builtin.shell: > + nohup bash -c 'sleep 10 && reboot' &>/dev/null & + echo "Reboot scheduled in 10 seconds" + register: reboot_issued + when: not ansible_check_mode + + - name: Log reboot issued + ansible.builtin.debug: + msg: "{{ reboot_issued.stdout }} — Ansible process will terminate when host reboots. Post-reboot startup handled by ansible-post-reboot.service on LXC 304." diff --git a/ansible/playbooks/post-reboot-startup.yml b/ansible/playbooks/post-reboot-startup.yml new file mode 100644 index 0000000..d05c77c --- /dev/null +++ b/ansible/playbooks/post-reboot-startup.yml @@ -0,0 +1,214 @@ +--- +# Post-Reboot Startup — Controlled Guest Startup After Proxmox Reboot +# +# Starts all guests in dependency order with staggered delays to avoid +# I/O storms. Runs automatically via ansible-post-reboot.service on +# LXC 304 after the Proxmox host reboots. +# +# Can also be run manually: +# ansible-playbook /opt/ansible/playbooks/post-reboot-startup.yml +# +# Note: VM 109 (homeassistant) is excluded from Ansible inventory +# (self-managed via HA Supervisor) but is included in pvesh start/stop. 
+ +- name: Wait for Proxmox API to be ready + hosts: pve-node + gather_facts: false + tags: [startup] + + tasks: + - name: Wait for Proxmox API + ansible.builtin.command: pvesh get /version --output-format json + register: pve_version + until: pve_version.rc == 0 + retries: 30 + delay: 10 + changed_when: false + + - name: Display Proxmox version + ansible.builtin.debug: + msg: "Proxmox API ready: {{ pve_version.stdout | from_json | json_query('version') | default('unknown') }}" + +- name: "Startup Tier 1 — Databases" + hosts: pve-node + gather_facts: false + tags: [startup] + + tasks: + - name: Start database VM (112) + ansible.builtin.command: pvesh create /nodes/proxmox/qemu/112/status/start + ignore_errors: true + + - name: Wait for VM 112 to be running + ansible.builtin.shell: > + pvesh get /nodes/proxmox/qemu/112/status/current --output-format json | + python3 -c "import sys,json; print(json.load(sys.stdin).get('status','unknown'))" + register: db_status + until: db_status.stdout.strip() == "running" + retries: 12 + delay: 5 + changed_when: false + + - name: Wait for database services to initialize + ansible.builtin.pause: + seconds: 30 + +- name: "Startup Tier 2 — Infrastructure" + hosts: pve-node + gather_facts: false + tags: [startup] + + vars: + tier2_vms: [106, 116] + tier2_lxcs: [225, 210, 227] + + tasks: + - name: Start Tier 2 VMs + ansible.builtin.command: pvesh create /nodes/proxmox/qemu/{{ item }}/status/start + loop: "{{ tier2_vms }}" + ignore_errors: true + + - name: Start Tier 2 LXCs + ansible.builtin.command: pvesh create /nodes/proxmox/lxc/{{ item }}/status/start + loop: "{{ tier2_lxcs }}" + ignore_errors: true + + - name: Wait for infrastructure to come up + ansible.builtin.pause: + seconds: 30 + +- name: "Startup Tier 3 — Applications" + hosts: pve-node + gather_facts: false + tags: [startup] + + vars: + tier3_vms: [115, 110] + tier3_lxcs: [301] + + tasks: + - name: Start Tier 3 VMs + ansible.builtin.command: pvesh create 
/nodes/proxmox/qemu/{{ item }}/status/start + loop: "{{ tier3_vms }}" + ignore_errors: true + + - name: Start Tier 3 LXCs + ansible.builtin.command: pvesh create /nodes/proxmox/lxc/{{ item }}/status/start + loop: "{{ tier3_lxcs }}" + ignore_errors: true + + - name: Wait for applications to start + ansible.builtin.pause: + seconds: 30 + + - name: Restart Pi-hole container via SSH (UDP DNS fix) + ansible.builtin.command: ssh docker-home "docker restart pihole" + ignore_errors: true + + - name: Wait for Pi-hole to stabilize + ansible.builtin.pause: + seconds: 10 + +- name: "Startup Tier 4 — Media & Others" + hosts: pve-node + gather_facts: false + tags: [startup] + + vars: + tier4_vms: [109] + tier4_lxcs: [221, 222, 223, 302] + + tasks: + - name: Start Tier 4 VMs + ansible.builtin.command: pvesh create /nodes/proxmox/qemu/{{ item }}/status/start + loop: "{{ tier4_vms }}" + ignore_errors: true + + - name: Start Tier 4 LXCs + ansible.builtin.command: pvesh create /nodes/proxmox/lxc/{{ item }}/status/start + loop: "{{ tier4_lxcs }}" + ignore_errors: true + +- name: Post-reboot validation + hosts: pve-node + gather_facts: false + tags: [startup, validate] + + tasks: + - name: Wait for all services to initialize + ansible.builtin.pause: + seconds: 60 + + - name: Check all expected VMs are running + ansible.builtin.shell: > + pvesh get /nodes/proxmox/qemu --output-format json | + python3 -c " + import sys, json + vms = json.load(sys.stdin) + expected = {106, 109, 110, 112, 115, 116} + running = {v['vmid'] for v in vms if v.get('status') == 'running'} + missing = expected - running + if missing: + print(f'WARN: VMs not running: {missing}') + sys.exit(1) + print(f'All expected VMs running: {running & expected}') + " + register: vm_check + ignore_errors: true + + - name: Check all expected LXCs are running + ansible.builtin.shell: > + pvesh get /nodes/proxmox/lxc --output-format json | + python3 -c " + import sys, json + cts = json.load(sys.stdin) + # LXC 303 (mcp-gateway) 
intentionally excluded — onboot=0, operator-managed + expected = {210, 221, 222, 223, 225, 227, 301, 302, 304} + running = {c['vmid'] for c in cts if c.get('status') == 'running'} + missing = expected - running + if missing: + print(f'WARN: LXCs not running: {missing}') + sys.exit(1) + print(f'All expected LXCs running: {running & expected}') + " + register: lxc_check + ignore_errors: true + + - name: Clean up old maintenance snapshots (older than 7 days) + ansible.builtin.shell: > + cutoff=$(date -d '7 days ago' +%s); + for vmid in $(pvesh get /nodes/proxmox/qemu --output-format json | + python3 -c "import sys,json; [print(v['vmid']) for v in json.load(sys.stdin)]"); do + for snap in $(pvesh get /nodes/proxmox/qemu/$vmid/snapshot --output-format json | + python3 -c "import sys,json; [print(s['name']) for s in json.load(sys.stdin) if s['name'].startswith('pre-maintenance-')]" 2>/dev/null); do + snap_date=$(echo $snap | sed 's/pre-maintenance-//'); + snap_epoch=$(date -d "$snap_date" +%s 2>/dev/null); + if [ -z "$snap_epoch" ]; then + echo "WARN: could not parse date for snapshot $snap on VM $vmid"; + elif [ "$snap_epoch" -lt "$cutoff" ]; then + pvesh delete /nodes/proxmox/qemu/$vmid/snapshot/$snap && echo "Deleted $snap from VM $vmid"; + fi + done + done; + for ctid in $(pvesh get /nodes/proxmox/lxc --output-format json | + python3 -c "import sys,json; [print(c['vmid']) for c in json.load(sys.stdin)]"); do + for snap in $(pvesh get /nodes/proxmox/lxc/$ctid/snapshot --output-format json | + python3 -c "import sys,json; [print(s['name']) for s in json.load(sys.stdin) if s['name'].startswith('pre-maintenance-')]" 2>/dev/null); do + snap_date=$(echo $snap | sed 's/pre-maintenance-//'); + snap_epoch=$(date -d "$snap_date" +%s 2>/dev/null); + if [ -z "$snap_epoch" ]; then + echo "WARN: could not parse date for snapshot $snap on LXC $ctid"; + elif [ "$snap_epoch" -lt "$cutoff" ]; then + pvesh delete /nodes/proxmox/lxc/$ctid/snapshot/$snap && echo "Deleted $snap from LXC 
$ctid"; + fi + done + done; + echo "Snapshot cleanup complete" + ignore_errors: true + + - name: Display validation results + ansible.builtin.debug: + msg: + - "VM status: {{ vm_check.stdout }}" + - "LXC status: {{ lxc_check.stdout }}" + - "Maintenance reboot complete — post-reboot startup finished" diff --git a/ansible/systemd/ansible-monthly-reboot.service b/ansible/systemd/ansible-monthly-reboot.service new file mode 100644 index 0000000..02b2db2 --- /dev/null +++ b/ansible/systemd/ansible-monthly-reboot.service @@ -0,0 +1,15 @@ +[Unit] +Description=Monthly Proxmox maintenance reboot (Ansible) +After=network-online.target +Wants=network-online.target + +[Service] +Type=oneshot +User=cal +WorkingDirectory=/opt/ansible +ExecStart=/usr/bin/ansible-playbook /opt/ansible/playbooks/monthly-reboot.yml +StandardOutput=append:/opt/ansible/logs/monthly-reboot.log +StandardError=append:/opt/ansible/logs/monthly-reboot.log +TimeoutStartSec=900 + +# No [Install] section — this service is activated exclusively by ansible-monthly-reboot.timer diff --git a/ansible/systemd/ansible-monthly-reboot.timer b/ansible/systemd/ansible-monthly-reboot.timer new file mode 100644 index 0000000..5711dda --- /dev/null +++ b/ansible/systemd/ansible-monthly-reboot.timer @@ -0,0 +1,13 @@ +[Unit] +Description=Monthly Proxmox maintenance reboot timer +Documentation=https://git.manticorum.com/cal/claude-home/src/branch/main/server-configs/proxmox/maintenance-reboot.md + +[Timer] +# First Sunday of the month at 08:00 UTC (3:00 AM ET during EDT) +# Day range 01-07 ensures it's always the first occurrence of that weekday +OnCalendar=Sun *-*-01..07 08:00:00 +Persistent=true +RandomizedDelaySec=600 + +[Install] +WantedBy=timers.target diff --git a/ansible/systemd/ansible-post-reboot.service b/ansible/systemd/ansible-post-reboot.service new file mode 100644 index 0000000..132ac6b --- /dev/null +++ b/ansible/systemd/ansible-post-reboot.service @@ -0,0 +1,21 @@ +[Unit] +Description=Post-reboot controlled 
guest startup (Ansible) +After=network-online.target +Wants=network-online.target +# Only run after a fresh boot — not on service restart +ConditionUpTimeSec=600 + +[Service] +Type=oneshot +User=cal +WorkingDirectory=/opt/ansible +# Delay 120s to let Proxmox API stabilize and onboot guests settle +ExecStartPre=/bin/sleep 120 +ExecStart=/usr/bin/ansible-playbook /opt/ansible/playbooks/post-reboot-startup.yml +StandardOutput=append:/opt/ansible/logs/post-reboot-startup.log +StandardError=append:/opt/ansible/logs/post-reboot-startup.log +TimeoutStartSec=1800 + +[Install] +# Runs automatically on every boot of LXC 304 +WantedBy=multi-user.target diff --git a/server-configs/proxmox/maintenance-reboot.md b/server-configs/proxmox/maintenance-reboot.md index 0c72d5a..36e63da 100644 --- a/server-configs/proxmox/maintenance-reboot.md +++ b/server-configs/proxmox/maintenance-reboot.md @@ -14,7 +14,7 @@ tags: [proxmox, maintenance, reboot, ansible, operations, systemd] |--------|-------| | **Schedule** | 1st Sunday of every month, 3:00 AM ET (08:00 UTC) | | **Expected downtime** | ~15 minutes (host reboot + VM/LXC startup) | -| **Orchestration** | Ansible playbook on LXC 304 (ansible-controller) | +| **Orchestration** | Ansible on LXC 304 — shutdown playbook → host reboot → post-reboot startup playbook | | **Calendar** | Google Calendar recurring event: "Proxmox Monthly Maintenance Reboot" | | **HA DNS** | ubuntu-manticore (10.10.0.226) provides Pi-hole 2 during Proxmox downtime | @@ -24,16 +24,25 @@ tags: [proxmox, maintenance, reboot, ansible, operations, systemd] - Long uptimes allow memory leaks and process state drift (e.g., avahi busy-loops) - Validates that all VMs/LXCs auto-start cleanly with `onboot: 1` +## Architecture + +The reboot is split into two playbooks because LXC 304 (the Ansible controller) is itself a guest on the Proxmox host being rebooted: + +1. 
**`monthly-reboot.yml`** — Snapshots all guests, shuts them down in dependency order, issues a fire-and-forget `reboot` to the Proxmox host, then exits. LXC 304 is killed when the host reboots. +2. **`post-reboot-startup.yml`** — After the host reboots, LXC 304 auto-starts via `onboot: 1`. A systemd service (`ansible-post-reboot.service`) waits 120 seconds for the Proxmox API to stabilize, then starts all guests in dependency order with staggered delays. + +The `onboot: 1` flag on all production guests acts as a safety net — even if the post-reboot playbook fails, Proxmox will start everything (though without controlled ordering). + ## Prerequisites (Before Maintenance) - [ ] Verify no active Tdarr transcodes on ubuntu-manticore - [ ] Verify no running database backups -- [ ] Switch workstation DNS to `1.1.1.1` (Pi-hole 1 on VM 106 will be offline) +- [ ] Ensure workstation has Pi-hole 2 (10.10.0.226) as a fallback DNS server so it fails over automatically during downtime - [ ] Confirm ubuntu-manticore Pi-hole 2 is healthy: `ssh manticore "docker exec pihole pihole status"` ## `onboot` Audit -All production VMs and LXCs must have `onboot: 1` so they restart automatically if the playbook fails mid-sequence. +All production VMs and LXCs must have `onboot: 1` so they restart automatically as a safety net. 
**Check VMs:** ```bash @@ -55,18 +64,18 @@ done" **Audit results (2026-04-03):** -| ID | Name | Type | `onboot` | Action needed | -|----|------|------|----------|---------------| +| ID | Name | Type | `onboot` | Status | +|----|------|------|----------|--------| | 106 | docker-home | VM | 1 | OK | -| 109 | homeassistant | VM | NOT SET | **Add `onboot: 1`** | +| 109 | homeassistant | VM | 1 | OK (fixed 2026-04-03) | | 110 | discord-bots | VM | 1 | OK | | 112 | databases-bots | VM | 1 | OK | | 115 | docker-sba | VM | 1 | OK | | 116 | docker-home-servers | VM | 1 | OK | | 210 | docker-n8n-lxc | LXC | 1 | OK | -| 221 | arr-stack | LXC | NOT SET | **Add `onboot: 1`** | +| 221 | arr-stack | LXC | 1 | OK (fixed 2026-04-03) | | 222 | memos | LXC | 1 | OK | -| 223 | foundry-lxc | LXC | NOT SET | **Add `onboot: 1`** | +| 223 | foundry-lxc | LXC | 1 | OK (fixed 2026-04-03) | | 225 | gitea | LXC | 1 | OK | | 227 | uptime-kuma | LXC | 1 | OK | | 301 | claude-discord-coordinator | LXC | 1 | OK | @@ -74,16 +83,15 @@ done" | 303 | mcp-gateway | LXC | 0 | Intentional (on-demand) | | 304 | ansible-controller | LXC | 1 | OK | -**Fix missing `onboot`:** +**If any production guest is missing `onboot: 1`:** ```bash -ssh proxmox "qm set 109 --onboot 1" -ssh proxmox "pct set 221 --onboot 1" -ssh proxmox "pct set 223 --onboot 1" +ssh proxmox "qm set --onboot 1" # for VMs +ssh proxmox "pct set --onboot 1" # for LXCs ``` ## Shutdown Order (Dependency-Aware) -Reverse of the validated startup sequence. Stop consumers before their dependencies. +Reverse of the validated startup sequence. Stop consumers before their dependencies. Each tier polls per-guest status rather than using fixed waits. 
``` Tier 4 — Media & Others (no downstream dependents) @@ -92,7 +100,6 @@ Tier 4 — Media & Others (no downstream dependents) LXC 222 memos LXC 223 foundry-lxc LXC 302 claude-runner - LXC 303 mcp-gateway (if running) Tier 3 — Applications (depend on databases + infra) VM 115 docker-sba (Paper Dynasty, Major Domo) @@ -107,21 +114,19 @@ Tier 2 — Infrastructure + DNS (depend on databases) VM 116 docker-home-servers Tier 1 — Databases (no dependencies, shut down last) - VM 112 databases-bots + VM 112 databases-bots (force-stop after 90s if ACPI ignored) -Tier 0 — Ansible controller shuts itself down last - LXC 304 ansible-controller - -→ Proxmox host reboots +→ LXC 304 issues fire-and-forget reboot to Proxmox host, then is killed ``` **Known quirks:** -- VM 112 (databases-bots) may ignore ACPI shutdown — use `--forceStop` after timeout +- VM 112 (databases-bots) may ignore ACPI shutdown — playbook force-stops after 90s - VM 109 (homeassistant) is self-managed via HA Supervisor, excluded from Ansible inventory +- LXC 303 (mcp-gateway) has `onboot: 0` and is operator-managed — not included in shutdown/startup. If it was running before maintenance, bring it up manually afterward ## Startup Order (Staggered) -After the Proxmox host reboots, guests with `onboot: 1` will auto-start. 
The Ansible playbook overrides this with a controlled sequence: +After the Proxmox host reboots, LXC 304 auto-starts and the `ansible-post-reboot.service` waits 120s before running the controlled startup: ``` Tier 1 — Databases first @@ -142,8 +147,8 @@ Tier 3 — Applications LXC 301 claude-discord-coordinator → wait 30s -Pi-hole fix — restart container to clear UDP DNS bug - qm guest exec 106 -- docker restart pihole +Pi-hole fix — restart container via SSH to clear UDP DNS bug + ssh docker-home "docker restart pihole" → wait 10s Tier 4 — Media & Others @@ -151,6 +156,7 @@ Tier 4 — Media & Others LXC 221 arr-stack LXC 222 memos LXC 223 foundry-lxc + LXC 302 claude-runner ``` ## Post-Reboot Validation @@ -161,28 +167,35 @@ Tier 4 — Media & Others - [ ] Discord bots responding (check Discord) - [ ] Uptime Kuma dashboard green: `curl -sf http://10.10.0.227:3001/api/status-page/homelab` - [ ] Home Assistant running: `curl -sf http://10.10.0.109:8123/api/ -H 'Authorization: Bearer '` -- [ ] Switch workstation DNS back from `1.1.1.1` to Pi-hole +- [ ] Maintenance snapshots cleaned up (auto, 7-day retention) ## Automation -### Ansible Playbook +### Ansible Playbooks -Located at `/opt/ansible/playbooks/monthly-reboot.yml` on LXC 304. +Both located at `/opt/ansible/playbooks/` on LXC 304. 
```bash -# Dry run (check mode) +# Dry run — shutdown only ssh ansible "ansible-playbook /opt/ansible/playbooks/monthly-reboot.yml --check" -# Manual execution +# Manual full execution — shutdown + reboot ssh ansible "ansible-playbook /opt/ansible/playbooks/monthly-reboot.yml" -# Limit to shutdown only (skip reboot) +# Manual post-reboot startup (if automatic startup failed) +ssh ansible "ansible-playbook /opt/ansible/playbooks/post-reboot-startup.yml" + +# Shutdown only — skip the host reboot ssh ansible "ansible-playbook /opt/ansible/playbooks/monthly-reboot.yml --tags shutdown" ``` -### Systemd Timer +### Systemd Units (on LXC 304) -The playbook runs automatically via systemd timer on LXC 304: +| Unit | Purpose | Schedule | +|------|---------|----------| +| `ansible-monthly-reboot.timer` | Triggers shutdown + reboot playbook | 1st Sunday of month, 08:00 UTC | +| `ansible-monthly-reboot.service` | Runs `monthly-reboot.yml` | Activated by timer | +| `ansible-post-reboot.service` | Runs `post-reboot-startup.yml` | On boot (multi-user.target), only if uptime < 10 min | ```bash # Check timer status @@ -191,10 +204,32 @@ ssh ansible "systemctl status ansible-monthly-reboot.timer" # Next scheduled run ssh ansible "systemctl list-timers ansible-monthly-reboot.timer" +# Check post-reboot service status +ssh ansible "systemctl status ansible-post-reboot.service" + # Disable for a month (e.g., during an incident) ssh ansible "systemctl stop ansible-monthly-reboot.timer" ``` +### Deployment (one-time setup on LXC 304) + +```bash +# Copy playbooks +scp ansible/playbooks/monthly-reboot.yml ansible:/opt/ansible/playbooks/ +scp ansible/playbooks/post-reboot-startup.yml ansible:/opt/ansible/playbooks/ + +# Copy and enable systemd units +scp ansible/systemd/ansible-monthly-reboot.timer ansible:/etc/systemd/system/ +scp ansible/systemd/ansible-monthly-reboot.service ansible:/etc/systemd/system/ +scp ansible/systemd/ansible-post-reboot.service ansible:/etc/systemd/system/ +ssh 
ansible "sudo systemctl daemon-reload && \ + sudo systemctl enable --now ansible-monthly-reboot.timer && \ + sudo systemctl enable ansible-post-reboot.service" + +# Verify SSH key access from LXC 304 to docker-home (needed for Pi-hole restart) +ssh ansible "ssh -o BatchMode=yes docker-home 'echo ok'" +``` + ## Rollback If a guest fails to start after reboot: @@ -202,6 +237,7 @@ If a guest fails to start after reboot: 2. Review guest logs: `ssh proxmox "journalctl -u pve-guests -n 50"` 3. Manual start: `ssh proxmox "pvesh create /nodes/proxmox/qemu//status/start"` 4. If guest is corrupted, restore from the pre-reboot Proxmox snapshot +5. If post-reboot startup failed entirely, run manually: `ssh ansible "ansible-playbook /opt/ansible/playbooks/post-reboot-startup.yml"` ## Related Documentation From 95bae333090c1033e45d8393c4c82661460d8108 Mon Sep 17 00:00:00 2001 From: Cal Corum Date: Sat, 4 Apr 2026 06:07:57 -0500 Subject: [PATCH 3/6] feat: add weekly Proxmox backup verification and CT 302 self-health check (#27) Closes #27 - proxmox-backup-check.sh: SSHes to Proxmox, queries pvesh task history, classifies each running VM/CT as green/yellow/red by backup recency, posts a Discord embed summary. Designed for weekly cron on CT 302. - ct302-self-health.sh: Checks disk usage on CT 302 itself, silently exits when healthy, posts a Discord alert when any filesystem exceeds 80% threshold. Closes the blind spot where the monitoring system cannot monitor itself externally. - Updated monitoring/scripts/CONTEXT.md with full operational docs, install instructions, and cron schedules for both new scripts. 
Co-Authored-By: Claude Sonnet 4.6 --- monitoring/scripts/CONTEXT.md | 86 +++++++- monitoring/scripts/ct302-self-health.sh | 158 ++++++++++++++ monitoring/scripts/proxmox-backup-check.sh | 230 +++++++++++++++++++++ 3 files changed, 472 insertions(+), 2 deletions(-) create mode 100644 monitoring/scripts/ct302-self-health.sh create mode 100644 monitoring/scripts/proxmox-backup-check.sh diff --git a/monitoring/scripts/CONTEXT.md b/monitoring/scripts/CONTEXT.md index 2990a3f..4f06f76 100644 --- a/monitoring/scripts/CONTEXT.md +++ b/monitoring/scripts/CONTEXT.md @@ -1,9 +1,9 @@ --- title: "Monitoring Scripts Context" -description: "Operational context for all monitoring scripts: Jellyfin GPU health monitor, NVIDIA driver update checker, Tdarr API/file monitors, and Windows reboot detection. Includes cron schedules, Discord integration patterns, and troubleshooting." +description: "Operational context for all monitoring scripts: Proxmox backup checker, CT 302 self-health, Jellyfin GPU health monitor, NVIDIA driver update checker, Tdarr API/file monitors, and Windows reboot detection. Includes cron schedules, Discord integration patterns, and troubleshooting." type: context domain: monitoring -tags: [jellyfin, gpu, nvidia, tdarr, discord, cron, python, windows, scripts] +tags: [proxmox, backup, jellyfin, gpu, nvidia, tdarr, discord, cron, python, bash, windows, scripts] --- # Monitoring Scripts - Operational Context @@ -13,6 +13,77 @@ This directory contains active operational scripts for system monitoring, health ## Core Monitoring Scripts +### Proxmox Backup Verification +**Script**: `proxmox-backup-check.sh` +**Purpose**: Weekly check that every running VM/CT has a successful vzdump backup within 7 days. Posts a color-coded Discord embed with per-guest status. 
+ +**Key Features**: +- SSHes to Proxmox host and queries `pvesh` task history + guest lists via API +- Categorizes each guest: 🟢 green (backed up), 🟡 yellow (overdue), 🔴 red (no backup) +- Sorts output by VMID; only posts to Discord — no local side effects +- `--dry-run` mode prints the Discord payload without sending +- `--days N` overrides the default 7-day window + +**Schedule**: Weekly on Monday 08:00 UTC (CT 302 cron) +```bash +0 8 * * 1 DISCORD_WEBHOOK="" /root/scripts/proxmox-backup-check.sh >> /var/log/proxmox-backup-check.log 2>&1 +``` + +**Usage**: +```bash +# Dry run (no Discord) +proxmox-backup-check.sh --dry-run + +# Post to Discord +DISCORD_WEBHOOK="https://discord.com/api/webhooks/..." proxmox-backup-check.sh + +# Custom window +proxmox-backup-check.sh --days 14 --discord-webhook "https://..." +``` + +**Dependencies**: `jq`, `curl`, SSH access to Proxmox host alias `proxmox` + +**Install on CT 302**: +```bash +cp proxmox-backup-check.sh /root/scripts/ +chmod +x /root/scripts/proxmox-backup-check.sh +``` + +### CT 302 Self-Health Monitor +**Script**: `ct302-self-health.sh` +**Purpose**: Monitors disk usage on CT 302 (claude-runner) itself. Alerts to Discord when any filesystem exceeds the threshold (default 80%). Runs silently when healthy — no Discord spam on green. + +**Key Features**: +- Checks all non-virtual filesystems (`df`, excludes tmpfs/devtmpfs/overlay) +- Only sends a Discord alert when a filesystem is at or above threshold +- `--always-post` flag forces a post even when healthy (useful for testing) +- `--dry-run` mode prints payload without sending + +**Schedule**: Daily at 07:00 UTC (CT 302 cron) +```bash +0 7 * * * DISCORD_WEBHOOK="" /root/scripts/ct302-self-health.sh >> /var/log/ct302-self-health.log 2>&1 +``` + +**Usage**: +```bash +# Check and alert if over 80% +DISCORD_WEBHOOK="https://discord.com/api/webhooks/..." 
ct302-self-health.sh + +# Lower threshold test +ct302-self-health.sh --threshold 50 --dry-run + +# Always post (weekly status report pattern) +ct302-self-health.sh --always-post --discord-webhook "https://..." +``` + +**Dependencies**: `jq`, `curl`, `df` + +**Install on CT 302**: +```bash +cp ct302-self-health.sh /root/scripts/ +chmod +x /root/scripts/ct302-self-health.sh +``` + ### Jellyfin GPU Health Monitor **Script**: `jellyfin_gpu_monitor.py` **Purpose**: Monitor Jellyfin container GPU access with Discord alerts and auto-restart capability @@ -235,6 +306,17 @@ python3 tdarr_file_monitor.py >> /mnt/NV2/Development/claude-home/logs/tdarr-fil 0 9 * * 1 /usr/bin/python3 /home/cal/scripts/nvidia_update_checker.py --check --discord-alerts >> /home/cal/logs/nvidia-update-checker.log 2>&1 ``` +**Active Cron Jobs** (on CT 302 / claude-runner, root user): +```bash +# Proxmox backup verification - Weekly (Mondays at 8 AM UTC) +0 8 * * 1 DISCORD_WEBHOOK="" /root/scripts/proxmox-backup-check.sh >> /var/log/proxmox-backup-check.log 2>&1 + +# CT 302 self-health disk check - Daily at 7 AM UTC (alerts only when >80%) +0 7 * * * DISCORD_WEBHOOK="" /root/scripts/ct302-self-health.sh >> /var/log/ct302-self-health.log 2>&1 +``` + +**Note**: Scripts must be installed manually on CT 302. Source of truth is `monitoring/scripts/` in this repo — copy to `/root/scripts/` on CT 302 to deploy. + **Manual/On-Demand**: - `tdarr_monitor.py` - Run as needed for Tdarr health checks - `tdarr_file_monitor.py` - Can be scheduled if automatic backup needed diff --git a/monitoring/scripts/ct302-self-health.sh b/monitoring/scripts/ct302-self-health.sh new file mode 100644 index 0000000..e2adc2d --- /dev/null +++ b/monitoring/scripts/ct302-self-health.sh @@ -0,0 +1,158 @@ +#!/usr/bin/env bash +# ct302-self-health.sh — CT 302 (claude-runner) disk self-check → Discord +# +# Monitors disk usage on CT 302 itself and alerts to Discord when any +# filesystem exceeds the threshold. 
Closes the blind spot where the +# monitoring system cannot monitor itself via external health checks. +# +# Designed to run silently when healthy (no Discord spam on green). +# Only posts when a filesystem is at or above THRESHOLD. +# +# Usage: +# ct302-self-health.sh [--discord-webhook URL] [--threshold N] [--dry-run] [--always-post] +# +# Environment overrides: +# DISCORD_WEBHOOK Discord webhook URL (required unless --dry-run) +# DISK_THRESHOLD Disk usage % alert threshold (default: 80) +# +# Install on CT 302 (daily, 07:00 UTC): +# 0 7 * * * /root/scripts/ct302-self-health.sh >> /var/log/ct302-self-health.log 2>&1 + +set -uo pipefail + +DISK_THRESHOLD="${DISK_THRESHOLD:-80}" +DISCORD_WEBHOOK="${DISCORD_WEBHOOK:-}" +DRY_RUN=0 +ALWAYS_POST=0 + +while [[ $# -gt 0 ]]; do + case "$1" in + --discord-webhook) + if [[ $# -lt 2 ]]; then + echo "Error: --discord-webhook requires a value" >&2 + exit 1 + fi + DISCORD_WEBHOOK="$2" + shift 2 + ;; + --threshold) + if [[ $# -lt 2 ]]; then + echo "Error: --threshold requires a value" >&2 + exit 1 + fi + DISK_THRESHOLD="$2" + shift 2 + ;; + --dry-run) + DRY_RUN=1 + shift + ;; + --always-post) + ALWAYS_POST=1 + shift + ;; + *) + echo "Unknown option: $1" >&2 + exit 1 + ;; + esac +done + +if [[ "$DRY_RUN" -eq 0 && -z "$DISCORD_WEBHOOK" ]]; then + echo "Error: DISCORD_WEBHOOK not set. Use --discord-webhook URL or set env var." 
>&2
+  exit 1
+fi
+
+log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*"; }
+
+# ---------------------------------------------------------------------------
+# Check disk usage on all real filesystems
+# ---------------------------------------------------------------------------
+# df output: Filesystem Use% Mounted-on (skipping tmpfs, devtmpfs, overlay)
+TRIGGERED=()
+ALL_FS=()
+
+while IFS= read -r line; do
+  fs=$(echo "$line" | awk '{print $1}')
+  pct=$(echo "$line" | awk '{print $2}' | tr -d '%')
+  mount=$(echo "$line" | awk '{print $3}')
+  ALL_FS+=("${pct}% ${mount} (${fs})")
+  if [[ "$pct" -ge "$DISK_THRESHOLD" ]]; then
+    TRIGGERED+=("${pct}% used — ${mount} (${fs})")
+  fi
+done < <(df -h --output=source,size,used,avail,pcent,target |
+  tail -n +2 |
+  awk '$1 !~ /^(tmpfs|devtmpfs|overlay|udev)/' |
+  awk '{print $1, $5, $6}')
+
+HOSTNAME=$(hostname -s)
+TRIGGERED_COUNT=${#TRIGGERED[@]}
+
+log "Disk check complete: ${TRIGGERED_COUNT} filesystem(s) above ${DISK_THRESHOLD}%"
+
+# Exit cleanly with no Discord post if everything is healthy
+if [[ "$TRIGGERED_COUNT" -eq 0 && "$ALWAYS_POST" -eq 0 && "$DRY_RUN" -eq 0 ]]; then
+  log "All filesystems healthy — no alert needed."
+ exit 0 +fi + +# --------------------------------------------------------------------------- +# Build Discord payload +# --------------------------------------------------------------------------- +if [[ "$TRIGGERED_COUNT" -gt 0 ]]; then + EMBED_COLOR=15548997 # 0xED4245 red + TITLE="🔴 ${HOSTNAME}: Disk usage above ${DISK_THRESHOLD}%" + alert_lines=$(printf '⚠️ %s\n' "${TRIGGERED[@]}") + FIELDS=$(jq -n \ + --arg name "Filesystems Over Threshold" \ + --arg value "$alert_lines" \ + '[{"name": $name, "value": $value, "inline": false}]') +else + EMBED_COLOR=5763719 # 0x57F287 green + TITLE="🟢 ${HOSTNAME}: All filesystems healthy" + FIELDS='[]' +fi + +# Add summary of all filesystems +all_lines=$(printf '%s\n' "${ALL_FS[@]}") +FIELDS=$(echo "$FIELDS" | jq \ + --arg name "All Filesystems" \ + --arg value "$all_lines" \ + '. + [{"name": $name, "value": $value, "inline": false}]') + +FOOTER="$(date -u '+%Y-%m-%d %H:%M UTC') · CT 302 self-health · threshold: ${DISK_THRESHOLD}%" + +PAYLOAD=$(jq -n \ + --arg title "$TITLE" \ + --argjson color "$EMBED_COLOR" \ + --argjson fields "$FIELDS" \ + --arg footer "$FOOTER" \ + '{ + "embeds": [{ + "title": $title, + "color": $color, + "fields": $fields, + "footer": {"text": $footer} + }] + }') + +if [[ "$DRY_RUN" -eq 1 ]]; then + log "DRY RUN — Discord payload:" + echo "$PAYLOAD" | jq . + exit 0 +fi + +log "Posting to Discord..." +HTTP_STATUS=$(curl -s -o /tmp/ct302-self-health-discord.out \ + -w "%{http_code}" \ + -X POST "$DISCORD_WEBHOOK" \ + -H "Content-Type: application/json" \ + -d "$PAYLOAD") + +if [[ "$HTTP_STATUS" -ge 200 && "$HTTP_STATUS" -lt 300 ]]; then + log "Discord notification sent (HTTP ${HTTP_STATUS})." +else + log "Warning: Discord returned HTTP ${HTTP_STATUS}." 
+ cat /tmp/ct302-self-health-discord.out >&2 + exit 1 +fi diff --git a/monitoring/scripts/proxmox-backup-check.sh b/monitoring/scripts/proxmox-backup-check.sh new file mode 100644 index 0000000..fcc1186 --- /dev/null +++ b/monitoring/scripts/proxmox-backup-check.sh @@ -0,0 +1,230 @@ +#!/usr/bin/env bash +# proxmox-backup-check.sh — Weekly Proxmox backup verification → Discord +# +# SSHes to the Proxmox host and checks that every running VM/CT has a +# successful vzdump backup within the last 7 days. Posts a color-coded +# Discord summary with per-guest status. +# +# Usage: +# proxmox-backup-check.sh [--discord-webhook URL] [--days N] [--dry-run] +# +# Environment overrides: +# DISCORD_WEBHOOK Discord webhook URL (required unless --dry-run) +# PROXMOX_NODE Proxmox node name (default: proxmox) +# PROXMOX_SSH SSH alias or host for Proxmox (default: proxmox) +# WINDOW_DAYS Backup recency window in days (default: 7) +# +# Install on CT 302 (weekly, Monday 08:00 UTC): +# 0 8 * * 1 /root/scripts/proxmox-backup-check.sh >> /var/log/proxmox-backup-check.log 2>&1 + +set -uo pipefail + +PROXMOX_NODE="${PROXMOX_NODE:-proxmox}" +PROXMOX_SSH="${PROXMOX_SSH:-proxmox}" +WINDOW_DAYS="${WINDOW_DAYS:-7}" +DISCORD_WEBHOOK="${DISCORD_WEBHOOK:-}" +DRY_RUN=0 + +while [[ $# -gt 0 ]]; do + case "$1" in + --discord-webhook) + if [[ $# -lt 2 ]]; then + echo "Error: --discord-webhook requires a value" >&2 + exit 1 + fi + DISCORD_WEBHOOK="$2" + shift 2 + ;; + --days) + if [[ $# -lt 2 ]]; then + echo "Error: --days requires a value" >&2 + exit 1 + fi + WINDOW_DAYS="$2" + shift 2 + ;; + --dry-run) + DRY_RUN=1 + shift + ;; + *) + echo "Unknown option: $1" >&2 + exit 1 + ;; + esac +done + +if [[ "$DRY_RUN" -eq 0 && -z "$DISCORD_WEBHOOK" ]]; then + echo "Error: DISCORD_WEBHOOK not set. Use --discord-webhook URL or set env var." >&2 + exit 1 +fi + +if ! command -v jq &>/dev/null; then + echo "Error: jq is required but not installed." 
>&2 + exit 1 +fi + +SSH_OPTS="-o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o BatchMode=yes" +CUTOFF=$(date -d "-${WINDOW_DAYS} days" +%s) +NOW=$(date +%s) + +log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*"; } + +# --------------------------------------------------------------------------- +# Fetch data from Proxmox +# --------------------------------------------------------------------------- +log "Fetching VM and CT list from Proxmox node '${PROXMOX_NODE}'..." +VMS_JSON=$(ssh $SSH_OPTS "$PROXMOX_SSH" \ + "pvesh get /nodes/${PROXMOX_NODE}/qemu --output-format json 2>/dev/null" || echo "[]") +CTS_JSON=$(ssh $SSH_OPTS "$PROXMOX_SSH" \ + "pvesh get /nodes/${PROXMOX_NODE}/lxc --output-format json 2>/dev/null" || echo "[]") + +log "Fetching recent vzdump task history (limit 200)..." +TASKS_JSON=$(ssh $SSH_OPTS "$PROXMOX_SSH" \ + "pvesh get /nodes/${PROXMOX_NODE}/tasks --typefilter vzdump --limit 200 --output-format json 2>/dev/null" || echo "[]") + +# --------------------------------------------------------------------------- +# Build per-guest backup status +# --------------------------------------------------------------------------- +# Merge VMs and CTs into one list: [{vmid, name, type}] +GUESTS_JSON=$(jq -n \ + --argjson vms "$VMS_JSON" \ + --argjson cts "$CTS_JSON" ' + ($vms | map(select(.status == "running") | {vmid: (.vmid | tostring), name, type: "VM"})) + + ($cts | map(select(.status == "running") | {vmid: (.vmid | tostring), name, type: "CT"})) + ') + +GUEST_COUNT=$(echo "$GUESTS_JSON" | jq 'length') +log "Found ${GUEST_COUNT} running guests." + +# For each guest, find the most recent successful (status == "OK") vzdump task +RESULTS=$(jq -n \ + --argjson guests "$GUESTS_JSON" \ + --argjson tasks "$TASKS_JSON" \ + --argjson cutoff "$CUTOFF" \ + --argjson now "$NOW" \ + --argjson window "$WINDOW_DAYS" ' + $guests | map( + . 
as $g |
+    ($tasks | map(
+      select(
+        ((.vmid // .id) | tostring) == $g.vmid
+        and .status == "OK"
+      ) | .starttime
+    ) | max // 0) as $last_ts |
+    {
+      vmid: $g.vmid,
+      name: $g.name,
+      type: $g.type,
+      last_backup_ts: $last_ts,
+      age_days: (if $last_ts > 0 then (($now - $last_ts) / 86400 | floor) else -1 end),
+      status: (
+        if $last_ts >= $cutoff then "green"
+        elif $last_ts > 0 then "yellow"
+        else "red"
+        end
+      )
+    }
+  ) | sort_by(.vmid | tonumber)
+')
+
+GREEN_GUESTS=$(echo "$RESULTS" | jq '[.[] | select(.status == "green")]')
+YELLOW_GUESTS=$(echo "$RESULTS" | jq '[.[] | select(.status == "yellow")]')
+RED_GUESTS=$(echo "$RESULTS" | jq '[.[] | select(.status == "red")]')
+
+GREEN_COUNT=$(echo "$GREEN_GUESTS" | jq 'length')
+YELLOW_COUNT=$(echo "$YELLOW_GUESTS" | jq 'length')
+RED_COUNT=$(echo "$RED_GUESTS" | jq 'length')
+
+log "Results: ${GREEN_COUNT} green, ${YELLOW_COUNT} yellow, ${RED_COUNT} red"
+
+# ---------------------------------------------------------------------------
+# Build Discord payload
+# ---------------------------------------------------------------------------
+if [[ "$RED_COUNT" -gt 0 ]]; then
+  EMBED_COLOR=15548997 # 0xED4245 red
+  STATUS_LINE="🔴 Backup issues detected — action required"
+elif [[ "$YELLOW_COUNT" -gt 0 ]]; then
+  EMBED_COLOR=16705372 # 0xFEE75C yellow
+  STATUS_LINE="🟡 Some backups are overdue (>${WINDOW_DAYS}d)"
+else
+  EMBED_COLOR=5763719 # 0x57F287 green
+  STATUS_LINE="🟢 All ${GUEST_COUNT} guests backed up within ${WINDOW_DAYS} days"
+fi
+
+# Format guest lines: "VM 116 (plex) — 2d ago" or "CT 302 (claude-runner) — NO BACKUPS"
+format_guest() {
+  local prefix="$1" guests="$2"
+  echo "$guests" | jq -r '.[] | "\(.type) \(.vmid) (\(.name))"' |
+    while IFS= read -r line; do echo "${prefix} ${line}"; done
+}
+
+format_guest_with_age() {
+  local prefix="$1" guests="$2"
+  echo "$guests" | jq -r '.[] | "\(.type) \(.vmid) (\(.name)) — \(.age_days)d ago"' |
+    while IFS= read -r line; do echo "${prefix} ${line}"; done
+}
+
+# Build fields 
array +fields='[]' + +if [[ "$GREEN_COUNT" -gt 0 ]]; then + green_lines=$(format_guest_with_age "✅" "$GREEN_GUESTS") + fields=$(echo "$fields" | jq \ + --arg name "🟢 Healthy (${GREEN_COUNT})" \ + --arg value "$green_lines" \ + '. + [{"name": $name, "value": $value, "inline": false}]') +fi + +if [[ "$YELLOW_COUNT" -gt 0 ]]; then + yellow_lines=$(format_guest_with_age "⚠️" "$YELLOW_GUESTS") + fields=$(echo "$fields" | jq \ + --arg name "🟡 Overdue — last backup >${WINDOW_DAYS}d ago (${YELLOW_COUNT})" \ + --arg value "$yellow_lines" \ + '. + [{"name": $name, "value": $value, "inline": false}]') +fi + +if [[ "$RED_COUNT" -gt 0 ]]; then + red_lines=$(format_guest "❌" "$RED_GUESTS") + fields=$(echo "$fields" | jq \ + --arg name "🔴 No Successful Backups Found (${RED_COUNT})" \ + --arg value "$red_lines" \ + '. + [{"name": $name, "value": $value, "inline": false}]') +fi + +FOOTER="$(date -u '+%Y-%m-%d %H:%M UTC') · ${GUEST_COUNT} guests · window: ${WINDOW_DAYS}d" + +PAYLOAD=$(jq -n \ + --arg title "Proxmox Backup Check — ${STATUS_LINE}" \ + --argjson color "$EMBED_COLOR" \ + --argjson fields "$fields" \ + --arg footer "$FOOTER" \ + '{ + "embeds": [{ + "title": $title, + "color": $color, + "fields": $fields, + "footer": {"text": $footer} + }] + }') + +if [[ "$DRY_RUN" -eq 1 ]]; then + log "DRY RUN — Discord payload:" + echo "$PAYLOAD" | jq . + exit 0 +fi + +log "Posting to Discord..." +HTTP_STATUS=$(curl -s -o /tmp/proxmox-backup-check-discord.out \ + -w "%{http_code}" \ + -X POST "$DISCORD_WEBHOOK" \ + -H "Content-Type: application/json" \ + -d "$PAYLOAD") + +if [[ "$HTTP_STATUS" -ge 200 && "$HTTP_STATUS" -lt 300 ]]; then + log "Discord notification sent (HTTP ${HTTP_STATUS})." +else + log "Warning: Discord returned HTTP ${HTTP_STATUS}." 
+ cat /tmp/proxmox-backup-check-discord.out >&2 + exit 1 +fi From cacf4a9043c43732d32d1eb73cf464a0be29eeb0 Mon Sep 17 00:00:00 2001 From: Cal Corum Date: Sun, 5 Apr 2026 19:24:59 -0500 Subject: [PATCH 4/6] feat: add weekly Gitea disk cleanup Ansible playbook Gitea LXC 225 hit 100% disk from accumulated Docker buildx volumes, repo-archive cache, and journal logs. Adds automated weekly cleanup managed by systemd timer on the Ansible controller (Wed 04:00 UTC). Co-Authored-By: Claude Opus 4.6 (1M context) --- ansible/playbooks/gitea-cleanup.yml | 80 +++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) create mode 100644 ansible/playbooks/gitea-cleanup.yml diff --git a/ansible/playbooks/gitea-cleanup.yml b/ansible/playbooks/gitea-cleanup.yml new file mode 100644 index 0000000..83157c4 --- /dev/null +++ b/ansible/playbooks/gitea-cleanup.yml @@ -0,0 +1,80 @@ +--- +# gitea-cleanup.yml — Weekly cleanup of Gitea server disk space +# +# Removes stale Docker buildx volumes, unused images, Gitea repo-archive +# cache, and vacuums journal logs to prevent disk exhaustion on LXC 225. 
+# +# Schedule: Weekly via systemd timer on LXC 304 (ansible-controller) +# +# Usage: +# ansible-playbook /opt/ansible/playbooks/gitea-cleanup.yml # full run +# ansible-playbook /opt/ansible/playbooks/gitea-cleanup.yml --check # dry run + +- name: Gitea server disk cleanup + hosts: gitea + gather_facts: false + + tasks: + - name: Check current disk usage + ansible.builtin.shell: df --output=pcent / | tail -1 + register: disk_before + changed_when: false + + - name: Display current disk usage + ansible.builtin.debug: + msg: "Disk usage before cleanup: {{ disk_before.stdout | trim }}" + + - name: Clear Gitea repo-archive cache + ansible.builtin.find: + paths: /var/lib/gitea/data/repo-archive + file_type: any + register: repo_archive_files + + - name: Remove repo-archive files + ansible.builtin.file: + path: "{{ item.path }}" + state: absent + loop: "{{ repo_archive_files.files }}" + loop_control: + label: "{{ item.path | basename }}" + when: repo_archive_files.files | length > 0 + + - name: Remove orphaned Docker buildx volumes + ansible.builtin.shell: | + volumes=$(docker volume ls -q --filter name=buildx_buildkit) + if [ -n "$volumes" ]; then + echo "$volumes" | xargs docker volume rm 2>&1 + else + echo "No buildx volumes to remove" + fi + register: buildx_cleanup + changed_when: "'No buildx volumes' not in buildx_cleanup.stdout" + + - name: Prune unused Docker images + ansible.builtin.command: docker image prune -af + register: image_prune + changed_when: "'Total reclaimed space: 0B' not in image_prune.stdout" + + - name: Prune unused Docker volumes + ansible.builtin.command: docker volume prune -f + register: volume_prune + changed_when: "'Total reclaimed space: 0B' not in volume_prune.stdout" + + - name: Vacuum journal logs to 500M + ansible.builtin.command: journalctl --vacuum-size=500M + register: journal_vacuum + changed_when: "'freed 0B' not in journal_vacuum.stderr" + + - name: Check disk usage after cleanup + ansible.builtin.shell: df --output=pcent / | 
tail -1 + register: disk_after + changed_when: false + + - name: Display cleanup summary + ansible.builtin.debug: + msg: >- + Cleanup complete. + Disk: {{ disk_before.stdout | default('N/A') | trim }} → {{ disk_after.stdout | default('N/A') | trim }}. + Buildx: {{ (buildx_cleanup.stdout_lines | default(['N/A'])) | last }}. + Images: {{ (image_prune.stdout_lines | default(['N/A'])) | last }}. + Journal: {{ (journal_vacuum.stderr_lines | default(['N/A'])) | last }}. From acb8fef0843dd79201e94aae37f96a7ee43b1724 Mon Sep 17 00:00:00 2001 From: Cal Corum Date: Mon, 6 Apr 2026 00:00:03 -0500 Subject: [PATCH 5/6] =?UTF-8?q?docs:=20sync=20KB=20=E2=80=94=20database-de?= =?UTF-8?q?ployment-guide.md,refractor-in-app-test-plan.md?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- paper-dynasty/database-deployment-guide.md | 6 +- paper-dynasty/refractor-in-app-test-plan.md | 107 ++++++++++++++++++++ 2 files changed, 111 insertions(+), 2 deletions(-) create mode 100644 paper-dynasty/refractor-in-app-test-plan.md diff --git a/paper-dynasty/database-deployment-guide.md b/paper-dynasty/database-deployment-guide.md index bfb407c..014014f 100644 --- a/paper-dynasty/database-deployment-guide.md +++ b/paper-dynasty/database-deployment-guide.md @@ -178,7 +178,7 @@ When merging many PRs at once (e.g., batch pagination PRs), branch protection ru | `LOG_LEVEL` | Logging verbosity (default: INFO) | | `DATABASE_TYPE` | `postgresql` | | `POSTGRES_HOST` | Container name of PostgreSQL | -| `POSTGRES_DB` | Database name (`pd_master`) | +| `POSTGRES_DB` | Database name — `pd_master` (prod) / `paperdynasty_dev` (dev) | | `POSTGRES_USER` | DB username | | `POSTGRES_PASSWORD` | DB password | @@ -189,4 +189,6 @@ When merging many PRs at once (e.g., batch pagination PRs), branch protection ru | Database API (prod) | `ssh akamai` | `pd_api` | 815 | | Database API (dev) | `ssh pd-database` | `dev_pd_database` | 813 | | PostgreSQL (prod) | `ssh akamai` | 
`pd_postgres` | 5432 | -| PostgreSQL (dev) | `ssh pd-database` | `pd_postgres` | 5432 | +| PostgreSQL (dev) | `ssh pd-database` | `sba_postgres` | 5432 | + +**Dev database credentials:** container `sba_postgres`, database `paperdynasty_dev`, user `sba_admin`. Prod uses `pd_postgres`, database `pd_master`. diff --git a/paper-dynasty/refractor-in-app-test-plan.md b/paper-dynasty/refractor-in-app-test-plan.md new file mode 100644 index 0000000..1f250c7 --- /dev/null +++ b/paper-dynasty/refractor-in-app-test-plan.md @@ -0,0 +1,107 @@ +--- +title: "Refractor In-App Test Plan" +description: "Comprehensive manual test plan for the Refractor card evolution system — covers /refractor status, tier badges, post-game hooks, tier-up notifications, card art tiers, and known issues." +type: guide +domain: paper-dynasty +tags: [paper-dynasty, testing, refractor, discord, database] +--- + +# Refractor In-App Test Plan + +Manual test plan for the Refractor (card evolution) system. All testing targets **dev** environment (`pddev.manticorum.com` / dev Discord bot). 
+ +## Prerequisites + +- Dev bot running on `sba-bots` +- Dev API at `pddev.manticorum.com` (port 813) +- Team with seeded refractor data (team 31 from prior session) +- At least one game playable to trigger post-game hooks + +--- + +## REF-10: `/refractor status` — Basic Display + +| # | Test | Steps | Expected | +|---|---|---|---| +| 10 | No filters | `/refractor status` | Ephemeral embed with team branding, tier summary line, 10 cards sorted by tier DESC, pagination buttons if >10 cards | +| 11 | Card type filter | `/refractor status card_type:Batter` | Only batter cards shown, count matches | +| 12 | Tier filter | `/refractor status tier:T2—Refractor` | Only T2 cards, embed color changes to tier color | +| 13 | Progress filter | `/refractor status progress:Close to next tier` | Only cards >=80% to next threshold, fully evolved excluded | +| 14 | Combined filters | `/refractor status card_type:Batter tier:T1—Base Chrome` | Intersection of both filters | +| 15 | Empty result | `/refractor status tier:T4—Superfractor` (if none exist) | "No cards match your filters..." 
message with filter details | + +## REF-20: `/refractor status` — Pagination + +| # | Test | Steps | Expected | +|---|---|---|---| +| 20 | Page buttons appear | `/refractor status` with >10 cards | Prev/Next buttons visible | +| 21 | Next page | Click `Next >` | Page 2 shown, footer updates to "Page 2/N" | +| 22 | Prev page | From page 2, click `< Prev` | Back to page 1 | +| 23 | First page prev | On page 1, click `< Prev` | Nothing happens / stays on page 1 | +| 24 | Last page next | On last page, click `Next >` | Nothing happens / stays on last page | +| 25 | Button timeout | Wait 120s after command | Buttons become unresponsive | +| 26 | Wrong user clicks | Another user clicks buttons | Silently ignored | + +## REF-30: Tier Badges in Card Embeds + +| # | Test | Steps | Expected | +|---|---|---|---| +| 30 | T0 card display | View a T0 card via `/myteam` or `/roster` | No badge prefix, just player name | +| 31 | T1 badge | View a T1 card | Title shows `[BC] Player Name` | +| 32 | T2 badge | View a T2 card | Title shows `[R] Player Name` | +| 33 | T3 badge | View a T3 card | Title shows `[GR] Player Name` | +| 34 | T4 badge | View a T4 card (if exists) | Title shows `[SF] Player Name` | +| 35 | Badge in pack open | Open a pack with an evolved card | Badge appears in pack embed | +| 36 | API down gracefully | (hard to test) | Card displays normally with no badge, no error | + +## REF-50: Post-Game Hook & Tier-Up Notifications + +| # | Test | Steps | Expected | +|---|---|---|---| +| 50 | Game completes normally | Play a full game | No errors in bot logs; refractor evaluate-game fires after season-stats update | +| 51 | Tier-up notification | Play game where a card crosses a threshold | Embed in game channel: "Refractor Tier Up!", player name, tier name, correct color | +| 52 | No tier-up | Play game where no thresholds crossed | No refractor embed posted, game completes normally | +| 53 | Multiple tier-ups | Game where 2+ players tier up | One embed per tier-up, all 
posted | +| 54 | Auto-init new card | Play game with a card that has no RefractorCardState | State created automatically, player evaluated, no error | +| 55 | Superfractor notification | (may need forced data) | "SUPERFRACTOR!" title, teal color | + +## REF-60: Card Art with Tiers (API-level) + +| # | Test | Steps | Expected | +|---|---|---|---| +| 60 | T0 card image | `GET /api/v2/players/{id}/card-image?card_type=batting` | Base card, no tier styling | +| 61 | Tier override | `GET ...?card_type=batting&tier=2` | Refractor styling visible (border, diamond indicator) | +| 62 | Each tier visual | `?tier=1` through `?tier=4` | Correct border colors, diamond fill, header gradients per tier | +| 63 | Pitcher card | `?card_type=pitching&tier=2` | Tier styling applies correctly to pitcher layout | + +## REF-70: Known Issues to Verify + +| # | Issue | Check | Status | +|---|---|---|---| +| 70 | Superfractor embed says "Rating boosts coming in a future update!" | Verify — boosts ARE implemented now, text is stale | **Fix needed** | +| 71 | `on_timeout` doesn't edit message | Buttons stay visually active after 120s | **Known, low priority** | +| 72 | Card embed perf (1 API call per card) | Note latency on roster views with 10+ cards | **Monitor** | +| 73 | Season-stats failure kills refractor eval | Both in same try/except | **Known risk, verify logging** | + +--- + +## API Endpoints Under Test + +| Method | Endpoint | Used By | +|---|---|---| +| GET | `/api/v2/refractor/tracks` | Track listing | +| GET | `/api/v2/refractor/cards?team_id=X` | `/refractor status` command | +| GET | `/api/v2/refractor/cards/{card_id}` | Tier badge in card embeds | +| POST | `/api/v2/refractor/cards/{card_id}/evaluate` | Force re-evaluation | +| POST | `/api/v2/refractor/evaluate-game/{game_id}` | Post-game hook | +| GET | `/api/v2/teams/{team_id}/refractors` | Teams alias endpoint | +| GET | `/api/v2/players/{id}/card-image?tier=N` | Card art tier preview | + +## Notification Embed Colors + 
+| Tier | Name | Color | +|---|---|---| +| T1 | Base Chrome | Green (0x2ECC71) | +| T2 | Refractor | Gold (0xF1C40F) | +| T3 | Gold Refractor | Purple (0x9B59B6) | +| T4 | Superfractor | Teal (0x1ABC9C) | From dd7c68c13a8a4a242b20fc20ef01ba0c4a6218b0 Mon Sep 17 00:00:00 2001 From: Cal Corum Date: Mon, 6 Apr 2026 02:00:38 -0500 Subject: [PATCH 6/6] =?UTF-8?q?docs:=20sync=20KB=20=E2=80=94=20discord-bro?= =?UTF-8?q?wser-testing-workflow.md?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../discord-browser-testing-workflow.md | 170 ++++++++++++++++++ 1 file changed, 170 insertions(+) create mode 100644 paper-dynasty/discord-browser-testing-workflow.md diff --git a/paper-dynasty/discord-browser-testing-workflow.md b/paper-dynasty/discord-browser-testing-workflow.md new file mode 100644 index 0000000..cff469f --- /dev/null +++ b/paper-dynasty/discord-browser-testing-workflow.md @@ -0,0 +1,170 @@ +--- +title: "Discord Bot Browser Testing via Playwright + CDP" +description: "Step-by-step workflow for automated Discord bot testing using Playwright connected to Brave browser via Chrome DevTools Protocol. Covers setup, slash command execution, and screenshot capture." +type: runbook +domain: paper-dynasty +tags: [paper-dynasty, discord, testing, playwright, automation] +--- + +# Discord Bot Browser Testing via Playwright + CDP + +Automated testing of Paper Dynasty Discord bot commands by connecting Playwright to a running Brave browser instance with Discord open. + +## Prerequisites + +- Brave browser installed (`brave-browser-stable`) +- Playwright installed (`pip install playwright && playwright install chromium`) +- Discord logged in via browser (not desktop app) +- Discord bot running (locally via docker-compose or on remote host) +- Bot's `API_TOKEN` must match the target API environment + +## Setup + +### 1. Launch Brave with CDP enabled + +Brave must be started with `--remote-debugging-port`. 
If Brave is already running, **kill it first** — otherwise the flag is ignored and the new process merges into the existing one. + +```bash +killall brave && sleep 2 && brave-browser-stable --remote-debugging-port=9222 & +``` + +### 2. Verify CDP is responding + +```bash +curl -s http://localhost:9222/json/version | python3 -m json.tool +``` + +Should return JSON with `Browser`, `webSocketDebuggerUrl`, etc. + +### 3. Open Discord in browser + +Navigate to `https://discord.com/channels//` in Brave. + +**Paper Dynasty test server:** +- Server: Cals Test Server (`669356687294988350`) +- Channel: #pd-game-test (`982850262903451658`) +- URL: `https://discord.com/channels/669356687294988350/982850262903451658` + +### 4. Verify bot is running with correct API token + +```bash +# Check docker-compose.yml has the right API_TOKEN for the target environment +grep API_TOKEN /mnt/NV2/Development/paper-dynasty/discord-app/docker-compose.yml + +# Dev API token lives on the dev host: +ssh pd-database "docker exec sba_postgres psql -U sba_admin -d paperdynasty_dev -c \"SELECT 1;\"" + +# Restart bot if token was changed: +cd /mnt/NV2/Development/paper-dynasty/discord-app && docker compose up -d +``` + +## Running Commands + +### Find the Discord tab + +```python +from playwright.sync_api import sync_playwright +import time + +with sync_playwright() as p: + browser = p.chromium.connect_over_cdp('http://localhost:9222') + for ctx in browser.contexts: + for page in ctx.pages: + if 'discord' in page.url.lower(): + print(f'Found: {page.url}') + break + browser.close() +``` + +### Execute a slash command and capture result + +```python +from playwright.sync_api import sync_playwright +import time + +def run_slash_command(command: str, wait_seconds: int = 5, screenshot_path: str = '/tmp/discord_result.png'): + """ + Type a slash command in Discord, select the top autocomplete option, + submit it, wait for the bot response, and take a screenshot. 
+ """ + with sync_playwright() as p: + browser = p.chromium.connect_over_cdp('http://localhost:9222') + for ctx in browser.contexts: + for page in ctx.pages: + if 'discord' in page.url.lower(): + msg_box = page.locator('[role="textbox"][data-slate-editor="true"]') + msg_box.click() + time.sleep(0.3) + + # Type the command (delay simulates human typing for autocomplete) + msg_box.type(command, delay=80) + time.sleep(2) + + # Tab selects the top autocomplete option + page.keyboard.press('Tab') + time.sleep(1) + + # Enter submits the command + page.keyboard.press('Enter') + time.sleep(wait_seconds) + + page.screenshot(path=screenshot_path) + print(f'Screenshot saved to {screenshot_path}') + break + browser.close() + +# Example usage: +run_slash_command('/refractor status') +``` + +### Commands with parameters + +After pressing Tab to select the command, Discord shows an options panel. To fill parameters: + +1. The first parameter input is auto-focused after Tab +2. Type the value, then Tab to move to the next parameter +3. Press Enter when ready to submit + +```python +# Example: /refractor status with tier filter +msg_box.type('/refractor status', delay=80) +time.sleep(2) +page.keyboard.press('Tab') # Select command from autocomplete +time.sleep(1) +# Now fill parameters if needed, or just submit +page.keyboard.press('Enter') +``` + +## Key Selectors + +| Element | Selector | +|---------|----------| +| Message input box | `[role="textbox"][data-slate-editor="true"]` | +| Autocomplete popup | `[class*="autocomplete"]` | + +## Gotchas + +- **Brave must be killed before relaunch** — if an instance is already running, `--remote-debugging-port` is silently ignored +- **Bot token mismatch** — the bot's `API_TOKEN` in `docker-compose.yml` must match the target API (dev or prod). Symptoms: `{"detail":"Unauthorized"}` in bot logs +- **Viewport is None** — when connecting via CDP, `page.viewport_size` returns None. 
Use `page.evaluate('() => ({w: window.innerWidth, h: window.innerHeight})')` instead
+- **Autocomplete timing** — typing too fast may not trigger Discord's autocomplete. The `delay=80` on `msg_box.type()` simulates human speed
+- **Multiple bots** — if multiple bots register the same slash command (e.g. MantiTestBot and PucklTestBot), Tab selects the top option. Verify the correct bot name in the autocomplete popup before proceeding
+
+## Test Plan Reference
+
+The Refractor integration test plan is at:
+`discord-app/tests/refractor-integration-test-plan.md`
+
+Key test case groups:
+- REF-01 to REF-06: Tier badges and display
+- REF-10 to REF-15: Progress bars and filtering
+- REF-40 to REF-42: Cross-command badges (card, roster)
+- REF-70 to REF-73: Known issues to verify (the current priority)
+
+## Verified On
+
+- **Date:** 2026-04-06
+- **Browser:** Brave 146.0.7680.178 (Chromium-based)
+- **Playwright:** Node.js driver via Python sync API
+- **Bot:** MantiTestBot on Cals Test Server, #pd-game-test channel
+- **API:** pddev.manticorum.com (dev environment)