Compare commits
6 Commits
issue/29-d
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
dd7c68c13a | ||
|
|
acb8fef084 | ||
|
|
cacf4a9043 | ||
|
|
29a20fbe06 | ||
| fdc44acb28 | |||
|
|
48a804dda2 |
80
ansible/playbooks/gitea-cleanup.yml
Normal file
80
ansible/playbooks/gitea-cleanup.yml
Normal file
@ -0,0 +1,80 @@
|
||||
---
# gitea-cleanup.yml — Weekly cleanup of Gitea server disk space
#
# Removes stale Docker buildx volumes, unused images, Gitea repo-archive
# cache, and vacuums journal logs to prevent disk exhaustion on LXC 225.
#
# Schedule: Weekly via systemd timer on LXC 304 (ansible-controller)
#
# Usage:
#   ansible-playbook /opt/ansible/playbooks/gitea-cleanup.yml          # full run
#   ansible-playbook /opt/ansible/playbooks/gitea-cleanup.yml --check  # dry run

- name: Gitea server disk cleanup
  hosts: gitea
  gather_facts: false

  tasks:
    # Record usage before cleanup so the final summary can show the delta.
    - name: Check current disk usage
      ansible.builtin.shell: df --output=pcent / | tail -1
      register: disk_before
      changed_when: false

    - name: Display current disk usage
      ansible.builtin.debug:
        msg: "Disk usage before cleanup: {{ disk_before.stdout | trim }}"

    # Gitea regenerates repo archives on demand, so this cache is safe to drop.
    - name: Clear Gitea repo-archive cache
      ansible.builtin.find:
        paths: /var/lib/gitea/data/repo-archive
        file_type: any
      register: repo_archive_files

    - name: Remove repo-archive files
      ansible.builtin.file:
        path: "{{ item.path }}"
        state: absent
      loop: "{{ repo_archive_files.files }}"
      loop_control:
        label: "{{ item.path | basename }}"
      when: repo_archive_files.files | length > 0

    # Only volumes matching the buildx_buildkit name filter are removed;
    # the echo in the else branch gives changed_when a stable no-op marker.
    - name: Remove orphaned Docker buildx volumes
      ansible.builtin.shell: |
        volumes=$(docker volume ls -q --filter name=buildx_buildkit)
        if [ -n "$volumes" ]; then
          echo "$volumes" | xargs docker volume rm 2>&1
        else
          echo "No buildx volumes to remove"
        fi
      register: buildx_cleanup
      changed_when: "'No buildx volumes' not in buildx_cleanup.stdout"

    - name: Prune unused Docker images
      ansible.builtin.command: docker image prune -af
      register: image_prune
      changed_when: "'Total reclaimed space: 0B' not in image_prune.stdout"

    - name: Prune unused Docker volumes
      ansible.builtin.command: docker volume prune -f
      register: volume_prune
      changed_when: "'Total reclaimed space: 0B' not in volume_prune.stdout"

    # journalctl's vacuum summary is read from stderr here (see also the
    # summary task below, which uses stderr_lines for the same reason).
    - name: Vacuum journal logs to 500M
      ansible.builtin.command: journalctl --vacuum-size=500M
      register: journal_vacuum
      changed_when: "'freed 0B' not in journal_vacuum.stderr"

    - name: Check disk usage after cleanup
      ansible.builtin.shell: df --output=pcent / | tail -1
      register: disk_after
      changed_when: false

    # default('N/A') keeps this task safe in --check mode, where the
    # earlier shell/command tasks are skipped and leave no stdout.
    - name: Display cleanup summary
      ansible.builtin.debug:
        msg: >-
          Cleanup complete.
          Disk: {{ disk_before.stdout | default('N/A') | trim }} → {{ disk_after.stdout | default('N/A') | trim }}.
          Buildx: {{ (buildx_cleanup.stdout_lines | default(['N/A'])) | last }}.
          Images: {{ (image_prune.stdout_lines | default(['N/A'])) | last }}.
          Journal: {{ (journal_vacuum.stderr_lines | default(['N/A'])) | last }}.
|
||||
265
ansible/playbooks/monthly-reboot.yml
Normal file
265
ansible/playbooks/monthly-reboot.yml
Normal file
@ -0,0 +1,265 @@
|
||||
---
# Monthly Proxmox Maintenance Reboot — Shutdown & Reboot
#
# Orchestrates a graceful shutdown of all guests in dependency order,
# then issues a fire-and-forget reboot to the Proxmox host.
#
# After the host reboots, LXC 304 auto-starts via onboot:1 and the
# post-reboot-startup.yml playbook runs automatically via the
# ansible-post-reboot.service systemd unit (triggered by @reboot).
#
# Schedule: 1st Sunday of each month, 08:00 UTC (3 AM ET)
# Controller: LXC 304 (ansible-controller) at 10.10.0.232
#
# Usage:
#   # Dry run
#   ansible-playbook /opt/ansible/playbooks/monthly-reboot.yml --check
#
#   # Full execution
#   ansible-playbook /opt/ansible/playbooks/monthly-reboot.yml
#
#   # Shutdown only (skip the host reboot)
#   ansible-playbook /opt/ansible/playbooks/monthly-reboot.yml --tags shutdown
#
# Note: VM 109 (homeassistant) is excluded from Ansible inventory
# (self-managed via HA Supervisor) but is included in pvesh start/stop.

- name: Pre-reboot health check and snapshots
  hosts: pve-node
  gather_facts: false
  tags: [pre-reboot, shutdown]

  tasks:
    - name: Check Proxmox cluster health
      ansible.builtin.command: pvesh get /cluster/status --output-format json
      register: cluster_status
      changed_when: false

    - name: Get list of running QEMU VMs
      ansible.builtin.shell: >
        pvesh get /nodes/proxmox/qemu --output-format json |
        python3 -c "import sys,json; [print(vm['vmid']) for vm in json.load(sys.stdin) if vm.get('status')=='running']"
      register: running_vms
      changed_when: false

    - name: Get list of running LXC containers
      ansible.builtin.shell: >
        pvesh get /nodes/proxmox/lxc --output-format json |
        python3 -c "import sys,json; [print(ct['vmid']) for ct in json.load(sys.stdin) if ct.get('status')=='running']"
      register: running_lxcs
      changed_when: false

    - name: Display running guests
      ansible.builtin.debug:
        msg: "Running VMs: {{ running_vms.stdout_lines }} | Running LXCs: {{ running_lxcs.stdout_lines }}"

    # Snapshots are best-effort (ignore_errors): some guests/storage may
    # not support them, and a failed snapshot should not block the reboot.
    - name: Snapshot running VMs
      ansible.builtin.command: >
        pvesh create /nodes/proxmox/qemu/{{ item }}/snapshot
        --snapname pre-maintenance-{{ lookup('pipe', 'date +%Y-%m-%d') }}
        --description "Auto snapshot before monthly maintenance reboot"
      loop: "{{ running_vms.stdout_lines }}"
      when: running_vms.stdout_lines | length > 0
      ignore_errors: true

    - name: Snapshot running LXCs
      ansible.builtin.command: >
        pvesh create /nodes/proxmox/lxc/{{ item }}/snapshot
        --snapname pre-maintenance-{{ lookup('pipe', 'date +%Y-%m-%d') }}
        --description "Auto snapshot before monthly maintenance reboot"
      loop: "{{ running_lxcs.stdout_lines }}"
      when: running_lxcs.stdout_lines | length > 0
      ignore_errors: true

# Tiers shut down in reverse dependency order: 4 (leaf workloads) first,
# 1 (databases) last, so nothing loses a dependency while still running.
- name: "Shutdown Tier 4 — Media & Others"
  hosts: pve-node
  gather_facts: false
  tags: [shutdown]

  vars:
    tier4_vms: [109]
    # LXC 303 (mcp-gateway) is onboot=0 and operator-managed — not included here
    tier4_lxcs: [221, 222, 223, 302]

  tasks:
    - name: Shutdown Tier 4 VMs
      ansible.builtin.command: pvesh create /nodes/proxmox/qemu/{{ item }}/status/shutdown
      loop: "{{ tier4_vms }}"
      ignore_errors: true

    - name: Shutdown Tier 4 LXCs
      ansible.builtin.command: pvesh create /nodes/proxmox/lxc/{{ item }}/status/shutdown
      loop: "{{ tier4_lxcs }}"
      ignore_errors: true

    # Poll each guest until "stopped" — up to 60s per guest (12 × 5s).
    - name: Wait for Tier 4 VMs to stop
      ansible.builtin.shell: >
        pvesh get /nodes/proxmox/qemu/{{ item }}/status/current --output-format json |
        python3 -c "import sys,json; print(json.load(sys.stdin).get('status','unknown'))"
      register: t4_vm_status
      until: t4_vm_status.stdout.strip() == "stopped"
      retries: 12
      delay: 5
      loop: "{{ tier4_vms }}"
      ignore_errors: true

    - name: Wait for Tier 4 LXCs to stop
      ansible.builtin.shell: >
        pvesh get /nodes/proxmox/lxc/{{ item }}/status/current --output-format json |
        python3 -c "import sys,json; print(json.load(sys.stdin).get('status','unknown'))"
      register: t4_lxc_status
      until: t4_lxc_status.stdout.strip() == "stopped"
      retries: 12
      delay: 5
      loop: "{{ tier4_lxcs }}"
      ignore_errors: true

- name: "Shutdown Tier 3 — Applications"
  hosts: pve-node
  gather_facts: false
  tags: [shutdown]

  vars:
    tier3_vms: [115, 110]
    tier3_lxcs: [301]

  tasks:
    - name: Shutdown Tier 3 VMs
      ansible.builtin.command: pvesh create /nodes/proxmox/qemu/{{ item }}/status/shutdown
      loop: "{{ tier3_vms }}"
      ignore_errors: true

    - name: Shutdown Tier 3 LXCs
      ansible.builtin.command: pvesh create /nodes/proxmox/lxc/{{ item }}/status/shutdown
      loop: "{{ tier3_lxcs }}"
      ignore_errors: true

    - name: Wait for Tier 3 VMs to stop
      ansible.builtin.shell: >
        pvesh get /nodes/proxmox/qemu/{{ item }}/status/current --output-format json |
        python3 -c "import sys,json; print(json.load(sys.stdin).get('status','unknown'))"
      register: t3_vm_status
      until: t3_vm_status.stdout.strip() == "stopped"
      retries: 12
      delay: 5
      loop: "{{ tier3_vms }}"
      ignore_errors: true

    - name: Wait for Tier 3 LXCs to stop
      ansible.builtin.shell: >
        pvesh get /nodes/proxmox/lxc/{{ item }}/status/current --output-format json |
        python3 -c "import sys,json; print(json.load(sys.stdin).get('status','unknown'))"
      register: t3_lxc_status
      until: t3_lxc_status.stdout.strip() == "stopped"
      retries: 12
      delay: 5
      loop: "{{ tier3_lxcs }}"
      ignore_errors: true

- name: "Shutdown Tier 2 — Infrastructure"
  hosts: pve-node
  gather_facts: false
  tags: [shutdown]

  vars:
    tier2_vms: [106, 116]
    tier2_lxcs: [225, 210, 227]

  tasks:
    - name: Shutdown Tier 2 VMs
      ansible.builtin.command: pvesh create /nodes/proxmox/qemu/{{ item }}/status/shutdown
      loop: "{{ tier2_vms }}"
      ignore_errors: true

    - name: Shutdown Tier 2 LXCs
      ansible.builtin.command: pvesh create /nodes/proxmox/lxc/{{ item }}/status/shutdown
      loop: "{{ tier2_lxcs }}"
      ignore_errors: true

    - name: Wait for Tier 2 VMs to stop
      ansible.builtin.shell: >
        pvesh get /nodes/proxmox/qemu/{{ item }}/status/current --output-format json |
        python3 -c "import sys,json; print(json.load(sys.stdin).get('status','unknown'))"
      register: t2_vm_status
      until: t2_vm_status.stdout.strip() == "stopped"
      retries: 12
      delay: 5
      loop: "{{ tier2_vms }}"
      ignore_errors: true

    - name: Wait for Tier 2 LXCs to stop
      ansible.builtin.shell: >
        pvesh get /nodes/proxmox/lxc/{{ item }}/status/current --output-format json |
        python3 -c "import sys,json; print(json.load(sys.stdin).get('status','unknown'))"
      register: t2_lxc_status
      until: t2_lxc_status.stdout.strip() == "stopped"
      retries: 12
      delay: 5
      loop: "{{ tier2_lxcs }}"
      ignore_errors: true

- name: "Shutdown Tier 1 — Databases"
  hosts: pve-node
  gather_facts: false
  tags: [shutdown]

  vars:
    tier1_vms: [112]

  tasks:
    - name: Shutdown database VMs
      ansible.builtin.command: pvesh create /nodes/proxmox/qemu/{{ item }}/status/shutdown
      loop: "{{ tier1_vms }}"
      ignore_errors: true

    # Databases get a longer grace period than the other tiers (18 × 5s).
    - name: Wait for database VMs to stop (up to 90s)
      ansible.builtin.shell: >
        pvesh get /nodes/proxmox/qemu/{{ item }}/status/current --output-format json |
        python3 -c "import sys,json; print(json.load(sys.stdin).get('status','unknown'))"
      register: t1_vm_status
      until: t1_vm_status.stdout.strip() == "stopped"
      retries: 18
      delay: 5
      loop: "{{ tier1_vms }}"
      ignore_errors: true

    # Hard-stop fallback if graceful shutdown did not finish in time.
    - name: Force stop database VMs if still running
      ansible.builtin.shell: >
        status=$(pvesh get /nodes/proxmox/qemu/{{ item }}/status/current --output-format json |
        python3 -c "import sys,json; print(json.load(sys.stdin).get('status','unknown'))");
        if [ "$status" = "running" ]; then
        pvesh create /nodes/proxmox/qemu/{{ item }}/status/stop;
        echo "Force stopped VM {{ item }}";
        else
        echo "VM {{ item }} already stopped";
        fi
      loop: "{{ tier1_vms }}"
      register: force_stop_result
      # NOTE(review): inside a loop, changed_when is evaluated per item against
      # the per-item result (which has no .results key), so the default([])
      # likely makes this always false — confirm; a per-item check such as
      # "'Force stopped' in force_stop_result.stdout | default('')" may be intended.
      changed_when: force_stop_result.results | default([]) | selectattr('stdout', 'defined') | selectattr('stdout', 'search', 'Force stopped') | list | length > 0

- name: "Verify and reboot Proxmox host"
  hosts: pve-node
  gather_facts: false
  tags: [reboot]

  tasks:
    # Fails (exit 1) if any guest other than LXC 304 (this controller's
    # host-side container) is still running, aborting the reboot.
    - name: Verify all guests are stopped (excluding LXC 304)
      ansible.builtin.shell: >
        running_vms=$(pvesh get /nodes/proxmox/qemu --output-format json |
        python3 -c "import sys,json; vms=[v for v in json.load(sys.stdin) if v.get('status')=='running']; print(len(vms))");
        running_lxcs=$(pvesh get /nodes/proxmox/lxc --output-format json |
        python3 -c "import sys,json; cts=[c for c in json.load(sys.stdin) if c.get('status')=='running' and c['vmid'] != 304]; print(len(cts))");
        echo "Running VMs: $running_vms, Running LXCs: $running_lxcs";
        if [ "$running_vms" != "0" ] || [ "$running_lxcs" != "0" ]; then exit 1; fi
      register: verify_stopped

    # The 10s sleep lets this task report success before the host goes down.
    # NOTE(review): "&>" is a bashism; ansible.builtin.shell runs /bin/sh
    # (dash on Debian/Proxmox), where the redirect is parsed differently —
    # the reboot still fires, but output isn't suppressed. Consider
    # ">/dev/null 2>&1 &" — confirm on the target shell.
    - name: Issue fire-and-forget reboot (controller will be killed)
      ansible.builtin.shell: >
        nohup bash -c 'sleep 10 && reboot' &>/dev/null &
        echo "Reboot scheduled in 10 seconds"
      register: reboot_issued
      when: not ansible_check_mode

    # NOTE(review): in --check mode the previous task is skipped, so
    # reboot_issued.stdout is undefined and this debug would error — confirm
    # whether a default() is wanted here.
    - name: Log reboot issued
      ansible.builtin.debug:
        msg: "{{ reboot_issued.stdout }} — Ansible process will terminate when host reboots. Post-reboot startup handled by ansible-post-reboot.service on LXC 304."
|
||||
214
ansible/playbooks/post-reboot-startup.yml
Normal file
214
ansible/playbooks/post-reboot-startup.yml
Normal file
@ -0,0 +1,214 @@
|
||||
---
# Post-Reboot Startup — Controlled Guest Startup After Proxmox Reboot
#
# Starts all guests in dependency order with staggered delays to avoid
# I/O storms. Runs automatically via ansible-post-reboot.service on
# LXC 304 after the Proxmox host reboots.
#
# Can also be run manually:
#   ansible-playbook /opt/ansible/playbooks/post-reboot-startup.yml
#
# Note: VM 109 (homeassistant) is excluded from Ansible inventory
# (self-managed via HA Supervisor) but is included in pvesh start/stop.

- name: Wait for Proxmox API to be ready
  hosts: pve-node
  gather_facts: false
  tags: [startup]

  tasks:
    # Poll for up to 5 minutes (30 × 10s) while the Proxmox API comes up.
    - name: Wait for Proxmox API
      ansible.builtin.command: pvesh get /version --output-format json
      register: pve_version
      until: pve_version.rc == 0
      retries: 30
      delay: 10
      changed_when: false

    # Native attribute access instead of json_query — json_query requires
    # the jmespath library and the community.general collection on the
    # controller; this produces the same string with core filters only.
    - name: Display Proxmox version
      ansible.builtin.debug:
        msg: "Proxmox API ready: {{ (pve_version.stdout | from_json).version | default('unknown') }}"

# Tiers start in dependency order: databases first, leaf workloads last.
- name: "Startup Tier 1 — Databases"
  hosts: pve-node
  gather_facts: false
  tags: [startup]

  tasks:
    - name: Start database VM (112)
      ansible.builtin.command: pvesh create /nodes/proxmox/qemu/112/status/start
      ignore_errors: true

    - name: Wait for VM 112 to be running
      ansible.builtin.shell: >
        pvesh get /nodes/proxmox/qemu/112/status/current --output-format json |
        python3 -c "import sys,json; print(json.load(sys.stdin).get('status','unknown'))"
      register: db_status
      until: db_status.stdout.strip() == "running"
      retries: 12
      delay: 5
      changed_when: false

    # "running" only means the VM booted; give DB services time to come up.
    - name: Wait for database services to initialize
      ansible.builtin.pause:
        seconds: 30

- name: "Startup Tier 2 — Infrastructure"
  hosts: pve-node
  gather_facts: false
  tags: [startup]

  vars:
    tier2_vms: [106, 116]
    tier2_lxcs: [225, 210, 227]

  tasks:
    - name: Start Tier 2 VMs
      ansible.builtin.command: pvesh create /nodes/proxmox/qemu/{{ item }}/status/start
      loop: "{{ tier2_vms }}"
      ignore_errors: true

    - name: Start Tier 2 LXCs
      ansible.builtin.command: pvesh create /nodes/proxmox/lxc/{{ item }}/status/start
      loop: "{{ tier2_lxcs }}"
      ignore_errors: true

    - name: Wait for infrastructure to come up
      ansible.builtin.pause:
        seconds: 30

- name: "Startup Tier 3 — Applications"
  hosts: pve-node
  gather_facts: false
  tags: [startup]

  vars:
    tier3_vms: [115, 110]
    tier3_lxcs: [301]

  tasks:
    - name: Start Tier 3 VMs
      ansible.builtin.command: pvesh create /nodes/proxmox/qemu/{{ item }}/status/start
      loop: "{{ tier3_vms }}"
      ignore_errors: true

    - name: Start Tier 3 LXCs
      ansible.builtin.command: pvesh create /nodes/proxmox/lxc/{{ item }}/status/start
      loop: "{{ tier3_lxcs }}"
      ignore_errors: true

    - name: Wait for applications to start
      ansible.builtin.pause:
        seconds: 30

    - name: Restart Pi-hole container via SSH (UDP DNS fix)
      ansible.builtin.command: ssh docker-home "docker restart pihole"
      ignore_errors: true

    - name: Wait for Pi-hole to stabilize
      ansible.builtin.pause:
        seconds: 10

- name: "Startup Tier 4 — Media & Others"
  hosts: pve-node
  gather_facts: false
  tags: [startup]

  vars:
    tier4_vms: [109]
    tier4_lxcs: [221, 222, 223, 302]

  tasks:
    - name: Start Tier 4 VMs
      ansible.builtin.command: pvesh create /nodes/proxmox/qemu/{{ item }}/status/start
      loop: "{{ tier4_vms }}"
      ignore_errors: true

    - name: Start Tier 4 LXCs
      ansible.builtin.command: pvesh create /nodes/proxmox/lxc/{{ item }}/status/start
      loop: "{{ tier4_lxcs }}"
      ignore_errors: true

- name: Post-reboot validation
  hosts: pve-node
  gather_facts: false
  tags: [startup, validate]

  tasks:
    - name: Wait for all services to initialize
      ansible.builtin.pause:
        seconds: 60

    # Literal block (|) preserves newlines so the embedded multi-line Python
    # parses; a folded block (>) would join the statements with spaces into
    # one invalid line ("import sys, json vms = ...").
    - name: Check all expected VMs are running
      ansible.builtin.shell: |
        pvesh get /nodes/proxmox/qemu --output-format json |
        python3 -c "
        import sys, json
        vms = json.load(sys.stdin)
        expected = {106, 109, 110, 112, 115, 116}
        running = {v['vmid'] for v in vms if v.get('status') == 'running'}
        missing = expected - running
        if missing:
            print(f'WARN: VMs not running: {missing}')
            sys.exit(1)
        print(f'All expected VMs running: {running & expected}')
        "
      register: vm_check
      ignore_errors: true

    - name: Check all expected LXCs are running
      ansible.builtin.shell: |
        pvesh get /nodes/proxmox/lxc --output-format json |
        python3 -c "
        import sys, json
        cts = json.load(sys.stdin)
        # LXC 303 (mcp-gateway) intentionally excluded — onboot=0, operator-managed
        expected = {210, 221, 222, 223, 225, 227, 301, 302, 304}
        running = {c['vmid'] for c in cts if c.get('status') == 'running'}
        missing = expected - running
        if missing:
            print(f'WARN: LXCs not running: {missing}')
            sys.exit(1)
        print(f'All expected LXCs running: {running & expected}')
        "
      register: lxc_check
      ignore_errors: true

    # Literal block (|) again: folding this would glue "fi" and "done"
    # together ("fi done"), which is a shell syntax error.
    - name: Clean up old maintenance snapshots (older than 7 days)
      ansible.builtin.shell: |
        cutoff=$(date -d '7 days ago' +%s)
        for vmid in $(pvesh get /nodes/proxmox/qemu --output-format json |
            python3 -c "import sys,json; [print(v['vmid']) for v in json.load(sys.stdin)]"); do
          for snap in $(pvesh get /nodes/proxmox/qemu/$vmid/snapshot --output-format json |
              python3 -c "import sys,json; [print(s['name']) for s in json.load(sys.stdin) if s['name'].startswith('pre-maintenance-')]" 2>/dev/null); do
            snap_date=$(echo $snap | sed 's/pre-maintenance-//')
            snap_epoch=$(date -d "$snap_date" +%s 2>/dev/null)
            if [ -z "$snap_epoch" ]; then
              echo "WARN: could not parse date for snapshot $snap on VM $vmid"
            elif [ "$snap_epoch" -lt "$cutoff" ]; then
              pvesh delete /nodes/proxmox/qemu/$vmid/snapshot/$snap && echo "Deleted $snap from VM $vmid"
            fi
          done
        done
        for ctid in $(pvesh get /nodes/proxmox/lxc --output-format json |
            python3 -c "import sys,json; [print(c['vmid']) for c in json.load(sys.stdin)]"); do
          for snap in $(pvesh get /nodes/proxmox/lxc/$ctid/snapshot --output-format json |
              python3 -c "import sys,json; [print(s['name']) for s in json.load(sys.stdin) if s['name'].startswith('pre-maintenance-')]" 2>/dev/null); do
            snap_date=$(echo $snap | sed 's/pre-maintenance-//')
            snap_epoch=$(date -d "$snap_date" +%s 2>/dev/null)
            if [ -z "$snap_epoch" ]; then
              echo "WARN: could not parse date for snapshot $snap on LXC $ctid"
            elif [ "$snap_epoch" -lt "$cutoff" ]; then
              pvesh delete /nodes/proxmox/lxc/$ctid/snapshot/$snap && echo "Deleted $snap from LXC $ctid"
            fi
          done
        done
        echo "Snapshot cleanup complete"
      ignore_errors: true

    - name: Display validation results
      ansible.builtin.debug:
        msg:
          - "VM status: {{ vm_check.stdout }}"
          - "LXC status: {{ lxc_check.stdout }}"
          - "Maintenance reboot complete — post-reboot startup finished"
|
||||
15
ansible/systemd/ansible-monthly-reboot.service
Normal file
15
ansible/systemd/ansible-monthly-reboot.service
Normal file
@ -0,0 +1,15 @@
|
||||
[Unit]
Description=Monthly Proxmox maintenance reboot (Ansible)
After=network-online.target
Wants=network-online.target

[Service]
# oneshot: the unit stays "activating" for the playbook's whole run.
Type=oneshot
User=cal
WorkingDirectory=/opt/ansible
ExecStart=/usr/bin/ansible-playbook /opt/ansible/playbooks/monthly-reboot.yml
# Both streams append to a single log file so run history accumulates.
StandardOutput=append:/opt/ansible/logs/monthly-reboot.log
StandardError=append:/opt/ansible/logs/monthly-reboot.log
# 15-minute cap on the whole playbook run.
TimeoutStartSec=900

# No [Install] section — this service is activated exclusively by ansible-monthly-reboot.timer
|
||||
13
ansible/systemd/ansible-monthly-reboot.timer
Normal file
13
ansible/systemd/ansible-monthly-reboot.timer
Normal file
@ -0,0 +1,13 @@
|
||||
[Unit]
Description=Monthly Proxmox maintenance reboot timer
Documentation=https://git.manticorum.com/cal/claude-home/src/branch/main/server-configs/proxmox/maintenance-reboot.md

[Timer]
# First Sunday of the month at 08:00 UTC (3:00 AM ET during EDT)
# Day range 01-07 ensures it's always the first occurrence of that weekday
OnCalendar=Sun *-*-01..07 08:00:00
# Catch up at next boot if the scheduled time was missed while powered off.
Persistent=true
# Jitter start time by up to 10 minutes.
RandomizedDelaySec=600

[Install]
WantedBy=timers.target
|
||||
21
ansible/systemd/ansible-post-reboot.service
Normal file
21
ansible/systemd/ansible-post-reboot.service
Normal file
@ -0,0 +1,21 @@
|
||||
[Unit]
Description=Post-reboot controlled guest startup (Ansible)
After=network-online.target
Wants=network-online.target
# Only run after a fresh boot — not on service restart
# NOTE(review): verify this directive against systemd.unit(5). Without a "<"
# prefix the uptime condition holds only when uptime is ABOVE the given
# value, which would skip this unit during the first 10 minutes after boot —
# the opposite of the stated intent. "ConditionUpTimeSec=<600" (and check the
# exact directive spelling for the installed systemd version) may be what is
# wanted — confirm before relying on this.
ConditionUpTimeSec=600

[Service]
Type=oneshot
User=cal
WorkingDirectory=/opt/ansible
# Delay 120s to let Proxmox API stabilize and onboot guests settle
ExecStartPre=/bin/sleep 120
ExecStart=/usr/bin/ansible-playbook /opt/ansible/playbooks/post-reboot-startup.yml
StandardOutput=append:/opt/ansible/logs/post-reboot-startup.log
StandardError=append:/opt/ansible/logs/post-reboot-startup.log
# 30-minute cap for the full tiered startup (includes the 120s ExecStartPre).
TimeoutStartSec=1800

[Install]
# Runs automatically on every boot of LXC 304
WantedBy=multi-user.target
|
||||
@ -5,7 +5,7 @@
|
||||
# to collect system metrics, then generates a summary report.
|
||||
#
|
||||
# Usage:
|
||||
# homelab-audit.sh [--output-dir DIR]
|
||||
# homelab-audit.sh [--output-dir DIR] [--hosts label:ip,label:ip,...]
|
||||
#
|
||||
# Environment overrides:
|
||||
# STUCK_PROC_CPU_WARN CPU% at which a D-state process is flagged (default: 10)
|
||||
@ -29,7 +29,6 @@ LOAD_WARN=2.0
|
||||
MEM_WARN=85
|
||||
ZOMBIE_WARN=1
|
||||
SWAP_WARN=512
|
||||
|
||||
HOSTS_FILTER="" # comma-separated host list from --hosts; empty = audit all
|
||||
JSON_OUTPUT=0 # set to 1 by --json
|
||||
|
||||
|
||||
@ -93,6 +93,34 @@ else
|
||||
fail "disk_usage" "expected 'N /path', got: '$result'"
|
||||
fi
|
||||
|
||||
# --- --hosts flag parsing ---
|
||||
echo ""
|
||||
echo "=== --hosts argument parsing tests ==="
|
||||
|
||||
# Single host
|
||||
input="vm-115:10.10.0.88"
|
||||
IFS=',' read -ra entries <<<"$input"
|
||||
label="${entries[0]%%:*}"
|
||||
addr="${entries[0]#*:}"
|
||||
if [[ "$label" == "vm-115" && "$addr" == "10.10.0.88" ]]; then
|
||||
pass "--hosts single entry parsed: $label $addr"
|
||||
else
|
||||
fail "--hosts single" "expected 'vm-115 10.10.0.88', got: '$label $addr'"
|
||||
fi
|
||||
|
||||
# Multiple hosts
|
||||
input="vm-115:10.10.0.88,lxc-225:10.10.0.225"
|
||||
IFS=',' read -ra entries <<<"$input"
|
||||
label1="${entries[0]%%:*}"
|
||||
addr1="${entries[0]#*:}"
|
||||
label2="${entries[1]%%:*}"
|
||||
addr2="${entries[1]#*:}"
|
||||
if [[ "$label1" == "vm-115" && "$addr1" == "10.10.0.88" && "$label2" == "lxc-225" && "$addr2" == "10.10.0.225" ]]; then
|
||||
pass "--hosts multi entry parsed: $label1 $addr1, $label2 $addr2"
|
||||
else
|
||||
fail "--hosts multi" "unexpected parse result"
|
||||
fi
|
||||
|
||||
echo ""
|
||||
echo "=== Results: $PASS passed, $FAIL failed ==="
|
||||
((FAIL == 0))
|
||||
|
||||
@ -178,7 +178,7 @@ When merging many PRs at once (e.g., batch pagination PRs), branch protection ru
|
||||
| `LOG_LEVEL` | Logging verbosity (default: INFO) |
|
||||
| `DATABASE_TYPE` | `postgresql` |
|
||||
| `POSTGRES_HOST` | Container name of PostgreSQL |
|
||||
| `POSTGRES_DB` | Database name (`pd_master`) |
|
||||
| `POSTGRES_DB` | Database name — `pd_master` (prod) / `paperdynasty_dev` (dev) |
|
||||
| `POSTGRES_USER` | DB username |
|
||||
| `POSTGRES_PASSWORD` | DB password |
|
||||
|
||||
@ -189,4 +189,6 @@ When merging many PRs at once (e.g., batch pagination PRs), branch protection ru
|
||||
| Database API (prod) | `ssh akamai` | `pd_api` | 815 |
|
||||
| Database API (dev) | `ssh pd-database` | `dev_pd_database` | 813 |
|
||||
| PostgreSQL (prod) | `ssh akamai` | `pd_postgres` | 5432 |
|
||||
| PostgreSQL (dev) | `ssh pd-database` | `pd_postgres` | 5432 |
|
||||
| PostgreSQL (dev) | `ssh pd-database` | `sba_postgres` | 5432 |
|
||||
|
||||
**Dev database credentials:** container `sba_postgres`, database `paperdynasty_dev`, user `sba_admin`. Prod uses `pd_postgres`, database `pd_master`.
|
||||
|
||||
170
paper-dynasty/discord-browser-testing-workflow.md
Normal file
170
paper-dynasty/discord-browser-testing-workflow.md
Normal file
@ -0,0 +1,170 @@
|
||||
---
|
||||
title: "Discord Bot Browser Testing via Playwright + CDP"
|
||||
description: "Step-by-step workflow for automated Discord bot testing using Playwright connected to Brave browser via Chrome DevTools Protocol. Covers setup, slash command execution, and screenshot capture."
|
||||
type: runbook
|
||||
domain: paper-dynasty
|
||||
tags: [paper-dynasty, discord, testing, playwright, automation]
|
||||
---
|
||||
|
||||
# Discord Bot Browser Testing via Playwright + CDP
|
||||
|
||||
Automated testing of Paper Dynasty Discord bot commands by connecting Playwright to a running Brave browser instance with Discord open.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- Brave browser installed (`brave-browser-stable`)
|
||||
- Playwright installed (`pip install playwright && playwright install chromium`)
|
||||
- Discord logged in via browser (not desktop app)
|
||||
- Discord bot running (locally via docker-compose or on remote host)
|
||||
- Bot's `API_TOKEN` must match the target API environment
|
||||
|
||||
## Setup
|
||||
|
||||
### 1. Launch Brave with CDP enabled
|
||||
|
||||
Brave must be started with `--remote-debugging-port`. If Brave is already running, **kill it first** — otherwise the flag is ignored and the new process merges into the existing one.
|
||||
|
||||
```bash
|
||||
killall brave && sleep 2 && brave-browser-stable --remote-debugging-port=9222 &
|
||||
```
|
||||
|
||||
### 2. Verify CDP is responding
|
||||
|
||||
```bash
|
||||
curl -s http://localhost:9222/json/version | python3 -m json.tool
|
||||
```
|
||||
|
||||
Should return JSON with `Browser`, `webSocketDebuggerUrl`, etc.
|
||||
|
||||
### 3. Open Discord in browser
|
||||
|
||||
Navigate to `https://discord.com/channels/<server_id>/<channel_id>` in Brave.
|
||||
|
||||
**Paper Dynasty test server:**
|
||||
- Server: Cals Test Server (`669356687294988350`)
|
||||
- Channel: #pd-game-test (`982850262903451658`)
|
||||
- URL: `https://discord.com/channels/669356687294988350/982850262903451658`
|
||||
|
||||
### 4. Verify bot is running with correct API token
|
||||
|
||||
```bash
|
||||
# Check docker-compose.yml has the right API_TOKEN for the target environment
|
||||
grep API_TOKEN /mnt/NV2/Development/paper-dynasty/discord-app/docker-compose.yml
|
||||
|
||||
# Dev API token lives on the dev host:
|
||||
ssh pd-database "docker exec sba_postgres psql -U sba_admin -d paperdynasty_dev -c \"SELECT 1;\""
|
||||
|
||||
# Restart bot if token was changed:
|
||||
cd /mnt/NV2/Development/paper-dynasty/discord-app && docker compose up -d
|
||||
```
|
||||
|
||||
## Running Commands
|
||||
|
||||
### Find the Discord tab
|
||||
|
||||
```python
|
||||
from playwright.sync_api import sync_playwright
|
||||
import time
|
||||
|
||||
with sync_playwright() as p:
|
||||
browser = p.chromium.connect_over_cdp('http://localhost:9222')
|
||||
for ctx in browser.contexts:
|
||||
for page in ctx.pages:
|
||||
if 'discord' in page.url.lower():
|
||||
print(f'Found: {page.url}')
|
||||
break
|
||||
browser.close()
|
||||
```
|
||||
|
||||
### Execute a slash command and capture result
|
||||
|
||||
```python
|
||||
from playwright.sync_api import sync_playwright
|
||||
import time
|
||||
|
||||
def run_slash_command(command: str, wait_seconds: int = 5, screenshot_path: str = '/tmp/discord_result.png'):
|
||||
"""
|
||||
Type a slash command in Discord, select the top autocomplete option,
|
||||
submit it, wait for the bot response, and take a screenshot.
|
||||
"""
|
||||
with sync_playwright() as p:
|
||||
browser = p.chromium.connect_over_cdp('http://localhost:9222')
|
||||
for ctx in browser.contexts:
|
||||
for page in ctx.pages:
|
||||
if 'discord' in page.url.lower():
|
||||
msg_box = page.locator('[role="textbox"][data-slate-editor="true"]')
|
||||
msg_box.click()
|
||||
time.sleep(0.3)
|
||||
|
||||
# Type the command (delay simulates human typing for autocomplete)
|
||||
msg_box.type(command, delay=80)
|
||||
time.sleep(2)
|
||||
|
||||
# Tab selects the top autocomplete option
|
||||
page.keyboard.press('Tab')
|
||||
time.sleep(1)
|
||||
|
||||
# Enter submits the command
|
||||
page.keyboard.press('Enter')
|
||||
time.sleep(wait_seconds)
|
||||
|
||||
page.screenshot(path=screenshot_path)
|
||||
print(f'Screenshot saved to {screenshot_path}')
|
||||
break
|
||||
browser.close()
|
||||
|
||||
# Example usage:
|
||||
run_slash_command('/refractor status')
|
||||
```
|
||||
|
||||
### Commands with parameters
|
||||
|
||||
After pressing Tab to select the command, Discord shows an options panel. To fill parameters:
|
||||
|
||||
1. The first parameter input is auto-focused after Tab
|
||||
2. Type the value, then Tab to move to the next parameter
|
||||
3. Press Enter when ready to submit
|
||||
|
||||
```python
|
||||
# Example: /refractor status with tier filter
|
||||
msg_box.type('/refractor status', delay=80)
|
||||
time.sleep(2)
|
||||
page.keyboard.press('Tab') # Select command from autocomplete
|
||||
time.sleep(1)
|
||||
# Now fill parameters if needed, or just submit
|
||||
page.keyboard.press('Enter')
|
||||
```
|
||||
|
||||
## Key Selectors
|
||||
|
||||
| Element | Selector |
|
||||
|---------|----------|
|
||||
| Message input box | `[role="textbox"][data-slate-editor="true"]` |
|
||||
| Autocomplete popup | `[class*="autocomplete"]` |
|
||||
|
||||
## Gotchas
|
||||
|
||||
- **Brave must be killed before relaunch** — if an instance is already running, `--remote-debugging-port` is silently ignored
|
||||
- **Bot token mismatch** — the bot's `API_TOKEN` in `docker-compose.yml` must match the target API (dev or prod). Symptoms: `{"detail":"Unauthorized"}` in bot logs
|
||||
- **Viewport is None** — when connecting via CDP, `page.viewport_size` returns None. Use `page.evaluate('() => ({w: window.innerWidth, h: window.innerHeight})')` instead
|
||||
- **Autocomplete timing** — typing too fast may not trigger Discord's autocomplete. The `delay=80` on `msg_box.type()` simulates human speed
|
||||
- **Multiple bots** — if multiple bots register the same slash command (e.g. MantiTestBot and PucklTestBot), Tab selects the top option. Verify the correct bot name in the autocomplete popup before proceeding
|
||||
|
||||
## Test Plan Reference
|
||||
|
||||
The Refractor integration test plan is at:
|
||||
`discord-app/tests/refractor-integration-test-plan.md`
|
||||
|
||||
Key test case groups:
|
||||
- REF-01 to REF-06: Tier badges and display
|
||||
- REF-10 to REF-15: Progress bars and filtering
|
||||
- REF-40 to REF-42: Cross-command badges (card, roster)
|
||||
- REF-70 to REF-72: Cross-command badge propagation (the current priority)
|
||||
|
||||
## Verified On
|
||||
|
||||
- **Date:** 2026-04-06
|
||||
- **Browser:** Brave 146.0.7680.178 (Chromium-based)
|
||||
- **Playwright:** Node.js driver via Python sync API
|
||||
- **Bot:** MantiTestBot on Cals Test Server, #pd-game-test channel
|
||||
- **API:** pddev.manticorum.com (dev environment)
|
||||
107
paper-dynasty/refractor-in-app-test-plan.md
Normal file
107
paper-dynasty/refractor-in-app-test-plan.md
Normal file
@ -0,0 +1,107 @@
|
||||
---
|
||||
title: "Refractor In-App Test Plan"
|
||||
description: "Comprehensive manual test plan for the Refractor card evolution system — covers /refractor status, tier badges, post-game hooks, tier-up notifications, card art tiers, and known issues."
|
||||
type: guide
|
||||
domain: paper-dynasty
|
||||
tags: [paper-dynasty, testing, refractor, discord, database]
|
||||
---
|
||||
|
||||
# Refractor In-App Test Plan
|
||||
|
||||
Manual test plan for the Refractor (card evolution) system. All testing targets **dev** environment (`pddev.manticorum.com` / dev Discord bot).
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- Dev bot running on `sba-bots`
|
||||
- Dev API at `pddev.manticorum.com` (port 813)
|
||||
- Team with seeded refractor data (team 31 from prior session)
|
||||
- At least one game playable to trigger post-game hooks
|
||||
|
||||
---
|
||||
|
||||
## REF-10: `/refractor status` — Basic Display
|
||||
|
||||
| # | Test | Steps | Expected |
|
||||
|---|---|---|---|
|
||||
| 10 | No filters | `/refractor status` | Ephemeral embed with team branding, tier summary line, 10 cards sorted by tier DESC, pagination buttons if >10 cards |
|
||||
| 11 | Card type filter | `/refractor status card_type:Batter` | Only batter cards shown, count matches |
|
||||
| 12 | Tier filter | `/refractor status tier:T2—Refractor` | Only T2 cards, embed color changes to tier color |
|
||||
| 13 | Progress filter | `/refractor status progress:Close to next tier` | Only cards >=80% to next threshold, fully evolved excluded |
|
||||
| 14 | Combined filters | `/refractor status card_type:Batter tier:T1—Base Chrome` | Intersection of both filters |
|
||||
| 15 | Empty result | `/refractor status tier:T4—Superfractor` (if none exist) | "No cards match your filters..." message with filter details |
|
||||
|
||||
## REF-20: `/refractor status` — Pagination
|
||||
|
||||
| # | Test | Steps | Expected |
|
||||
|---|---|---|---|
|
||||
| 20 | Page buttons appear | `/refractor status` with >10 cards | Prev/Next buttons visible |
|
||||
| 21 | Next page | Click `Next >` | Page 2 shown, footer updates to "Page 2/N" |
|
||||
| 22 | Prev page | From page 2, click `< Prev` | Back to page 1 |
|
||||
| 23 | First page prev | On page 1, click `< Prev` | Nothing happens / stays on page 1 |
|
||||
| 24 | Last page next | On last page, click `Next >` | Nothing happens / stays on last page |
|
||||
| 25 | Button timeout | Wait 120s after command | Buttons become unresponsive |
|
||||
| 26 | Wrong user clicks | Another user clicks buttons | Silently ignored |
|
||||
|
||||
## REF-30: Tier Badges in Card Embeds
|
||||
|
||||
| # | Test | Steps | Expected |
|
||||
|---|---|---|---|
|
||||
| 30 | T0 card display | View a T0 card via `/myteam` or `/roster` | No badge prefix, just player name |
|
||||
| 31 | T1 badge | View a T1 card | Title shows `[BC] Player Name` |
|
||||
| 32 | T2 badge | View a T2 card | Title shows `[R] Player Name` |
|
||||
| 33 | T3 badge | View a T3 card | Title shows `[GR] Player Name` |
|
||||
| 34 | T4 badge | View a T4 card (if exists) | Title shows `[SF] Player Name` |
|
||||
| 35 | Badge in pack open | Open a pack with an evolved card | Badge appears in pack embed |
|
||||
| 36 | API down gracefully | (hard to test) | Card displays normally with no badge, no error |
|
||||
|
||||
## REF-50: Post-Game Hook & Tier-Up Notifications
|
||||
|
||||
| # | Test | Steps | Expected |
|
||||
|---|---|---|---|
|
||||
| 50 | Game completes normally | Play a full game | No errors in bot logs; refractor evaluate-game fires after season-stats update |
|
||||
| 51 | Tier-up notification | Play game where a card crosses a threshold | Embed in game channel: "Refractor Tier Up!", player name, tier name, correct color |
|
||||
| 52 | No tier-up | Play game where no thresholds crossed | No refractor embed posted, game completes normally |
|
||||
| 53 | Multiple tier-ups | Game where 2+ players tier up | One embed per tier-up, all posted |
|
||||
| 54 | Auto-init new card | Play game with a card that has no RefractorCardState | State created automatically, player evaluated, no error |
|
||||
| 55 | Superfractor notification | (may need forced data) | "SUPERFRACTOR!" title, teal color |
|
||||
|
||||
## REF-60: Card Art with Tiers (API-level)
|
||||
|
||||
| # | Test | Steps | Expected |
|
||||
|---|---|---|---|
|
||||
| 60 | T0 card image | `GET /api/v2/players/{id}/card-image?card_type=batting` | Base card, no tier styling |
|
||||
| 61 | Tier override | `GET ...?card_type=batting&tier=2` | Refractor styling visible (border, diamond indicator) |
|
||||
| 62 | Each tier visual | `?tier=1` through `?tier=4` | Correct border colors, diamond fill, header gradients per tier |
|
||||
| 63 | Pitcher card | `?card_type=pitching&tier=2` | Tier styling applies correctly to pitcher layout |
|
||||
|
||||
## REF-70: Known Issues to Verify
|
||||
|
||||
| # | Issue | Check | Status |
|
||||
|---|---|---|---|
|
||||
| 70 | Superfractor embed says "Rating boosts coming in a future update!" | Verify — boosts ARE implemented now, text is stale | **Fix needed** |
|
||||
| 71 | `on_timeout` doesn't edit message | Buttons stay visually active after 120s | **Known, low priority** |
|
||||
| 72 | Card embed perf (1 API call per card) | Note latency on roster views with 10+ cards | **Monitor** |
|
||||
| 73 | Season-stats failure kills refractor eval | Both in same try/except | **Known risk, verify logging** |
|
||||
|
||||
---
|
||||
|
||||
## API Endpoints Under Test
|
||||
|
||||
| Method | Endpoint | Used By |
|
||||
|---|---|---|
|
||||
| GET | `/api/v2/refractor/tracks` | Track listing |
|
||||
| GET | `/api/v2/refractor/cards?team_id=X` | `/refractor status` command |
|
||||
| GET | `/api/v2/refractor/cards/{card_id}` | Tier badge in card embeds |
|
||||
| POST | `/api/v2/refractor/cards/{card_id}/evaluate` | Force re-evaluation |
|
||||
| POST | `/api/v2/refractor/evaluate-game/{game_id}` | Post-game hook |
|
||||
| GET | `/api/v2/teams/{team_id}/refractors` | Teams alias endpoint |
|
||||
| GET | `/api/v2/players/{id}/card-image?tier=N` | Card art tier preview |
|
||||
|
||||
## Notification Embed Colors
|
||||
|
||||
| Tier | Name | Color |
|
||||
|---|---|---|
|
||||
| T1 | Base Chrome | Green (0x2ECC71) |
|
||||
| T2 | Refractor | Gold (0xF1C40F) |
|
||||
| T3 | Gold Refractor | Purple (0x9B59B6) |
|
||||
| T4 | Superfractor | Teal (0x1ABC9C) |
|
||||
@ -14,7 +14,7 @@ tags: [proxmox, maintenance, reboot, ansible, operations, systemd]
|
||||
|--------|-------|
|
||||
| **Schedule** | 1st Sunday of every month, 3:00 AM ET (08:00 UTC) |
|
||||
| **Expected downtime** | ~15 minutes (host reboot + VM/LXC startup) |
|
||||
| **Orchestration** | Ansible playbook on LXC 304 (ansible-controller) |
|
||||
| **Orchestration** | Ansible on LXC 304 — shutdown playbook → host reboot → post-reboot startup playbook |
|
||||
| **Calendar** | Google Calendar recurring event: "Proxmox Monthly Maintenance Reboot" |
|
||||
| **HA DNS** | ubuntu-manticore (10.10.0.226) provides Pi-hole 2 during Proxmox downtime |
|
||||
|
||||
@ -24,16 +24,25 @@ tags: [proxmox, maintenance, reboot, ansible, operations, systemd]
|
||||
- Long uptimes allow memory leaks and process state drift (e.g., avahi busy-loops)
|
||||
- Validates that all VMs/LXCs auto-start cleanly with `onboot: 1`
|
||||
|
||||
## Architecture
|
||||
|
||||
The reboot is split into two playbooks because LXC 304 (the Ansible controller) is itself a guest on the Proxmox host being rebooted:
|
||||
|
||||
1. **`monthly-reboot.yml`** — Snapshots all guests, shuts them down in dependency order, issues a fire-and-forget `reboot` to the Proxmox host, then exits. LXC 304 is killed when the host reboots.
|
||||
2. **`post-reboot-startup.yml`** — After the host reboots, LXC 304 auto-starts via `onboot: 1`. A systemd service (`ansible-post-reboot.service`) waits 120 seconds for the Proxmox API to stabilize, then starts all guests in dependency order with staggered delays.
|
||||
|
||||
The `onboot: 1` flag on all production guests acts as a safety net — even if the post-reboot playbook fails, Proxmox will start everything (though without controlled ordering).
|
||||
|
||||
## Prerequisites (Before Maintenance)
|
||||
|
||||
- [ ] Verify no active Tdarr transcodes on ubuntu-manticore
|
||||
- [ ] Verify no running database backups
|
||||
- [ ] Switch workstation DNS to `1.1.1.1` (Pi-hole 1 on VM 106 will be offline)
|
||||
- [ ] Ensure workstation has Pi-hole 2 (10.10.0.226) as a fallback DNS server so it fails over automatically during downtime
|
||||
- [ ] Confirm ubuntu-manticore Pi-hole 2 is healthy: `ssh manticore "docker exec pihole pihole status"`
|
||||
|
||||
## `onboot` Audit
|
||||
|
||||
All production VMs and LXCs must have `onboot: 1` so they restart automatically if the playbook fails mid-sequence.
|
||||
All production VMs and LXCs must have `onboot: 1` so they restart automatically as a safety net.
|
||||
|
||||
**Check VMs:**
|
||||
```bash
|
||||
@ -55,18 +64,18 @@ done"
|
||||
|
||||
**Audit results (2026-04-03):**
|
||||
|
||||
| ID | Name | Type | `onboot` | Action needed |
|
||||
|----|------|------|----------|---------------|
|
||||
| ID | Name | Type | `onboot` | Status |
|
||||
|----|------|------|----------|--------|
|
||||
| 106 | docker-home | VM | 1 | OK |
|
||||
| 109 | homeassistant | VM | NOT SET | **Add `onboot: 1`** |
|
||||
| 109 | homeassistant | VM | 1 | OK (fixed 2026-04-03) |
|
||||
| 110 | discord-bots | VM | 1 | OK |
|
||||
| 112 | databases-bots | VM | 1 | OK |
|
||||
| 115 | docker-sba | VM | 1 | OK |
|
||||
| 116 | docker-home-servers | VM | 1 | OK |
|
||||
| 210 | docker-n8n-lxc | LXC | 1 | OK |
|
||||
| 221 | arr-stack | LXC | NOT SET | **Add `onboot: 1`** |
|
||||
| 221 | arr-stack | LXC | 1 | OK (fixed 2026-04-03) |
|
||||
| 222 | memos | LXC | 1 | OK |
|
||||
| 223 | foundry-lxc | LXC | NOT SET | **Add `onboot: 1`** |
|
||||
| 223 | foundry-lxc | LXC | 1 | OK (fixed 2026-04-03) |
|
||||
| 225 | gitea | LXC | 1 | OK |
|
||||
| 227 | uptime-kuma | LXC | 1 | OK |
|
||||
| 301 | claude-discord-coordinator | LXC | 1 | OK |
|
||||
@ -74,16 +83,15 @@ done"
|
||||
| 303 | mcp-gateway | LXC | 0 | Intentional (on-demand) |
|
||||
| 304 | ansible-controller | LXC | 1 | OK |
|
||||
|
||||
**Fix missing `onboot`:**
|
||||
**If any production guest is missing `onboot: 1`:**
|
||||
```bash
|
||||
ssh proxmox "qm set 109 --onboot 1"
|
||||
ssh proxmox "pct set 221 --onboot 1"
|
||||
ssh proxmox "pct set 223 --onboot 1"
|
||||
ssh proxmox "qm set <VMID> --onboot 1" # for VMs
|
||||
ssh proxmox "pct set <CTID> --onboot 1" # for LXCs
|
||||
```
|
||||
|
||||
## Shutdown Order (Dependency-Aware)
|
||||
|
||||
Reverse of the validated startup sequence. Stop consumers before their dependencies.
|
||||
Reverse of the validated startup sequence. Stop consumers before their dependencies. Each tier polls per-guest status rather than using fixed waits.
|
||||
|
||||
```
|
||||
Tier 4 — Media & Others (no downstream dependents)
|
||||
@ -92,7 +100,6 @@ Tier 4 — Media & Others (no downstream dependents)
|
||||
LXC 222 memos
|
||||
LXC 223 foundry-lxc
|
||||
LXC 302 claude-runner
|
||||
LXC 303 mcp-gateway (if running)
|
||||
|
||||
Tier 3 — Applications (depend on databases + infra)
|
||||
VM 115 docker-sba (Paper Dynasty, Major Domo)
|
||||
@ -107,21 +114,19 @@ Tier 2 — Infrastructure + DNS (depend on databases)
|
||||
VM 116 docker-home-servers
|
||||
|
||||
Tier 1 — Databases (no dependencies, shut down last)
|
||||
VM 112 databases-bots
|
||||
VM 112 databases-bots (force-stop after 90s if ACPI ignored)
|
||||
|
||||
Tier 0 — Ansible controller shuts itself down last
|
||||
LXC 304 ansible-controller
|
||||
|
||||
→ Proxmox host reboots
|
||||
→ LXC 304 issues fire-and-forget reboot to Proxmox host, then is killed
|
||||
```
|
||||
|
||||
**Known quirks:**
|
||||
- VM 112 (databases-bots) may ignore ACPI shutdown — use `--forceStop` after timeout
|
||||
- VM 112 (databases-bots) may ignore ACPI shutdown — playbook force-stops after 90s
|
||||
- VM 109 (homeassistant) is self-managed via HA Supervisor, excluded from Ansible inventory
|
||||
- LXC 303 (mcp-gateway) has `onboot: 0` and is operator-managed — not included in shutdown/startup. If it was running before maintenance, bring it up manually afterward
|
||||
|
||||
## Startup Order (Staggered)
|
||||
|
||||
After the Proxmox host reboots, guests with `onboot: 1` will auto-start. The Ansible playbook overrides this with a controlled sequence:
|
||||
After the Proxmox host reboots, LXC 304 auto-starts and the `ansible-post-reboot.service` waits 120s before running the controlled startup:
|
||||
|
||||
```
|
||||
Tier 1 — Databases first
|
||||
@ -142,8 +147,8 @@ Tier 3 — Applications
|
||||
LXC 301 claude-discord-coordinator
|
||||
→ wait 30s
|
||||
|
||||
Pi-hole fix — restart container to clear UDP DNS bug
|
||||
qm guest exec 106 -- docker restart pihole
|
||||
Pi-hole fix — restart container via SSH to clear UDP DNS bug
|
||||
ssh docker-home "docker restart pihole"
|
||||
→ wait 10s
|
||||
|
||||
Tier 4 — Media & Others
|
||||
@ -151,6 +156,7 @@ Tier 4 — Media & Others
|
||||
LXC 221 arr-stack
|
||||
LXC 222 memos
|
||||
LXC 223 foundry-lxc
|
||||
LXC 302 claude-runner
|
||||
```
|
||||
|
||||
## Post-Reboot Validation
|
||||
@ -161,28 +167,35 @@ Tier 4 — Media & Others
|
||||
- [ ] Discord bots responding (check Discord)
|
||||
- [ ] Uptime Kuma dashboard green: `curl -sf http://10.10.0.227:3001/api/status-page/homelab`
|
||||
- [ ] Home Assistant running: `curl -sf http://10.10.0.109:8123/api/ -H 'Authorization: Bearer <token>'`
|
||||
- [ ] Switch workstation DNS back from `1.1.1.1` to Pi-hole
|
||||
- [ ] Maintenance snapshots cleaned up (auto, 7-day retention)
|
||||
|
||||
## Automation
|
||||
|
||||
### Ansible Playbook
|
||||
### Ansible Playbooks
|
||||
|
||||
Located at `/opt/ansible/playbooks/monthly-reboot.yml` on LXC 304.
|
||||
Both located at `/opt/ansible/playbooks/` on LXC 304.
|
||||
|
||||
```bash
|
||||
# Dry run (check mode)
|
||||
# Dry run — shutdown only
|
||||
ssh ansible "ansible-playbook /opt/ansible/playbooks/monthly-reboot.yml --check"
|
||||
|
||||
# Manual execution
|
||||
# Manual full execution — shutdown + reboot
|
||||
ssh ansible "ansible-playbook /opt/ansible/playbooks/monthly-reboot.yml"
|
||||
|
||||
# Limit to shutdown only (skip reboot)
|
||||
# Manual post-reboot startup (if automatic startup failed)
|
||||
ssh ansible "ansible-playbook /opt/ansible/playbooks/post-reboot-startup.yml"
|
||||
|
||||
# Shutdown only — skip the host reboot
|
||||
ssh ansible "ansible-playbook /opt/ansible/playbooks/monthly-reboot.yml --tags shutdown"
|
||||
```
|
||||
|
||||
### Systemd Timer
|
||||
### Systemd Units (on LXC 304)
|
||||
|
||||
The playbook runs automatically via systemd timer on LXC 304:
|
||||
| Unit | Purpose | Schedule |
|
||||
|------|---------|----------|
|
||||
| `ansible-monthly-reboot.timer` | Triggers shutdown + reboot playbook | 1st Sunday of month, 08:00 UTC |
|
||||
| `ansible-monthly-reboot.service` | Runs `monthly-reboot.yml` | Activated by timer |
|
||||
| `ansible-post-reboot.service` | Runs `post-reboot-startup.yml` | On boot (multi-user.target), only if uptime < 10 min |
|
||||
|
||||
```bash
|
||||
# Check timer status
|
||||
@ -191,10 +204,32 @@ ssh ansible "systemctl status ansible-monthly-reboot.timer"
|
||||
# Next scheduled run
|
||||
ssh ansible "systemctl list-timers ansible-monthly-reboot.timer"
|
||||
|
||||
# Check post-reboot service status
|
||||
ssh ansible "systemctl status ansible-post-reboot.service"
|
||||
|
||||
# Disable for a month (e.g., during an incident)
|
||||
ssh ansible "systemctl stop ansible-monthly-reboot.timer"
|
||||
```
|
||||
|
||||
### Deployment (one-time setup on LXC 304)
|
||||
|
||||
```bash
|
||||
# Copy playbooks
|
||||
scp ansible/playbooks/monthly-reboot.yml ansible:/opt/ansible/playbooks/
|
||||
scp ansible/playbooks/post-reboot-startup.yml ansible:/opt/ansible/playbooks/
|
||||
|
||||
# Copy and enable systemd units
|
||||
scp ansible/systemd/ansible-monthly-reboot.timer ansible:/etc/systemd/system/
|
||||
scp ansible/systemd/ansible-monthly-reboot.service ansible:/etc/systemd/system/
|
||||
scp ansible/systemd/ansible-post-reboot.service ansible:/etc/systemd/system/
|
||||
ssh ansible "sudo systemctl daemon-reload && \
|
||||
sudo systemctl enable --now ansible-monthly-reboot.timer && \
|
||||
sudo systemctl enable ansible-post-reboot.service"
|
||||
|
||||
# Verify SSH key access from LXC 304 to docker-home (needed for Pi-hole restart)
|
||||
ssh ansible "ssh -o BatchMode=yes docker-home 'echo ok'"
|
||||
```
|
||||
|
||||
## Rollback
|
||||
|
||||
If a guest fails to start after reboot:
|
||||
@ -202,6 +237,7 @@ If a guest fails to start after reboot:
|
||||
2. Review guest logs: `ssh proxmox "journalctl -u pve-guests -n 50"`
|
||||
3. Manual start: `ssh proxmox "pvesh create /nodes/proxmox/qemu/<VMID>/status/start"`
|
||||
4. If guest is corrupted, restore from the pre-reboot Proxmox snapshot
|
||||
5. If post-reboot startup failed entirely, run manually: `ssh ansible "ansible-playbook /opt/ansible/playbooks/post-reboot-startup.yml"`
|
||||
|
||||
## Related Documentation
|
||||
|
||||
|
||||
@ -12,5 +12,5 @@ ostype: l26
|
||||
scsi0: local-lvm:vm-115-disk-0,size=256G
|
||||
scsihw: virtio-scsi-pci
|
||||
smbios1: uuid=19be98ee-f60d-473d-acd2-9164717fcd11
|
||||
sockets: 2
|
||||
sockets: 1
|
||||
vmgenid: 682dfeab-8c63-4f0b-8ed2-8828c2f808ef
|
||||
|
||||
Loading…
Reference in New Issue
Block a user