From 29a20fbe06f8addaa0ca9894a3be61dde5cffa5b Mon Sep 17 00:00:00 2001 From: Cal Corum Date: Fri, 3 Apr 2026 16:17:55 -0500 Subject: [PATCH] feat: add monthly Proxmox maintenance reboot automation (#26) Establishes a first-Sunday-of-the-month maintenance window orchestrated by Ansible on LXC 304. Split into two playbooks to handle the self-reboot paradox (the controller is a guest on the host being rebooted): - monthly-reboot.yml: snapshots, tiered shutdown with per-guest polling, fire-and-forget host reboot - post-reboot-startup.yml: controlled tiered startup with staggered delays, Pi-hole UDP DNS fix, validation, and snapshot cleanup Also fixes onboot:1 on VM 109, LXC 221, LXC 223 and creates a recurring Google Calendar event for the maintenance window. Closes #26 Co-Authored-By: Claude Opus 4.6 (1M context) --- ansible/playbooks/monthly-reboot.yml | 265 ++++++++++++++++++ ansible/playbooks/post-reboot-startup.yml | 214 ++++++++++++++ .../systemd/ansible-monthly-reboot.service | 15 + ansible/systemd/ansible-monthly-reboot.timer | 13 + ansible/systemd/ansible-post-reboot.service | 21 ++ server-configs/proxmox/maintenance-reboot.md | 98 +++++-- 6 files changed, 595 insertions(+), 31 deletions(-) create mode 100644 ansible/playbooks/monthly-reboot.yml create mode 100644 ansible/playbooks/post-reboot-startup.yml create mode 100644 ansible/systemd/ansible-monthly-reboot.service create mode 100644 ansible/systemd/ansible-monthly-reboot.timer create mode 100644 ansible/systemd/ansible-post-reboot.service diff --git a/ansible/playbooks/monthly-reboot.yml b/ansible/playbooks/monthly-reboot.yml new file mode 100644 index 0000000..f3a77c8 --- /dev/null +++ b/ansible/playbooks/monthly-reboot.yml @@ -0,0 +1,265 @@ +--- +# Monthly Proxmox Maintenance Reboot — Shutdown & Reboot +# +# Orchestrates a graceful shutdown of all guests in dependency order, +# then issues a fire-and-forget reboot to the Proxmox host. 
+# +# After the host reboots, LXC 304 auto-starts via onboot:1 and the +# post-reboot-startup.yml playbook runs automatically via the +# ansible-post-reboot.service systemd unit (triggered by @reboot). +# +# Schedule: 1st Sunday of each month, 08:00 UTC (3 AM ET) +# Controller: LXC 304 (ansible-controller) at 10.10.0.232 +# +# Usage: +# # Dry run +# ansible-playbook /opt/ansible/playbooks/monthly-reboot.yml --check +# +# # Full execution +# ansible-playbook /opt/ansible/playbooks/monthly-reboot.yml +# +# # Shutdown only (skip the host reboot) +# ansible-playbook /opt/ansible/playbooks/monthly-reboot.yml --tags shutdown +# +# Note: VM 109 (homeassistant) is excluded from Ansible inventory +# (self-managed via HA Supervisor) but is included in pvesh start/stop. + +- name: Pre-reboot health check and snapshots + hosts: pve-node + gather_facts: false + tags: [pre-reboot, shutdown] + + tasks: + - name: Check Proxmox cluster health + ansible.builtin.command: pvesh get /cluster/status --output-format json + register: cluster_status + changed_when: false + + - name: Get list of running QEMU VMs + ansible.builtin.shell: > + pvesh get /nodes/proxmox/qemu --output-format json | + python3 -c "import sys,json; [print(vm['vmid']) for vm in json.load(sys.stdin) if vm.get('status')=='running']" + register: running_vms + changed_when: false + + - name: Get list of running LXC containers + ansible.builtin.shell: > + pvesh get /nodes/proxmox/lxc --output-format json | + python3 -c "import sys,json; [print(ct['vmid']) for ct in json.load(sys.stdin) if ct.get('status')=='running']" + register: running_lxcs + changed_when: false + + - name: Display running guests + ansible.builtin.debug: + msg: "Running VMs: {{ running_vms.stdout_lines }} | Running LXCs: {{ running_lxcs.stdout_lines }}" + + - name: Snapshot running VMs + ansible.builtin.command: > + pvesh create /nodes/proxmox/qemu/{{ item }}/snapshot + --snapname pre-maintenance-{{ lookup('pipe', 'date +%Y-%m-%d') }} + --description 
"Auto snapshot before monthly maintenance reboot" + loop: "{{ running_vms.stdout_lines }}" + when: running_vms.stdout_lines | length > 0 + ignore_errors: true + + - name: Snapshot running LXCs + ansible.builtin.command: > + pvesh create /nodes/proxmox/lxc/{{ item }}/snapshot + --snapname pre-maintenance-{{ lookup('pipe', 'date +%Y-%m-%d') }} + --description "Auto snapshot before monthly maintenance reboot" + loop: "{{ running_lxcs.stdout_lines }}" + when: running_lxcs.stdout_lines | length > 0 + ignore_errors: true + +- name: "Shutdown Tier 4 — Media & Others" + hosts: pve-node + gather_facts: false + tags: [shutdown] + + vars: + tier4_vms: [109] + # LXC 303 (mcp-gateway) is onboot=0 and operator-managed — not included here + tier4_lxcs: [221, 222, 223, 302] + + tasks: + - name: Shutdown Tier 4 VMs + ansible.builtin.command: pvesh create /nodes/proxmox/qemu/{{ item }}/status/shutdown + loop: "{{ tier4_vms }}" + ignore_errors: true + + - name: Shutdown Tier 4 LXCs + ansible.builtin.command: pvesh create /nodes/proxmox/lxc/{{ item }}/status/shutdown + loop: "{{ tier4_lxcs }}" + ignore_errors: true + + - name: Wait for Tier 4 VMs to stop + ansible.builtin.shell: > + pvesh get /nodes/proxmox/qemu/{{ item }}/status/current --output-format json | + python3 -c "import sys,json; print(json.load(sys.stdin).get('status','unknown'))" + register: t4_vm_status + until: t4_vm_status.stdout.strip() == "stopped" + retries: 12 + delay: 5 + loop: "{{ tier4_vms }}" + ignore_errors: true + + - name: Wait for Tier 4 LXCs to stop + ansible.builtin.shell: > + pvesh get /nodes/proxmox/lxc/{{ item }}/status/current --output-format json | + python3 -c "import sys,json; print(json.load(sys.stdin).get('status','unknown'))" + register: t4_lxc_status + until: t4_lxc_status.stdout.strip() == "stopped" + retries: 12 + delay: 5 + loop: "{{ tier4_lxcs }}" + ignore_errors: true + +- name: "Shutdown Tier 3 — Applications" + hosts: pve-node + gather_facts: false + tags: [shutdown] + + vars: + 
tier3_vms: [115, 110] + tier3_lxcs: [301] + + tasks: + - name: Shutdown Tier 3 VMs + ansible.builtin.command: pvesh create /nodes/proxmox/qemu/{{ item }}/status/shutdown + loop: "{{ tier3_vms }}" + ignore_errors: true + + - name: Shutdown Tier 3 LXCs + ansible.builtin.command: pvesh create /nodes/proxmox/lxc/{{ item }}/status/shutdown + loop: "{{ tier3_lxcs }}" + ignore_errors: true + + - name: Wait for Tier 3 VMs to stop + ansible.builtin.shell: > + pvesh get /nodes/proxmox/qemu/{{ item }}/status/current --output-format json | + python3 -c "import sys,json; print(json.load(sys.stdin).get('status','unknown'))" + register: t3_vm_status + until: t3_vm_status.stdout.strip() == "stopped" + retries: 12 + delay: 5 + loop: "{{ tier3_vms }}" + ignore_errors: true + + - name: Wait for Tier 3 LXCs to stop + ansible.builtin.shell: > + pvesh get /nodes/proxmox/lxc/{{ item }}/status/current --output-format json | + python3 -c "import sys,json; print(json.load(sys.stdin).get('status','unknown'))" + register: t3_lxc_status + until: t3_lxc_status.stdout.strip() == "stopped" + retries: 12 + delay: 5 + loop: "{{ tier3_lxcs }}" + ignore_errors: true + +- name: "Shutdown Tier 2 — Infrastructure" + hosts: pve-node + gather_facts: false + tags: [shutdown] + + vars: + tier2_vms: [106, 116] + tier2_lxcs: [225, 210, 227] + + tasks: + - name: Shutdown Tier 2 VMs + ansible.builtin.command: pvesh create /nodes/proxmox/qemu/{{ item }}/status/shutdown + loop: "{{ tier2_vms }}" + ignore_errors: true + + - name: Shutdown Tier 2 LXCs + ansible.builtin.command: pvesh create /nodes/proxmox/lxc/{{ item }}/status/shutdown + loop: "{{ tier2_lxcs }}" + ignore_errors: true + + - name: Wait for Tier 2 VMs to stop + ansible.builtin.shell: > + pvesh get /nodes/proxmox/qemu/{{ item }}/status/current --output-format json | + python3 -c "import sys,json; print(json.load(sys.stdin).get('status','unknown'))" + register: t2_vm_status + until: t2_vm_status.stdout.strip() == "stopped" + retries: 12 + delay: 5 + 
loop: "{{ tier2_vms }}" + ignore_errors: true + + - name: Wait for Tier 2 LXCs to stop + ansible.builtin.shell: > + pvesh get /nodes/proxmox/lxc/{{ item }}/status/current --output-format json | + python3 -c "import sys,json; print(json.load(sys.stdin).get('status','unknown'))" + register: t2_lxc_status + until: t2_lxc_status.stdout.strip() == "stopped" + retries: 12 + delay: 5 + loop: "{{ tier2_lxcs }}" + ignore_errors: true + +- name: "Shutdown Tier 1 — Databases" + hosts: pve-node + gather_facts: false + tags: [shutdown] + + vars: + tier1_vms: [112] + + tasks: + - name: Shutdown database VMs + ansible.builtin.command: pvesh create /nodes/proxmox/qemu/{{ item }}/status/shutdown + loop: "{{ tier1_vms }}" + ignore_errors: true + + - name: Wait for database VMs to stop (up to 90s) + ansible.builtin.shell: > + pvesh get /nodes/proxmox/qemu/{{ item }}/status/current --output-format json | + python3 -c "import sys,json; print(json.load(sys.stdin).get('status','unknown'))" + register: t1_vm_status + until: t1_vm_status.stdout.strip() == "stopped" + retries: 18 + delay: 5 + loop: "{{ tier1_vms }}" + ignore_errors: true + + - name: Force stop database VMs if still running + ansible.builtin.shell: > + status=$(pvesh get /nodes/proxmox/qemu/{{ item }}/status/current --output-format json | + python3 -c "import sys,json; print(json.load(sys.stdin).get('status','unknown'))"); + if [ "$status" = "running" ]; then + pvesh create /nodes/proxmox/qemu/{{ item }}/status/stop; + echo "Force stopped VM {{ item }}"; + else + echo "VM {{ item }} already stopped"; + fi + loop: "{{ tier1_vms }}" + register: force_stop_result + changed_when: force_stop_result.results | default([]) | selectattr('stdout', 'defined') | selectattr('stdout', 'search', 'Force stopped') | list | length > 0 + +- name: "Verify and reboot Proxmox host" + hosts: pve-node + gather_facts: false + tags: [reboot] + + tasks: + - name: Verify all guests are stopped (excluding LXC 304) + ansible.builtin.shell: > + 
running_vms=$(pvesh get /nodes/proxmox/qemu --output-format json | + python3 -c "import sys,json; vms=[v for v in json.load(sys.stdin) if v.get('status')=='running']; print(len(vms))"); + running_lxcs=$(pvesh get /nodes/proxmox/lxc --output-format json | + python3 -c "import sys,json; cts=[c for c in json.load(sys.stdin) if c.get('status')=='running' and c['vmid'] != 304]; print(len(cts))"); + echo "Running VMs: $running_vms, Running LXCs: $running_lxcs"; + if [ "$running_vms" != "0" ] || [ "$running_lxcs" != "0" ]; then exit 1; fi + register: verify_stopped + + - name: Issue fire-and-forget reboot (controller will be killed) + ansible.builtin.shell: > + nohup bash -c 'sleep 10 && reboot' &>/dev/null & + echo "Reboot scheduled in 10 seconds" + register: reboot_issued + when: not ansible_check_mode + + - name: Log reboot issued + ansible.builtin.debug: + msg: "{{ reboot_issued.stdout }} — Ansible process will terminate when host reboots. Post-reboot startup handled by ansible-post-reboot.service on LXC 304." diff --git a/ansible/playbooks/post-reboot-startup.yml b/ansible/playbooks/post-reboot-startup.yml new file mode 100644 index 0000000..d05c77c --- /dev/null +++ b/ansible/playbooks/post-reboot-startup.yml @@ -0,0 +1,214 @@ +--- +# Post-Reboot Startup — Controlled Guest Startup After Proxmox Reboot +# +# Starts all guests in dependency order with staggered delays to avoid +# I/O storms. Runs automatically via ansible-post-reboot.service on +# LXC 304 after the Proxmox host reboots. +# +# Can also be run manually: +# ansible-playbook /opt/ansible/playbooks/post-reboot-startup.yml +# +# Note: VM 109 (homeassistant) is excluded from Ansible inventory +# (self-managed via HA Supervisor) but is included in pvesh start/stop. 
+ +- name: Wait for Proxmox API to be ready + hosts: pve-node + gather_facts: false + tags: [startup] + + tasks: + - name: Wait for Proxmox API + ansible.builtin.command: pvesh get /version --output-format json + register: pve_version + until: pve_version.rc == 0 + retries: 30 + delay: 10 + changed_when: false + + - name: Display Proxmox version + ansible.builtin.debug: + msg: "Proxmox API ready: {{ pve_version.stdout | from_json | json_query('version') | default('unknown') }}" + +- name: "Startup Tier 1 — Databases" + hosts: pve-node + gather_facts: false + tags: [startup] + + tasks: + - name: Start database VM (112) + ansible.builtin.command: pvesh create /nodes/proxmox/qemu/112/status/start + ignore_errors: true + + - name: Wait for VM 112 to be running + ansible.builtin.shell: > + pvesh get /nodes/proxmox/qemu/112/status/current --output-format json | + python3 -c "import sys,json; print(json.load(sys.stdin).get('status','unknown'))" + register: db_status + until: db_status.stdout.strip() == "running" + retries: 12 + delay: 5 + changed_when: false + + - name: Wait for database services to initialize + ansible.builtin.pause: + seconds: 30 + +- name: "Startup Tier 2 — Infrastructure" + hosts: pve-node + gather_facts: false + tags: [startup] + + vars: + tier2_vms: [106, 116] + tier2_lxcs: [225, 210, 227] + + tasks: + - name: Start Tier 2 VMs + ansible.builtin.command: pvesh create /nodes/proxmox/qemu/{{ item }}/status/start + loop: "{{ tier2_vms }}" + ignore_errors: true + + - name: Start Tier 2 LXCs + ansible.builtin.command: pvesh create /nodes/proxmox/lxc/{{ item }}/status/start + loop: "{{ tier2_lxcs }}" + ignore_errors: true + + - name: Wait for infrastructure to come up + ansible.builtin.pause: + seconds: 30 + +- name: "Startup Tier 3 — Applications" + hosts: pve-node + gather_facts: false + tags: [startup] + + vars: + tier3_vms: [115, 110] + tier3_lxcs: [301] + + tasks: + - name: Start Tier 3 VMs + ansible.builtin.command: pvesh create 
/nodes/proxmox/qemu/{{ item }}/status/start + loop: "{{ tier3_vms }}" + ignore_errors: true + + - name: Start Tier 3 LXCs + ansible.builtin.command: pvesh create /nodes/proxmox/lxc/{{ item }}/status/start + loop: "{{ tier3_lxcs }}" + ignore_errors: true + + - name: Wait for applications to start + ansible.builtin.pause: + seconds: 30 + + - name: Restart Pi-hole container via SSH (UDP DNS fix) + ansible.builtin.command: ssh docker-home "docker restart pihole" + ignore_errors: true + + - name: Wait for Pi-hole to stabilize + ansible.builtin.pause: + seconds: 10 + +- name: "Startup Tier 4 — Media & Others" + hosts: pve-node + gather_facts: false + tags: [startup] + + vars: + tier4_vms: [109] + tier4_lxcs: [221, 222, 223, 302] + + tasks: + - name: Start Tier 4 VMs + ansible.builtin.command: pvesh create /nodes/proxmox/qemu/{{ item }}/status/start + loop: "{{ tier4_vms }}" + ignore_errors: true + + - name: Start Tier 4 LXCs + ansible.builtin.command: pvesh create /nodes/proxmox/lxc/{{ item }}/status/start + loop: "{{ tier4_lxcs }}" + ignore_errors: true + +- name: Post-reboot validation + hosts: pve-node + gather_facts: false + tags: [startup, validate] + + tasks: + - name: Wait for all services to initialize + ansible.builtin.pause: + seconds: 60 + + - name: Check all expected VMs are running + ansible.builtin.shell: > + pvesh get /nodes/proxmox/qemu --output-format json | + python3 -c " + import sys, json + vms = json.load(sys.stdin) + expected = {106, 109, 110, 112, 115, 116} + running = {v['vmid'] for v in vms if v.get('status') == 'running'} + missing = expected - running + if missing: + print(f'WARN: VMs not running: {missing}') + sys.exit(1) + print(f'All expected VMs running: {running & expected}') + " + register: vm_check + ignore_errors: true + + - name: Check all expected LXCs are running + ansible.builtin.shell: > + pvesh get /nodes/proxmox/lxc --output-format json | + python3 -c " + import sys, json + cts = json.load(sys.stdin) + # LXC 303 (mcp-gateway) 
intentionally excluded — onboot=0, operator-managed + expected = {210, 221, 222, 223, 225, 227, 301, 302, 304} + running = {c['vmid'] for c in cts if c.get('status') == 'running'} + missing = expected - running + if missing: + print(f'WARN: LXCs not running: {missing}') + sys.exit(1) + print(f'All expected LXCs running: {running & expected}') + " + register: lxc_check + ignore_errors: true + + - name: Clean up old maintenance snapshots (older than 7 days) + ansible.builtin.shell: > + cutoff=$(date -d '7 days ago' +%s); + for vmid in $(pvesh get /nodes/proxmox/qemu --output-format json | + python3 -c "import sys,json; [print(v['vmid']) for v in json.load(sys.stdin)]"); do + for snap in $(pvesh get /nodes/proxmox/qemu/$vmid/snapshot --output-format json | + python3 -c "import sys,json; [print(s['name']) for s in json.load(sys.stdin) if s['name'].startswith('pre-maintenance-')]" 2>/dev/null); do + snap_date=$(echo $snap | sed 's/pre-maintenance-//'); + snap_epoch=$(date -d "$snap_date" +%s 2>/dev/null); + if [ -z "$snap_epoch" ]; then + echo "WARN: could not parse date for snapshot $snap on VM $vmid"; + elif [ "$snap_epoch" -lt "$cutoff" ]; then + pvesh delete /nodes/proxmox/qemu/$vmid/snapshot/$snap && echo "Deleted $snap from VM $vmid"; + fi + done + done; + for ctid in $(pvesh get /nodes/proxmox/lxc --output-format json | + python3 -c "import sys,json; [print(c['vmid']) for c in json.load(sys.stdin)]"); do + for snap in $(pvesh get /nodes/proxmox/lxc/$ctid/snapshot --output-format json | + python3 -c "import sys,json; [print(s['name']) for s in json.load(sys.stdin) if s['name'].startswith('pre-maintenance-')]" 2>/dev/null); do + snap_date=$(echo $snap | sed 's/pre-maintenance-//'); + snap_epoch=$(date -d "$snap_date" +%s 2>/dev/null); + if [ -z "$snap_epoch" ]; then + echo "WARN: could not parse date for snapshot $snap on LXC $ctid"; + elif [ "$snap_epoch" -lt "$cutoff" ]; then + pvesh delete /nodes/proxmox/lxc/$ctid/snapshot/$snap && echo "Deleted $snap from LXC 
$ctid"; + fi + done + done; + echo "Snapshot cleanup complete" + ignore_errors: true + + - name: Display validation results + ansible.builtin.debug: + msg: + - "VM status: {{ vm_check.stdout }}" + - "LXC status: {{ lxc_check.stdout }}" + - "Maintenance reboot complete — post-reboot startup finished" diff --git a/ansible/systemd/ansible-monthly-reboot.service b/ansible/systemd/ansible-monthly-reboot.service new file mode 100644 index 0000000..02b2db2 --- /dev/null +++ b/ansible/systemd/ansible-monthly-reboot.service @@ -0,0 +1,15 @@ +[Unit] +Description=Monthly Proxmox maintenance reboot (Ansible) +After=network-online.target +Wants=network-online.target + +[Service] +Type=oneshot +User=cal +WorkingDirectory=/opt/ansible +ExecStart=/usr/bin/ansible-playbook /opt/ansible/playbooks/monthly-reboot.yml +StandardOutput=append:/opt/ansible/logs/monthly-reboot.log +StandardError=append:/opt/ansible/logs/monthly-reboot.log +TimeoutStartSec=900 + +# No [Install] section — this service is activated exclusively by ansible-monthly-reboot.timer diff --git a/ansible/systemd/ansible-monthly-reboot.timer b/ansible/systemd/ansible-monthly-reboot.timer new file mode 100644 index 0000000..5711dda --- /dev/null +++ b/ansible/systemd/ansible-monthly-reboot.timer @@ -0,0 +1,13 @@ +[Unit] +Description=Monthly Proxmox maintenance reboot timer +Documentation=https://git.manticorum.com/cal/claude-home/src/branch/main/server-configs/proxmox/maintenance-reboot.md + +[Timer] +# First Sunday of the month at 08:00 UTC (3:00 AM ET during EDT) +# Day range 01-07 ensures it's always the first occurrence of that weekday +OnCalendar=Sun *-*-01..07 08:00:00 +Persistent=true +RandomizedDelaySec=600 + +[Install] +WantedBy=timers.target diff --git a/ansible/systemd/ansible-post-reboot.service b/ansible/systemd/ansible-post-reboot.service new file mode 100644 index 0000000..132ac6b --- /dev/null +++ b/ansible/systemd/ansible-post-reboot.service @@ -0,0 +1,21 @@ +[Unit] +Description=Post-reboot controlled 
guest startup (Ansible)
+After=network-online.target
+Wants=network-online.target
+# Only run once per boot — /run is tmpfs, so the stamp below clears at container boot
+ConditionPathExists=!/run/ansible-post-reboot.stamp
+[Service]
+Type=oneshot
+User=cal
+WorkingDirectory=/opt/ansible
+# Delay 120s to let Proxmox API stabilize and onboot guests settle
+ExecStartPre=/bin/sleep 120
+ExecStart=/usr/bin/ansible-playbook /opt/ansible/playbooks/post-reboot-startup.yml
+# "+" prefix: touch runs as root (User=cal cannot write /run); stamp set only on success
+ExecStartPost=+/usr/bin/touch /run/ansible-post-reboot.stamp
+StandardOutput=append:/opt/ansible/logs/post-reboot-startup.log
+StandardError=append:/opt/ansible/logs/post-reboot-startup.log
+TimeoutStartSec=1800
+[Install]
+# Runs automatically on every boot of LXC 304
+WantedBy=multi-user.target
diff --git a/server-configs/proxmox/maintenance-reboot.md b/server-configs/proxmox/maintenance-reboot.md
index 0c72d5a..36e63da 100644
--- a/server-configs/proxmox/maintenance-reboot.md
+++ b/server-configs/proxmox/maintenance-reboot.md
@@ -14,7 +14,7 @@ tags: [proxmox, maintenance, reboot, ansible, operations, systemd]
 |--------|-------|
 | **Schedule** | 1st Sunday of every month, 3:00 AM ET (08:00 UTC) |
 | **Expected downtime** | ~15 minutes (host reboot + VM/LXC startup) |
-| **Orchestration** | Ansible playbook on LXC 304 (ansible-controller) |
+| **Orchestration** | Ansible on LXC 304 — shutdown playbook → host reboot → post-reboot startup playbook |
 | **Calendar** | Google Calendar recurring event: "Proxmox Monthly Maintenance Reboot" |
 | **HA DNS** | ubuntu-manticore (10.10.0.226) provides Pi-hole 2 during Proxmox downtime |
 
@@ -24,16 +24,25 @@ tags: [proxmox, maintenance, reboot, ansible, operations, systemd]
 - Long uptimes allow memory leaks and process state drift (e.g., avahi busy-loops)
 - Validates that all VMs/LXCs auto-start cleanly with `onboot: 1`
 
+## Architecture
+
+The reboot is split into two playbooks because LXC 304 (the Ansible controller) is itself a guest on the Proxmox host being rebooted:
+
+1. 
**`monthly-reboot.yml`** — Snapshots all guests, shuts them down in dependency order, issues a fire-and-forget `reboot` to the Proxmox host, then exits. LXC 304 is killed when the host reboots. +2. **`post-reboot-startup.yml`** — After the host reboots, LXC 304 auto-starts via `onboot: 1`. A systemd service (`ansible-post-reboot.service`) waits 120 seconds for the Proxmox API to stabilize, then starts all guests in dependency order with staggered delays. + +The `onboot: 1` flag on all production guests acts as a safety net — even if the post-reboot playbook fails, Proxmox will start everything (though without controlled ordering). + ## Prerequisites (Before Maintenance) - [ ] Verify no active Tdarr transcodes on ubuntu-manticore - [ ] Verify no running database backups -- [ ] Switch workstation DNS to `1.1.1.1` (Pi-hole 1 on VM 106 will be offline) +- [ ] Ensure workstation has Pi-hole 2 (10.10.0.226) as a fallback DNS server so it fails over automatically during downtime - [ ] Confirm ubuntu-manticore Pi-hole 2 is healthy: `ssh manticore "docker exec pihole pihole status"` ## `onboot` Audit -All production VMs and LXCs must have `onboot: 1` so they restart automatically if the playbook fails mid-sequence. +All production VMs and LXCs must have `onboot: 1` so they restart automatically as a safety net. 
**Check VMs:** ```bash @@ -55,18 +64,18 @@ done" **Audit results (2026-04-03):** -| ID | Name | Type | `onboot` | Action needed | -|----|------|------|----------|---------------| +| ID | Name | Type | `onboot` | Status | +|----|------|------|----------|--------| | 106 | docker-home | VM | 1 | OK | -| 109 | homeassistant | VM | NOT SET | **Add `onboot: 1`** | +| 109 | homeassistant | VM | 1 | OK (fixed 2026-04-03) | | 110 | discord-bots | VM | 1 | OK | | 112 | databases-bots | VM | 1 | OK | | 115 | docker-sba | VM | 1 | OK | | 116 | docker-home-servers | VM | 1 | OK | | 210 | docker-n8n-lxc | LXC | 1 | OK | -| 221 | arr-stack | LXC | NOT SET | **Add `onboot: 1`** | +| 221 | arr-stack | LXC | 1 | OK (fixed 2026-04-03) | | 222 | memos | LXC | 1 | OK | -| 223 | foundry-lxc | LXC | NOT SET | **Add `onboot: 1`** | +| 223 | foundry-lxc | LXC | 1 | OK (fixed 2026-04-03) | | 225 | gitea | LXC | 1 | OK | | 227 | uptime-kuma | LXC | 1 | OK | | 301 | claude-discord-coordinator | LXC | 1 | OK | @@ -74,16 +83,15 @@ done" | 303 | mcp-gateway | LXC | 0 | Intentional (on-demand) | | 304 | ansible-controller | LXC | 1 | OK | -**Fix missing `onboot`:** +**If any production guest is missing `onboot: 1`:** ```bash -ssh proxmox "qm set 109 --onboot 1" -ssh proxmox "pct set 221 --onboot 1" -ssh proxmox "pct set 223 --onboot 1" +ssh proxmox "qm set --onboot 1" # for VMs +ssh proxmox "pct set --onboot 1" # for LXCs ``` ## Shutdown Order (Dependency-Aware) -Reverse of the validated startup sequence. Stop consumers before their dependencies. +Reverse of the validated startup sequence. Stop consumers before their dependencies. Each tier polls per-guest status rather than using fixed waits. 
``` Tier 4 — Media & Others (no downstream dependents) @@ -92,7 +100,6 @@ Tier 4 — Media & Others (no downstream dependents) LXC 222 memos LXC 223 foundry-lxc LXC 302 claude-runner - LXC 303 mcp-gateway (if running) Tier 3 — Applications (depend on databases + infra) VM 115 docker-sba (Paper Dynasty, Major Domo) @@ -107,21 +114,19 @@ Tier 2 — Infrastructure + DNS (depend on databases) VM 116 docker-home-servers Tier 1 — Databases (no dependencies, shut down last) - VM 112 databases-bots + VM 112 databases-bots (force-stop after 90s if ACPI ignored) -Tier 0 — Ansible controller shuts itself down last - LXC 304 ansible-controller - -→ Proxmox host reboots +→ LXC 304 issues fire-and-forget reboot to Proxmox host, then is killed ``` **Known quirks:** -- VM 112 (databases-bots) may ignore ACPI shutdown — use `--forceStop` after timeout +- VM 112 (databases-bots) may ignore ACPI shutdown — playbook force-stops after 90s - VM 109 (homeassistant) is self-managed via HA Supervisor, excluded from Ansible inventory +- LXC 303 (mcp-gateway) has `onboot: 0` and is operator-managed — not included in shutdown/startup. If it was running before maintenance, bring it up manually afterward ## Startup Order (Staggered) -After the Proxmox host reboots, guests with `onboot: 1` will auto-start. 
The Ansible playbook overrides this with a controlled sequence: +After the Proxmox host reboots, LXC 304 auto-starts and the `ansible-post-reboot.service` waits 120s before running the controlled startup: ``` Tier 1 — Databases first @@ -142,8 +147,8 @@ Tier 3 — Applications LXC 301 claude-discord-coordinator → wait 30s -Pi-hole fix — restart container to clear UDP DNS bug - qm guest exec 106 -- docker restart pihole +Pi-hole fix — restart container via SSH to clear UDP DNS bug + ssh docker-home "docker restart pihole" → wait 10s Tier 4 — Media & Others @@ -151,6 +156,7 @@ Tier 4 — Media & Others LXC 221 arr-stack LXC 222 memos LXC 223 foundry-lxc + LXC 302 claude-runner ``` ## Post-Reboot Validation @@ -161,28 +167,35 @@ Tier 4 — Media & Others - [ ] Discord bots responding (check Discord) - [ ] Uptime Kuma dashboard green: `curl -sf http://10.10.0.227:3001/api/status-page/homelab` - [ ] Home Assistant running: `curl -sf http://10.10.0.109:8123/api/ -H 'Authorization: Bearer '` -- [ ] Switch workstation DNS back from `1.1.1.1` to Pi-hole +- [ ] Maintenance snapshots cleaned up (auto, 7-day retention) ## Automation -### Ansible Playbook +### Ansible Playbooks -Located at `/opt/ansible/playbooks/monthly-reboot.yml` on LXC 304. +Both located at `/opt/ansible/playbooks/` on LXC 304. 
```bash -# Dry run (check mode) +# Dry run — shutdown only ssh ansible "ansible-playbook /opt/ansible/playbooks/monthly-reboot.yml --check" -# Manual execution +# Manual full execution — shutdown + reboot ssh ansible "ansible-playbook /opt/ansible/playbooks/monthly-reboot.yml" -# Limit to shutdown only (skip reboot) +# Manual post-reboot startup (if automatic startup failed) +ssh ansible "ansible-playbook /opt/ansible/playbooks/post-reboot-startup.yml" + +# Shutdown only — skip the host reboot ssh ansible "ansible-playbook /opt/ansible/playbooks/monthly-reboot.yml --tags shutdown" ``` -### Systemd Timer +### Systemd Units (on LXC 304) -The playbook runs automatically via systemd timer on LXC 304: +| Unit | Purpose | Schedule | +|------|---------|----------| +| `ansible-monthly-reboot.timer` | Triggers shutdown + reboot playbook | 1st Sunday of month, 08:00 UTC | +| `ansible-monthly-reboot.service` | Runs `monthly-reboot.yml` | Activated by timer | +| `ansible-post-reboot.service` | Runs `post-reboot-startup.yml` | On boot (multi-user.target), only if uptime < 10 min | ```bash # Check timer status @@ -191,10 +204,32 @@ ssh ansible "systemctl status ansible-monthly-reboot.timer" # Next scheduled run ssh ansible "systemctl list-timers ansible-monthly-reboot.timer" +# Check post-reboot service status +ssh ansible "systemctl status ansible-post-reboot.service" + # Disable for a month (e.g., during an incident) ssh ansible "systemctl stop ansible-monthly-reboot.timer" ``` +### Deployment (one-time setup on LXC 304) + +```bash +# Copy playbooks +scp ansible/playbooks/monthly-reboot.yml ansible:/opt/ansible/playbooks/ +scp ansible/playbooks/post-reboot-startup.yml ansible:/opt/ansible/playbooks/ + +# Copy and enable systemd units +scp ansible/systemd/ansible-monthly-reboot.timer ansible:/etc/systemd/system/ +scp ansible/systemd/ansible-monthly-reboot.service ansible:/etc/systemd/system/ +scp ansible/systemd/ansible-post-reboot.service ansible:/etc/systemd/system/ +ssh 
ansible "sudo systemctl daemon-reload && \ + sudo systemctl enable --now ansible-monthly-reboot.timer && \ + sudo systemctl enable ansible-post-reboot.service" + +# Verify SSH key access from LXC 304 to docker-home (needed for Pi-hole restart) +ssh ansible "ssh -o BatchMode=yes docker-home 'echo ok'" +``` + ## Rollback If a guest fails to start after reboot: @@ -202,6 +237,7 @@ If a guest fails to start after reboot: 2. Review guest logs: `ssh proxmox "journalctl -u pve-guests -n 50"` 3. Manual start: `ssh proxmox "pvesh create /nodes/proxmox/qemu//status/start"` 4. If guest is corrupted, restore from the pre-reboot Proxmox snapshot +5. If post-reboot startup failed entirely, run manually: `ssh ansible "ansible-playbook /opt/ansible/playbooks/post-reboot-startup.yml"` ## Related Documentation