8 changed files with 738 additions and 33 deletions
--- a/ansible/playbooks/monthly-reboot.yml
+++ b/ansible/playbooks/monthly-reboot.yml
@ -0,0 +1,265 @@
+---
+# Monthly Proxmox Maintenance Reboot — Shutdown & Reboot
+#
+# Orchestrates a graceful shutdown of all guests in dependency order,
+# then issues a fire-and-forget reboot to the Proxmox host.
+#
+# After the host reboots, LXC 304 auto-starts via onboot:1 and the
+# post-reboot-startup.yml playbook runs automatically via the
+# ansible-post-reboot.service systemd unit (enabled under multi-user.target, so it runs on every boot of LXC 304).
+#
+# Schedule: 1st Sunday of each month, 08:00 UTC (3 AM EST / 4 AM EDT)
+# Controller: LXC 304 (ansible-controller) at 10.10.0.232
+#
+# Usage:
+#   # Dry run
+#   ansible-playbook /opt/ansible/playbooks/monthly-reboot.yml --check
+#
+#   # Full execution
+#   ansible-playbook /opt/ansible/playbooks/monthly-reboot.yml
+#
+#   # Shutdown only (skip the host reboot)
+#   ansible-playbook /opt/ansible/playbooks/monthly-reboot.yml --tags shutdown
+#
+# Note: VM 109 (homeassistant) is excluded from Ansible inventory
+# (self-managed via HA Supervisor) but is included in pvesh start/stop.
+
+- name: Pre-reboot health check and snapshots
+  hosts: pve-node
+  gather_facts: false
+  tags:
+    - pre-reboot
+    - shutdown
+
+  tasks:
+    # Read-only probe. There is no ignore_errors here, so a non-zero exit
+    # from pvesh stops the run before any guest is touched.
+    - name: Check Proxmox cluster health
+      ansible.builtin.command: pvesh get /cluster/status --output-format json
+      changed_when: false
+      register: cluster_status
+
+    # Emits one VMID per line for every QEMU guest whose status is "running".
+    - name: Get list of running QEMU VMs
+      ansible.builtin.shell: >
+        pvesh get /nodes/proxmox/qemu --output-format json |
+        python3 -c "import sys,json; [print(vm['vmid']) for vm in json.load(sys.stdin) if vm.get('status')=='running']"
+      changed_when: false
+      register: running_vms
+
+    # Same listing for LXC containers.
+    - name: Get list of running LXC containers
+      ansible.builtin.shell: >
+        pvesh get /nodes/proxmox/lxc --output-format json |
+        python3 -c "import sys,json; [print(ct['vmid']) for ct in json.load(sys.stdin) if ct.get('status')=='running']"
+      changed_when: false
+      register: running_lxcs
+
+    - name: Display running guests
+      ansible.builtin.debug:
+        msg: "Running VMs: {{ running_vms.stdout_lines }} | Running LXCs: {{ running_lxcs.stdout_lines }}"
+
+    # Snapshots are best-effort: a failure on one guest is reported but does
+    # not stop the run. The date in the snapshot name comes from a pipe
+    # lookup, which Ansible evaluates on the control node.
+    - name: Snapshot running VMs
+      ansible.builtin.command: >
+        pvesh create /nodes/proxmox/qemu/{{ item }}/snapshot
+        --snapname pre-maintenance-{{ lookup('pipe', 'date +%Y-%m-%d') }}
+        --description "Auto snapshot before monthly maintenance reboot"
+      ignore_errors: true
+      when: running_vms.stdout_lines | length > 0
+      loop: "{{ running_vms.stdout_lines }}"
+
+    - name: Snapshot running LXCs
+      ansible.builtin.command: >
+        pvesh create /nodes/proxmox/lxc/{{ item }}/snapshot
+        --snapname pre-maintenance-{{ lookup('pipe', 'date +%Y-%m-%d') }}
+        --description "Auto snapshot before monthly maintenance reboot"
+      ignore_errors: true
+      when: running_lxcs.stdout_lines | length > 0
+      loop: "{{ running_lxcs.stdout_lines }}"
+
+- name: "Shutdown Tier 4 — Media & Others"
+  hosts: pve-node
+  gather_facts: false
+  tags: [shutdown]
+
+  vars:
+    tier4_vms: [109]
+    # LXC 303 (mcp-gateway) is onboot=0 and operator-managed — not included here
+    tier4_lxcs: [221, 222, 223, 302]
+
+  tasks:
+    # Shutdown requests are best-effort: an error on one guest is reported
+    # but does not stop the remaining guests from being processed.
+    - name: Shutdown Tier 4 VMs
+      ansible.builtin.command: pvesh create /nodes/proxmox/qemu/{{ item }}/status/shutdown
+      loop: "{{ tier4_vms }}"
+      ignore_errors: true
+
+    - name: Shutdown Tier 4 LXCs
+      ansible.builtin.command: pvesh create /nodes/proxmox/lxc/{{ item }}/status/shutdown
+      loop: "{{ tier4_lxcs }}"
+      ignore_errors: true
+
+    # Poll each guest's status (12 retries, 5s apart — roughly 60s per guest).
+    # The poll only reads state, so it is marked as never changing anything.
+    - name: Wait for Tier 4 VMs to stop
+      ansible.builtin.shell: >
+        pvesh get /nodes/proxmox/qemu/{{ item }}/status/current --output-format json |
+        python3 -c "import sys,json; print(json.load(sys.stdin).get('status','unknown'))"
+      register: t4_vm_status
+      until: t4_vm_status.stdout.strip() == "stopped"
+      retries: 12
+      delay: 5
+      loop: "{{ tier4_vms }}"
+      changed_when: false
+      ignore_errors: true
+
+    - name: Wait for Tier 4 LXCs to stop
+      ansible.builtin.shell: >
+        pvesh get /nodes/proxmox/lxc/{{ item }}/status/current --output-format json |
+        python3 -c "import sys,json; print(json.load(sys.stdin).get('status','unknown'))"
+      register: t4_lxc_status
+      until: t4_lxc_status.stdout.strip() == "stopped"
+      retries: 12
+      delay: 5
+      loop: "{{ tier4_lxcs }}"
+      changed_when: false
+      ignore_errors: true
+
+- name: "Shutdown Tier 3 — Applications"
+  hosts: pve-node
+  gather_facts: false
+  tags: [shutdown]
+
+  vars:
+    tier3_vms: [115, 110]
+    tier3_lxcs: [301]
+
+  tasks:
+    # Shutdown requests are best-effort: an error on one guest is reported
+    # but does not stop the remaining guests from being processed.
+    - name: Shutdown Tier 3 VMs
+      ansible.builtin.command: pvesh create /nodes/proxmox/qemu/{{ item }}/status/shutdown
+      loop: "{{ tier3_vms }}"
+      ignore_errors: true
+
+    - name: Shutdown Tier 3 LXCs
+      ansible.builtin.command: pvesh create /nodes/proxmox/lxc/{{ item }}/status/shutdown
+      loop: "{{ tier3_lxcs }}"
+      ignore_errors: true
+
+    # Poll each guest's status (12 retries, 5s apart — roughly 60s per guest).
+    # The poll only reads state, so it is marked as never changing anything.
+    - name: Wait for Tier 3 VMs to stop
+      ansible.builtin.shell: >
+        pvesh get /nodes/proxmox/qemu/{{ item }}/status/current --output-format json |
+        python3 -c "import sys,json; print(json.load(sys.stdin).get('status','unknown'))"
+      register: t3_vm_status
+      until: t3_vm_status.stdout.strip() == "stopped"
+      retries: 12
+      delay: 5
+      loop: "{{ tier3_vms }}"
+      changed_when: false
+      ignore_errors: true
+
+    - name: Wait for Tier 3 LXCs to stop
+      ansible.builtin.shell: >
+        pvesh get /nodes/proxmox/lxc/{{ item }}/status/current --output-format json |
+        python3 -c "import sys,json; print(json.load(sys.stdin).get('status','unknown'))"
+      register: t3_lxc_status
+      until: t3_lxc_status.stdout.strip() == "stopped"
+      retries: 12
+      delay: 5
+      loop: "{{ tier3_lxcs }}"
+      changed_when: false
+      ignore_errors: true
+
+- name: "Shutdown Tier 2 — Infrastructure"
+  hosts: pve-node
+  gather_facts: false
+  tags: [shutdown]
+
+  vars:
+    tier2_vms: [106, 116]
+    tier2_lxcs: [225, 210, 227]
+
+  tasks:
+    # Shutdown requests are best-effort: an error on one guest is reported
+    # but does not stop the remaining guests from being processed.
+    - name: Shutdown Tier 2 VMs
+      ansible.builtin.command: pvesh create /nodes/proxmox/qemu/{{ item }}/status/shutdown
+      loop: "{{ tier2_vms }}"
+      ignore_errors: true
+
+    - name: Shutdown Tier 2 LXCs
+      ansible.builtin.command: pvesh create /nodes/proxmox/lxc/{{ item }}/status/shutdown
+      loop: "{{ tier2_lxcs }}"
+      ignore_errors: true
+
+    # Poll each guest's status (12 retries, 5s apart — roughly 60s per guest).
+    # The poll only reads state, so it is marked as never changing anything.
+    - name: Wait for Tier 2 VMs to stop
+      ansible.builtin.shell: >
+        pvesh get /nodes/proxmox/qemu/{{ item }}/status/current --output-format json |
+        python3 -c "import sys,json; print(json.load(sys.stdin).get('status','unknown'))"
+      register: t2_vm_status
+      until: t2_vm_status.stdout.strip() == "stopped"
+      retries: 12
+      delay: 5
+      loop: "{{ tier2_vms }}"
+      changed_when: false
+      ignore_errors: true
+
+    - name: Wait for Tier 2 LXCs to stop
+      ansible.builtin.shell: >
+        pvesh get /nodes/proxmox/lxc/{{ item }}/status/current --output-format json |
+        python3 -c "import sys,json; print(json.load(sys.stdin).get('status','unknown'))"
+      register: t2_lxc_status
+      until: t2_lxc_status.stdout.strip() == "stopped"
+      retries: 12
+      delay: 5
+      loop: "{{ tier2_lxcs }}"
+      changed_when: false
+      ignore_errors: true
+
+- name: "Shutdown Tier 1 — Databases"
+  hosts: pve-node
+  gather_facts: false
+  tags: [shutdown]
+
+  vars:
+    tier1_vms: [112]
+
+  tasks:
+    # Graceful shutdown request; errors are ignored because the force-stop
+    # task below is the fallback.
+    - name: Shutdown database VMs
+      ansible.builtin.command: pvesh create /nodes/proxmox/qemu/{{ item }}/status/shutdown
+      loop: "{{ tier1_vms }}"
+      ignore_errors: true
+
+    # 18 retries, 5s apart — roughly 90s. Read-only, so never "changed".
+    - name: Wait for database VMs to stop (up to 90s)
+      ansible.builtin.shell: >
+        pvesh get /nodes/proxmox/qemu/{{ item }}/status/current --output-format json |
+        python3 -c "import sys,json; print(json.load(sys.stdin).get('status','unknown'))"
+      register: t1_vm_status
+      until: t1_vm_status.stdout.strip() == "stopped"
+      retries: 18
+      delay: 5
+      loop: "{{ tier1_vms }}"
+      changed_when: false
+      ignore_errors: true
+
+    # Hard-stop any database VM that is still "running" after the wait.
+    # changed_when is evaluated once per loop item, and at that point the
+    # registered variable holds that single item's result (there is no
+    # ".results" list yet), so the per-item stdout is inspected directly.
+    - name: Force stop database VMs if still running
+      ansible.builtin.shell: >
+        status=$(pvesh get /nodes/proxmox/qemu/{{ item }}/status/current --output-format json |
+        python3 -c "import sys,json; print(json.load(sys.stdin).get('status','unknown'))");
+        if [ "$status" = "running" ]; then
+          pvesh create /nodes/proxmox/qemu/{{ item }}/status/stop;
+          echo "Force stopped VM {{ item }}";
+        else
+          echo "VM {{ item }} already stopped";
+        fi
+      loop: "{{ tier1_vms }}"
+      register: force_stop_result
+      changed_when: "'Force stopped' in (force_stop_result.stdout | default(''))"
+
+- name: "Verify and reboot Proxmox host"
+  hosts: pve-node
+  gather_facts: false
+  tags: [reboot]
+
+  tasks:
+    # Safety gate: exits 1 (failing the play, so no reboot is issued) if any
+    # VM, or any LXC other than 304, is still running. Read-only, so it is
+    # never reported as a change.
+    - name: Verify all guests are stopped (excluding LXC 304)
+      ansible.builtin.shell: >
+        running_vms=$(pvesh get /nodes/proxmox/qemu --output-format json |
+        python3 -c "import sys,json; vms=[v for v in json.load(sys.stdin) if v.get('status')=='running']; print(len(vms))");
+        running_lxcs=$(pvesh get /nodes/proxmox/lxc --output-format json |
+        python3 -c "import sys,json; cts=[c for c in json.load(sys.stdin) if c.get('status')=='running' and c['vmid'] != 304]; print(len(cts))");
+        echo "Running VMs: $running_vms, Running LXCs: $running_lxcs";
+        if [ "$running_vms" != "0" ] || [ "$running_lxcs" != "0" ]; then exit 1; fi
+      register: verify_stopped
+      changed_when: false
+
+    # The reboot is detached and delayed 10s so this task can return before
+    # the host goes down. The shell module runs /bin/sh, so the POSIX
+    # redirection form is used instead of the bash-only "&>".
+    - name: Issue fire-and-forget reboot (controller will be killed)
+      ansible.builtin.shell: >
+        nohup bash -c 'sleep 10 && reboot' > /dev/null 2>&1 &
+        echo "Reboot scheduled in 10 seconds"
+      register: reboot_issued
+      when: not ansible_check_mode
+
+    # Skipped together with the reboot task (e.g. in check mode), because a
+    # skipped task registers no stdout to print.
+    - name: Log reboot issued
+      ansible.builtin.debug:
+        msg: "{{ reboot_issued.stdout }} — Ansible process will terminate when host reboots. Post-reboot startup handled by ansible-post-reboot.service on LXC 304."
+      when: reboot_issued is not skipped
--- a/ansible/playbooks/post-reboot-startup.yml
+++ b/ansible/playbooks/post-reboot-startup.yml
@ -0,0 +1,214 @@
+---
+# Post-Reboot Startup — Controlled Guest Startup After Proxmox Reboot
+#
+# Starts all guests in dependency order with staggered delays to avoid
+# I/O storms. Runs automatically via ansible-post-reboot.service on
+# LXC 304 after the Proxmox host reboots.
+#
+# Can also be run manually:
+#   ansible-playbook /opt/ansible/playbooks/post-reboot-startup.yml
+#
+# Note: VM 109 (homeassistant) is excluded from Ansible inventory
+# (self-managed via HA Supervisor) but is included in pvesh start/stop.
+
+- name: Wait for Proxmox API to be ready
+  hosts: pve-node
+  gather_facts: false
+  tags: [startup]
+
+  tasks:
+    # Retry until pvesh answers (30 retries, 10s apart — roughly 5 minutes).
+    - name: Wait for Proxmox API
+      ansible.builtin.command: pvesh get /version --output-format json
+      register: pve_version
+      until: pve_version.rc == 0
+      retries: 30
+      delay: 10
+      changed_when: false
+
+    # Purely informational. The version is read with plain attribute access
+    # on the parsed JSON, so this message does not depend on the json_query
+    # filter, and it is skipped when there is no output to parse.
+    - name: Display Proxmox version
+      ansible.builtin.debug:
+        msg: "Proxmox API ready: {{ (pve_version.stdout | from_json).version | default('unknown') }}"
+      when: pve_version.stdout | default('') | trim | length > 0
+
+- name: "Startup Tier 1 — Databases"
+  hosts: pve-node
+  gather_facts: false
+  tags:
+    - startup
+
+  tasks:
+    # An error from the start request is ignored; the status poll below is
+    # what decides whether the play continues.
+    - name: Start database VM (112)
+      ignore_errors: true
+      ansible.builtin.command: pvesh create /nodes/proxmox/qemu/112/status/start
+
+    # 12 retries, 5s apart. No ignore_errors here, so the play fails if the
+    # VM never reports "running".
+    - name: Wait for VM 112 to be running
+      ansible.builtin.shell: >
+        pvesh get /nodes/proxmox/qemu/112/status/current --output-format json |
+        python3 -c "import sys,json; print(json.load(sys.stdin).get('status','unknown'))"
+      changed_when: false
+      register: db_status
+      until: db_status.stdout.strip() == "running"
+      delay: 5
+      retries: 12
+
+    # Fixed grace period before the next tier is started.
+    - name: Wait for database services to initialize
+      ansible.builtin.pause:
+        seconds: 30
+
+- name: "Startup Tier 2 — Infrastructure"
+  hosts: pve-node
+  gather_facts: false
+  tags:
+    - startup
+
+  vars:
+    tier2_lxcs:
+      - 225
+      - 210
+      - 227
+    tier2_vms:
+      - 106
+      - 116
+
+  tasks:
+    # Start requests are best-effort; an error on one guest does not stop
+    # the others from being started.
+    - name: Start Tier 2 VMs
+      ignore_errors: true
+      loop: "{{ tier2_vms }}"
+      ansible.builtin.command: pvesh create /nodes/proxmox/qemu/{{ item }}/status/start
+
+    - name: Start Tier 2 LXCs
+      ignore_errors: true
+      loop: "{{ tier2_lxcs }}"
+      ansible.builtin.command: pvesh create /nodes/proxmox/lxc/{{ item }}/status/start
+
+    # Fixed grace period before the next tier is started.
+    - name: Wait for infrastructure to come up
+      ansible.builtin.pause:
+        seconds: 30
+
+- name: "Startup Tier 3 — Applications"
+  hosts: pve-node
+  gather_facts: false
+  tags: [startup]
+
+  vars:
+    tier3_vms: [115, 110]
+    tier3_lxcs: [301]
+
+  tasks:
+    # Start requests are best-effort; an error on one guest does not stop
+    # the others from being started.
+    - name: Start Tier 3 VMs
+      ansible.builtin.command: pvesh create /nodes/proxmox/qemu/{{ item }}/status/start
+      loop: "{{ tier3_vms }}"
+      ignore_errors: true
+
+    - name: Start Tier 3 LXCs
+      ansible.builtin.command: pvesh create /nodes/proxmox/lxc/{{ item }}/status/start
+      loop: "{{ tier3_lxcs }}"
+      ignore_errors: true
+
+    - name: Wait for applications to start
+      ansible.builtin.pause:
+        seconds: 30
+
+    # The SSH hop to docker-home must originate from the Ansible controller,
+    # which is where the docker-home key access is provisioned, not from
+    # pve-node, hence delegate_to. BatchMode and ConnectTimeout make the
+    # call fail fast instead of hanging on a password or host-key prompt.
+    - name: Restart Pi-hole container via SSH (UDP DNS fix)
+      ansible.builtin.command: ssh -o BatchMode=yes -o ConnectTimeout=10 docker-home "docker restart pihole"
+      delegate_to: localhost
+      ignore_errors: true
+
+    - name: Wait for Pi-hole to stabilize
+      ansible.builtin.pause:
+        seconds: 10
+
+- name: "Startup Tier 4 — Media & Others"
+  hosts: pve-node
+  gather_facts: false
+  tags:
+    - startup
+
+  vars:
+    tier4_lxcs:
+      - 221
+      - 222
+      - 223
+      - 302
+    tier4_vms:
+      - 109
+
+  tasks:
+    # Last tier: start requests are best-effort and nothing waits on them
+    # inside this play.
+    - name: Start Tier 4 VMs
+      ignore_errors: true
+      loop: "{{ tier4_vms }}"
+      ansible.builtin.command: pvesh create /nodes/proxmox/qemu/{{ item }}/status/start
+
+    - name: Start Tier 4 LXCs
+      ignore_errors: true
+      loop: "{{ tier4_lxcs }}"
+      ansible.builtin.command: pvesh create /nodes/proxmox/lxc/{{ item }}/status/start
+
+- name: Post-reboot validation
+  hosts: pve-node
+  gather_facts: false
+  tags: [startup, validate]
+
+  tasks:
+    - name: Wait for all services to initialize
+      ansible.builtin.pause:
+        seconds: 60
+
+    # The embedded Python is multi-line, so a literal block scalar (|) is
+    # required: a folded scalar (>) would join the same-indent lines with
+    # spaces and hand python3 a single invalid line.
+    # Exits 1 when an expected VM is not running; the failure is ignored so
+    # the remaining checks and the summary still run.
+    - name: Check all expected VMs are running
+      ansible.builtin.shell: |
+        pvesh get /nodes/proxmox/qemu --output-format json |
+        python3 -c "
+        import sys, json
+        vms = json.load(sys.stdin)
+        expected = {106, 109, 110, 112, 115, 116}
+        running = {v['vmid'] for v in vms if v.get('status') == 'running'}
+        missing = expected - running
+        if missing:
+            print(f'WARN: VMs not running: {missing}')
+            sys.exit(1)
+        print(f'All expected VMs running: {running & expected}')
+        "
+      register: vm_check
+      changed_when: false
+      ignore_errors: true
+
+    # Same check for containers, also as a literal block scalar.
+    - name: Check all expected LXCs are running
+      ansible.builtin.shell: |
+        pvesh get /nodes/proxmox/lxc --output-format json |
+        python3 -c "
+        import sys, json
+        cts = json.load(sys.stdin)
+        # LXC 303 (mcp-gateway) intentionally excluded — onboot=0, operator-managed
+        expected = {210, 221, 222, 223, 225, 227, 301, 302, 304}
+        running = {c['vmid'] for c in cts if c.get('status') == 'running'}
+        missing = expected - running
+        if missing:
+            print(f'WARN: LXCs not running: {missing}')
+            sys.exit(1)
+        print(f'All expected LXCs running: {running & expected}')
+        "
+      register: lxc_check
+      changed_when: false
+      ignore_errors: true
+
+    # For every VM and LXC, delete snapshots named pre-maintenance-<date>
+    # whose <date> is more than 7 days old. Names whose date cannot be
+    # parsed are reported and left alone. Reported as changed only when at
+    # least one snapshot was actually deleted.
+    - name: Clean up old maintenance snapshots (older than 7 days)
+      ansible.builtin.shell: >
+        cutoff=$(date -d '7 days ago' +%s);
+        for vmid in $(pvesh get /nodes/proxmox/qemu --output-format json |
+          python3 -c "import sys,json; [print(v['vmid']) for v in json.load(sys.stdin)]"); do
+          for snap in $(pvesh get /nodes/proxmox/qemu/$vmid/snapshot --output-format json |
+            python3 -c "import sys,json; [print(s['name']) for s in json.load(sys.stdin) if s['name'].startswith('pre-maintenance-')]" 2>/dev/null); do
+            snap_date=$(echo $snap | sed 's/pre-maintenance-//');
+            snap_epoch=$(date -d "$snap_date" +%s 2>/dev/null);
+            if [ -z "$snap_epoch" ]; then
+              echo "WARN: could not parse date for snapshot $snap on VM $vmid";
+            elif [ "$snap_epoch" -lt "$cutoff" ]; then
+              pvesh delete /nodes/proxmox/qemu/$vmid/snapshot/$snap && echo "Deleted $snap from VM $vmid";
+            fi
+          done
+        done;
+        for ctid in $(pvesh get /nodes/proxmox/lxc --output-format json |
+          python3 -c "import sys,json; [print(c['vmid']) for c in json.load(sys.stdin)]"); do
+          for snap in $(pvesh get /nodes/proxmox/lxc/$ctid/snapshot --output-format json |
+            python3 -c "import sys,json; [print(s['name']) for s in json.load(sys.stdin) if s['name'].startswith('pre-maintenance-')]" 2>/dev/null); do
+            snap_date=$(echo $snap | sed 's/pre-maintenance-//');
+            snap_epoch=$(date -d "$snap_date" +%s 2>/dev/null);
+            if [ -z "$snap_epoch" ]; then
+              echo "WARN: could not parse date for snapshot $snap on LXC $ctid";
+            elif [ "$snap_epoch" -lt "$cutoff" ]; then
+              pvesh delete /nodes/proxmox/lxc/$ctid/snapshot/$snap && echo "Deleted $snap from LXC $ctid";
+            fi
+          done
+        done;
+        echo "Snapshot cleanup complete"
+      register: snapshot_cleanup
+      changed_when: "'Deleted ' in (snapshot_cleanup.stdout | default(''))"
+      ignore_errors: true
+
+    - name: Display validation results
+      ansible.builtin.debug:
+        msg:
+          - "VM status: {{ vm_check.stdout }}"
+          - "LXC status: {{ lxc_check.stdout }}"
+          - "Maintenance reboot complete — post-reboot startup finished"
--- a/ansible/systemd/ansible-monthly-reboot.service
+++ b/ansible/systemd/ansible-monthly-reboot.service
@ -0,0 +1,15 @@
+[Unit]
+Description=Monthly Proxmox maintenance reboot (Ansible)
+# Pull in network-online.target and order this unit after it.
+Wants=network-online.target
+After=network-online.target
+
+[Service]
+# oneshot: the unit stays in "activating" until ansible-playbook exits.
+Type=oneshot
+User=cal
+WorkingDirectory=/opt/ansible
+ExecStart=/usr/bin/ansible-playbook /opt/ansible/playbooks/monthly-reboot.yml
+# Give up if the playbook has not finished within 900 seconds.
+TimeoutStartSec=900
+# stdout and stderr are both appended to the same log file.
+StandardOutput=append:/opt/ansible/logs/monthly-reboot.log
+StandardError=append:/opt/ansible/logs/monthly-reboot.log
+
+# No [Install] section — this service is activated exclusively by ansible-monthly-reboot.timer
--- a/ansible/systemd/ansible-monthly-reboot.timer
+++ b/ansible/systemd/ansible-monthly-reboot.timer
@ -0,0 +1,13 @@
+[Unit]
+Description=Monthly Proxmox maintenance reboot timer
+Documentation=https://git.manticorum.com/cal/claude-home/src/branch/main/server-configs/proxmox/maintenance-reboot.md
+
+[Timer]
+# First Sunday of the month at 08:00 UTC (03:00 EST / 04:00 EDT).
+# The Sun weekday filter combined with the 01..07 day range matches exactly
+# one day per month: the first Sunday.
+# The explicit "UTC" suffix pins the schedule; without it systemd evaluates
+# OnCalendar in the machine's local time zone.
+OnCalendar=Sun *-*-01..07 08:00:00 UTC
+# Run a missed trigger at the next boot if the timer was inactive at the scheduled time.
+Persistent=true
+# Delay each trigger by a random amount of up to 10 minutes.
+RandomizedDelaySec=600
+
+[Install]
+WantedBy=timers.target
--- a/ansible/systemd/ansible-post-reboot.service
+++ b/ansible/systemd/ansible-post-reboot.service
@ -0,0 +1,21 @@
+[Unit]
+Description=Post-reboot controlled guest startup (Ansible)
+After=network-online.target
+Wants=network-online.target
+
+[Service]
+Type=oneshot
+User=cal
+WorkingDirectory=/opt/ansible
+# Only run shortly after boot (uptime below 600s) — not on a later manual
+# start or restart. systemd has no uptime-based Condition*= directive, so
+# the guard is an ExecCondition: awk exits 1 when the first field of
+# /proc/uptime is >= 600, and an exit code of 1-254 makes systemd skip the
+# unit without marking it failed. "$$" is the unit-file escape for a
+# literal "$".
+# NOTE(review): inside an LXC, /proc/uptime is presumably the container's
+# own uptime (lxcfs) — confirm on LXC 304.
+ExecCondition=/usr/bin/awk '{ exit ($$1 >= 600) }' /proc/uptime
+# Delay 120s to let Proxmox API stabilize and onboot guests settle
+ExecStartPre=/bin/sleep 120
+ExecStart=/usr/bin/ansible-playbook /opt/ansible/playbooks/post-reboot-startup.yml
+StandardOutput=append:/opt/ansible/logs/post-reboot-startup.log
+StandardError=append:/opt/ansible/logs/post-reboot-startup.log
+TimeoutStartSec=1800
+
+[Install]
+# Runs automatically on every boot of LXC 304
+WantedBy=multi-user.target
--- a/server-configs/proxmox/maintenance-reboot.md
+++ b/server-configs/proxmox/maintenance-reboot.md
@ -14,7 +14,7 @@ tags: [proxmox, maintenance, reboot, ansible, operations, systemd]
 |--------|-------|
 | **Schedule** | 1st Sunday of every month, 3:00 AM ET (08:00 UTC) |
 | **Expected downtime** | ~15 minutes (host reboot + VM/LXC startup) |
-| **Orchestration** | Ansible playbook on LXC 304 (ansible-controller) |
+| **Orchestration** | Ansible on LXC 304 — shutdown playbook → host reboot → post-reboot startup playbook |
 | **Calendar** | Google Calendar recurring event: "Proxmox Monthly Maintenance Reboot" |
 | **HA DNS** | ubuntu-manticore (10.10.0.226) provides Pi-hole 2 during Proxmox downtime |

@ -24,16 +24,25 @@ tags: [proxmox, maintenance, reboot, ansible, operations, systemd]
 - Long uptimes allow memory leaks and process state drift (e.g., avahi busy-loops)
 - Validates that all VMs/LXCs auto-start cleanly with `onboot: 1`

+## Architecture
+
+The reboot is split into two playbooks because LXC 304 (the Ansible controller) is itself a guest on the Proxmox host being rebooted:
+
+1. **`monthly-reboot.yml`** — Snapshots all guests, shuts them down in dependency order, issues a fire-and-forget `reboot` to the Proxmox host, then exits. LXC 304 is killed when the host reboots.
+2. **`post-reboot-startup.yml`** — After the host reboots, LXC 304 auto-starts via `onboot: 1`. A systemd service (`ansible-post-reboot.service`) waits 120 seconds for the Proxmox API to stabilize, then starts all guests in dependency order with staggered delays.
+
+The `onboot: 1` flag on all production guests acts as a safety net — even if the post-reboot playbook fails, Proxmox will start everything (though without controlled ordering).
+
 ## Prerequisites (Before Maintenance)

 - [ ] Verify no active Tdarr transcodes on ubuntu-manticore
 - [ ] Verify no running database backups
- [ ] Switch workstation DNS to `1.1.1.1` (Pi-hole 1 on VM 106 will be offline)
+- [ ] Ensure workstation has Pi-hole 2 (10.10.0.226) as a fallback DNS server so it fails over automatically during downtime
 - [ ] Confirm ubuntu-manticore Pi-hole 2 is healthy: `ssh manticore "docker exec pihole pihole status"`

 ## `onboot` Audit

-All production VMs and LXCs must have `onboot: 1` so they restart automatically if the playbook fails mid-sequence.
+All production VMs and LXCs must have `onboot: 1` so they restart automatically as a safety net.

 **Check VMs:**
 ```bash
@ -55,18 +64,18 @@ done"

 **Audit results (2026-04-03):**

-| ID | Name | Type | `onboot` | Action needed |
-|----|------|------|----------|---------------|
+| ID | Name | Type | `onboot` | Status |
+|----|------|------|----------|--------|
 | 106 | docker-home | VM | 1 | OK |
-| 109 | homeassistant | VM | NOT SET | **Add `onboot: 1`** |
+| 109 | homeassistant | VM | 1 | OK (fixed 2026-04-03) |
 | 110 | discord-bots | VM | 1 | OK |
 | 112 | databases-bots | VM | 1 | OK |
 | 115 | docker-sba | VM | 1 | OK |
 | 116 | docker-home-servers | VM | 1 | OK |
 | 210 | docker-n8n-lxc | LXC | 1 | OK |
-| 221 | arr-stack | LXC | NOT SET | **Add `onboot: 1`** |
+| 221 | arr-stack | LXC | 1 | OK (fixed 2026-04-03) |
 | 222 | memos | LXC | 1 | OK |
-| 223 | foundry-lxc | LXC | NOT SET | **Add `onboot: 1`** |
+| 223 | foundry-lxc | LXC | 1 | OK (fixed 2026-04-03) |
 | 225 | gitea | LXC | 1 | OK |
 | 227 | uptime-kuma | LXC | 1 | OK |
 | 301 | claude-discord-coordinator | LXC | 1 | OK |
@ -74,16 +83,15 @@ done"
 | 303 | mcp-gateway | LXC | 0 | Intentional (on-demand) |
 | 304 | ansible-controller | LXC | 1 | OK |

-**Fix missing `onboot`:**
+**If any production guest is missing `onboot: 1`:**
 ```bash
-ssh proxmox "qm set 109 --onboot 1"
-ssh proxmox "pct set 221 --onboot 1"
-ssh proxmox "pct set 223 --onboot 1"
+ssh proxmox "qm set <VMID> --onboot 1"   # for VMs
+ssh proxmox "pct set <CTID> --onboot 1"   # for LXCs
 ```

 ## Shutdown Order (Dependency-Aware)

-Reverse of the validated startup sequence. Stop consumers before their dependencies.
+Reverse of the validated startup sequence. Stop consumers before their dependencies. Each tier polls per-guest status rather than using fixed waits.

 ```
 Tier 4 — Media & Others (no downstream dependents)
@ -92,7 +100,6 @@ Tier 4 — Media & Others (no downstream dependents)
  LXC 222 memos
  LXC 223 foundry-lxc
  LXC 302 claude-runner
-  LXC 303 mcp-gateway (if running)

 Tier 3 — Applications (depend on databases + infra)
  VM 115  docker-sba (Paper Dynasty, Major Domo)
@ -107,21 +114,19 @@ Tier 2 — Infrastructure + DNS (depend on databases)
  VM 116  docker-home-servers

 Tier 1 — Databases (no dependencies, shut down last)
-  VM 112  databases-bots
+  VM 112  databases-bots (force-stop after 90s if ACPI ignored)

-Tier 0 — Ansible controller shuts itself down last
-  LXC 304 ansible-controller
-
-→ Proxmox host reboots
+→ LXC 304 issues fire-and-forget reboot to Proxmox host, then is killed
 ```

 **Known quirks:**
- VM 112 (databases-bots) may ignore ACPI shutdown — use `--forceStop` after timeout
+- VM 112 (databases-bots) may ignore ACPI shutdown — playbook force-stops after 90s
 - VM 109 (homeassistant) is self-managed via HA Supervisor, excluded from Ansible inventory
+- LXC 303 (mcp-gateway) has `onboot: 0` and is operator-managed — not included in shutdown/startup. If it was running before maintenance, bring it up manually afterward

 ## Startup Order (Staggered)

-After the Proxmox host reboots, guests with `onboot: 1` will auto-start. The Ansible playbook overrides this with a controlled sequence:
+After the Proxmox host reboots, LXC 304 auto-starts and the `ansible-post-reboot.service` waits 120s before running the controlled startup:

 ```
 Tier 1 — Databases first
@ -142,8 +147,8 @@ Tier 3 — Applications
  LXC 301 claude-discord-coordinator
  → wait 30s

-Pi-hole fix — restart container to clear UDP DNS bug
-  qm guest exec 106 -- docker restart pihole
+Pi-hole fix — restart container via SSH to clear UDP DNS bug
+  ssh docker-home "docker restart pihole"
  → wait 10s

 Tier 4 — Media & Others
@ -151,6 +156,7 @@ Tier 4 — Media & Others
  LXC 221 arr-stack
  LXC 222 memos
  LXC 223 foundry-lxc
+  LXC 302 claude-runner
 ```

 ## Post-Reboot Validation
@ -161,28 +167,35 @@ Tier 4 — Media & Others
 - [ ] Discord bots responding (check Discord)
 - [ ] Uptime Kuma dashboard green: `curl -sf http://10.10.0.227:3001/api/status-page/homelab`
 - [ ] Home Assistant running: `curl -sf http://10.10.0.109:8123/api/ -H 'Authorization: Bearer <token>'`
- [ ] Switch workstation DNS back from `1.1.1.1` to Pi-hole
+- [ ] Maintenance snapshots cleaned up (auto, 7-day retention)

 ## Automation

-### Ansible Playbook
+### Ansible Playbooks

-Located at `/opt/ansible/playbooks/monthly-reboot.yml` on LXC 304.
+Both located at `/opt/ansible/playbooks/` on LXC 304.

 ```bash
-# Dry run (check mode)
+# Dry run — shutdown only
 ssh ansible "ansible-playbook /opt/ansible/playbooks/monthly-reboot.yml --check"

-# Manual execution
+# Manual full execution — shutdown + reboot
 ssh ansible "ansible-playbook /opt/ansible/playbooks/monthly-reboot.yml"

-# Limit to shutdown only (skip reboot)
+# Manual post-reboot startup (if automatic startup failed)
+ssh ansible "ansible-playbook /opt/ansible/playbooks/post-reboot-startup.yml"
+
+# Shutdown only — skip the host reboot
 ssh ansible "ansible-playbook /opt/ansible/playbooks/monthly-reboot.yml --tags shutdown"
 ```

-### Systemd Timer
+### Systemd Units (on LXC 304)

-The playbook runs automatically via systemd timer on LXC 304:
+| Unit | Purpose | Schedule |
+|------|---------|----------|
+| `ansible-monthly-reboot.timer` | Triggers shutdown + reboot playbook | 1st Sunday of month, 08:00 UTC |
+| `ansible-monthly-reboot.service` | Runs `monthly-reboot.yml` | Activated by timer |
+| `ansible-post-reboot.service` | Runs `post-reboot-startup.yml` | On boot (multi-user.target), only if uptime < 10 min |

 ```bash
 # Check timer status
@ -191,10 +204,32 @@ ssh ansible "systemctl status ansible-monthly-reboot.timer"
 # Next scheduled run
 ssh ansible "systemctl list-timers ansible-monthly-reboot.timer"

+# Check post-reboot service status
+ssh ansible "systemctl status ansible-post-reboot.service"
+
 # Disable for a month (e.g., during an incident)
 ssh ansible "systemctl stop ansible-monthly-reboot.timer"
 ```

+### Deployment (one-time setup on LXC 304)
+
+```bash
+# Copy playbooks
+scp ansible/playbooks/monthly-reboot.yml ansible:/opt/ansible/playbooks/
+scp ansible/playbooks/post-reboot-startup.yml ansible:/opt/ansible/playbooks/
+
+# Copy and enable systemd units
+scp ansible/systemd/ansible-monthly-reboot.timer ansible:/etc/systemd/system/
+scp ansible/systemd/ansible-monthly-reboot.service ansible:/etc/systemd/system/
+scp ansible/systemd/ansible-post-reboot.service ansible:/etc/systemd/system/
+ssh ansible "sudo systemctl daemon-reload && \
+  sudo systemctl enable --now ansible-monthly-reboot.timer && \
+  sudo systemctl enable ansible-post-reboot.service"
+
+# Verify SSH key access from LXC 304 to docker-home (needed for Pi-hole restart)
+ssh ansible "ssh -o BatchMode=yes docker-home 'echo ok'"
+```
+
 ## Rollback

 If a guest fails to start after reboot:
@ -202,6 +237,7 @@ If a guest fails to start after reboot:
 2. Review guest logs: `ssh proxmox "journalctl -u pve-guests -n 50"`
 3. Manual start: `ssh proxmox "pvesh create /nodes/proxmox/qemu/<VMID>/status/start"`
 4. If guest is corrupted, restore from the pre-reboot Proxmox snapshot
+5. If post-reboot startup failed entirely, run manually: `ssh ansible "ansible-playbook /opt/ansible/playbooks/post-reboot-startup.yml"`

 ## Related Documentation

--- a/server-configs/proxmox/qemu/106.conf
+++ b/server-configs/proxmox/qemu/106.conf
@ -1,7 +1,7 @@
 agent: 1
 boot: order=scsi0;net0
 cores: 4
-memory: 16384
+memory: 6144
 meta: creation-qemu=6.1.0,ctime=1646083628
 name: docker-home
 net0: virtio=BA:65:DF:88:85:4C,bridge=vmbr0,firewall=1
@ -11,5 +11,5 @@ ostype: l26
 scsi0: local-lvm:vm-106-disk-0,size=256G
 scsihw: virtio-scsi-pci
 smbios1: uuid=54ef12fc-edcc-4744-a109-dd2de9a6dc03
-sockets: 2
+sockets: 1
 vmgenid: a13c92a2-a955-485e-a80e-391e99b19fbd
--- a/server-configs/proxmox/right-sizing-vm-106.md
+++ b/server-configs/proxmox/right-sizing-vm-106.md
@ -0,0 +1,141 @@
+---
+title: "VM 106 (docker-home) Right-Sizing Runbook"
+description: "Runbook for right-sizing VM 106 from 16 GB/8 vCPU to 6 GB/4 vCPU — pre-checks, resize commands, and post-resize validation."
+type: runbook
+domain: server-configs
+tags: [proxmox, infra-audit, right-sizing, docker-home]
+---
+
+# VM 106 (docker-home) Right-Sizing Runbook
+
+## Context
+
+Infrastructure audit (2026-04-02) found VM 106 severely overprovisioned:
+
+| Resource | Allocated | Actual Usage | Target |
+|----------|-----------|--------------|--------|
+| RAM | 16 GB | 1.1–1.5 GB | 6 GB (4× headroom) |
+| vCPUs | 8 (2 sockets × 4 cores) | load 0.12/core | 4 (1 socket × 4 cores) |
+
+**Services**: Pi-hole, Nginx Proxy Manager, Portainer
+
+## Pre-Check Results (2026-04-03)
+
+Automated checks were run before resizing. **All clear.**
+
+### Container memory limits
+
+```bash
+docker inspect pihole nginx-proxy-manager_app_1 portainer \
+  | python3 -c "import json,sys; c=json.load(sys.stdin); \
+    [print(x['Name'], 'MemoryLimit:', x['HostConfig']['Memory']) for x in c]"
+```
+
+Result:
+```
+/pihole MemoryLimit: 0
+/nginx-proxy-manager_app_1 MemoryLimit: 0
+/portainer MemoryLimit: 0
+```
+
+`0` = no limit — no containers will OOM at 6 GB.
+
+### Docker Compose memory reservations
+
+```bash
+grep -rn 'memory\|mem_limit\|memswap' /home/cal/container-data/*/docker-compose.yml
+```
+
+Result: **no matches** — no compose-level memory reservations.
+
+### Live memory usage at audit time
+
+```
+total: 15 GiB  used: 1.1 GiB  free: 6.8 GiB  buff/cache: 7.7 GiB
+Pi-hole:  463 MiB
+NPM:      367 MiB
+Portainer:  12 MiB
+Total containers: ~842 MiB
+```
+
+## Resize Procedure
+
+Brief downtime: Pi-hole and NPM will be unavailable during shutdown.
+Manticore runs Pi-hole 2 (10.10.0.226) for HA DNS — clients fail over automatically.
+
+### Step 1 — Shut down the VM
+
+```bash
+ssh proxmox "qm shutdown 106 --timeout 60"
+# Wait for shutdown
+ssh proxmox "qm status 106"   # Should show: status: stopped
+```
+
+### Step 2 — Apply new hardware config
+
+```bash
+# Reduce RAM: 16384 MB → 6144 MB
+ssh proxmox "qm set 106 --memory 6144"
+
+# Reduce vCPUs: 2 sockets × 4 cores → 1 socket × 4 cores (8 → 4 vCPUs)
+ssh proxmox "qm set 106 --sockets 1 --cores 4"
+
+# Verify
+ssh proxmox "qm config 106 | grep -E 'memory|cores|sockets'"
+```
+
+Expected output:
+```
+cores: 4
+memory: 6144
+sockets: 1
+```
+
+### Step 3 — Start the VM
+
+```bash
+ssh proxmox "qm start 106"
+```
+
+Wait ~30 seconds for Docker to come up.
+
+### Step 4 — Verify services
+
+```bash
+# Pi-hole DNS resolution
+ssh pihole "docker exec pihole dig google.com @127.0.0.1 | grep -E 'SERVER|ANSWER'"
+
+# NPM — check it's running
+ssh pihole "docker ps --filter name=nginx-proxy-manager --format '{{.Status}}'"
+
+# Portainer
+ssh pihole "docker ps --filter name=portainer --format '{{.Status}}'"
+
+# Memory usage post-resize
+ssh pihole "free -h"
+```
+
+### Step 5 — Monitor for 24h
+
+Check memory doesn't approach the 6 GB limit:
+
+```bash
+ssh pihole "free -h && docker stats --no-stream --format 'table {{.Name}}\t{{.MemUsage}}'"
+```
+
+Alert threshold: if `used` exceeds 4.5 GB (75% of 6 GB), consider increasing to 8 GB.
+
+## Rollback
+
+If services fail to come up after resizing:
+
+```bash
+# Restore original allocation
+ssh proxmox "qm set 106 --memory 16384 --sockets 2 --cores 4"
+ssh proxmox "qm start 106"
+```
+
+## Related
+
+- [Maintenance Reboot Runbook](maintenance-reboot.md) — VM 106 is Tier 2 (shut down after apps, before databases)
+- Issue: cal/claude-home#19