---
# Monthly Proxmox Maintenance Reboot — Shutdown & Reboot
#
# Orchestrates a graceful shutdown of all guests in dependency order,
# then issues a fire-and-forget reboot to the Proxmox host.
#
# After the host reboots, LXC 304 auto-starts via onboot:1 and the
# post-reboot-startup.yml playbook runs automatically via the
# ansible-post-reboot.service systemd unit (triggered by @reboot).
#
# Schedule: 1st Sunday of each month, 08:00 UTC (3 AM ET)
# Controller: LXC 304 (ansible-controller) at 10.10.0.232
#
# Usage:
#   # Dry run
#   ansible-playbook /opt/ansible/playbooks/monthly-reboot.yml --check
#
#   # Full execution
#   ansible-playbook /opt/ansible/playbooks/monthly-reboot.yml
#
#   # Shutdown only (skip the host reboot)
#   ansible-playbook /opt/ansible/playbooks/monthly-reboot.yml --tags shutdown
#
# Note: VM 109 (homeassistant) is excluded from Ansible inventory
# (self-managed via HA Supervisor) but is included in pvesh start/stop.
#
# Check mode: the read-only inventory queries in the first play still run
# (check_mode: false) so a dry run reports the real guest list. The
# stop-wait polls and the reboot are explicitly skipped in check mode.

# Play 1: record cluster state and snapshot every running guest.
- name: Pre-reboot health check and snapshots
  hosts: pve-node
  gather_facts: false
  tags: [pre-reboot, shutdown]

  tasks:
    - name: Check Proxmox cluster health
      ansible.builtin.command: pvesh get /cluster/status --output-format json
      register: cluster_status
      changed_when: false
      # Read-only query: safe to execute during a dry run.
      check_mode: false

    - name: Get list of running QEMU VMs
      ansible.builtin.shell: >
        pvesh get /nodes/proxmox/qemu --output-format json |
        python3 -c "import sys,json;
        [print(vm['vmid']) for vm in json.load(sys.stdin)
        if vm.get('status')=='running']"
      register: running_vms
      changed_when: false
      # Read-only query: run it in check mode too so later tasks that
      # reference running_vms.stdout_lines have real data.
      check_mode: false

    - name: Get list of running LXC containers
      ansible.builtin.shell: >
        pvesh get /nodes/proxmox/lxc --output-format json |
        python3 -c "import sys,json;
        [print(ct['vmid']) for ct in json.load(sys.stdin)
        if ct.get('status')=='running']"
      register: running_lxcs
      changed_when: false
      # Read-only query: see the note on the QEMU listing task.
      check_mode: false

    - name: Display running guests
      ansible.builtin.debug:
        msg: "Running VMs: {{ running_vms.stdout_lines }} | Running LXCs: {{ running_lxcs.stdout_lines }}"

    # Snapshots are best-effort: a failed snapshot must not block the
    # maintenance window, hence ignore_errors.
    # The snapshot date is computed by the 'pipe' lookup, which runs on
    # the Ansible controller, not on the Proxmox host.
    - name: Snapshot running VMs
      ansible.builtin.command: >
        pvesh create /nodes/proxmox/qemu/{{ item }}/snapshot
        --snapname pre-maintenance-{{ lookup('pipe', 'date +%Y-%m-%d') }}
        --description "Auto snapshot before monthly maintenance reboot"
      loop: "{{ running_vms.stdout_lines }}"
      when: running_vms.stdout_lines | length > 0
      ignore_errors: true

    - name: Snapshot running LXCs
      ansible.builtin.command: >
        pvesh create /nodes/proxmox/lxc/{{ item }}/snapshot
        --snapname pre-maintenance-{{ lookup('pipe', 'date +%Y-%m-%d') }}
        --description "Auto snapshot before monthly maintenance reboot"
      loop: "{{ running_lxcs.stdout_lines }}"
      when: running_lxcs.stdout_lines | length > 0
      ignore_errors: true

# Plays 2-5: shut guests down tier by tier (4 -> 1). Each tier sends the
# shutdown request, then polls status/current every 5 s until "stopped".
# All steps are best-effort (ignore_errors); the final play verifies that
# nothing is still running before the host is rebooted.
- name: "Shutdown Tier 4 — Media & Others"
  hosts: pve-node
  gather_facts: false
  tags: [shutdown]

  vars:
    tier4_vms: [109]
    # LXC 303 (mcp-gateway) is onboot=0 and operator-managed — not included here
    tier4_lxcs: [221, 222, 223, 302]

  tasks:
    - name: Shutdown Tier 4 VMs
      ansible.builtin.command: pvesh create /nodes/proxmox/qemu/{{ item }}/status/shutdown
      loop: "{{ tier4_vms }}"
      ignore_errors: true

    - name: Shutdown Tier 4 LXCs
      ansible.builtin.command: pvesh create /nodes/proxmox/lxc/{{ item }}/status/shutdown
      loop: "{{ tier4_lxcs }}"
      ignore_errors: true

    # Up to 12 x 5 s = 60 s per guest.
    - name: Wait for Tier 4 VMs to stop
      ansible.builtin.shell: >
        pvesh get /nodes/proxmox/qemu/{{ item }}/status/current --output-format json |
        python3 -c "import sys,json; print(json.load(sys.stdin).get('status','unknown'))"
      register: t4_vm_status
      until: t4_vm_status.stdout.strip() == "stopped"
      retries: 12
      delay: 5
      loop: "{{ tier4_vms }}"
      # Nothing was shut down in check mode, so there is nothing to wait for.
      when: not ansible_check_mode
      changed_when: false
      ignore_errors: true

    - name: Wait for Tier 4 LXCs to stop
      ansible.builtin.shell: >
        pvesh get /nodes/proxmox/lxc/{{ item }}/status/current --output-format json |
        python3 -c "import sys,json; print(json.load(sys.stdin).get('status','unknown'))"
      register: t4_lxc_status
      until: t4_lxc_status.stdout.strip() == "stopped"
      retries: 12
      delay: 5
      loop: "{{ tier4_lxcs }}"
      when: not ansible_check_mode
      changed_when: false
      ignore_errors: true

- name: "Shutdown Tier 3 — Applications"
  hosts: pve-node
  gather_facts: false
  tags: [shutdown]

  vars:
    tier3_vms: [115, 110]
    tier3_lxcs: [301]

  tasks:
    - name: Shutdown Tier 3 VMs
      ansible.builtin.command: pvesh create /nodes/proxmox/qemu/{{ item }}/status/shutdown
      loop: "{{ tier3_vms }}"
      ignore_errors: true

    - name: Shutdown Tier 3 LXCs
      ansible.builtin.command: pvesh create /nodes/proxmox/lxc/{{ item }}/status/shutdown
      loop: "{{ tier3_lxcs }}"
      ignore_errors: true

    - name: Wait for Tier 3 VMs to stop
      ansible.builtin.shell: >
        pvesh get /nodes/proxmox/qemu/{{ item }}/status/current --output-format json |
        python3 -c "import sys,json; print(json.load(sys.stdin).get('status','unknown'))"
      register: t3_vm_status
      until: t3_vm_status.stdout.strip() == "stopped"
      retries: 12
      delay: 5
      loop: "{{ tier3_vms }}"
      when: not ansible_check_mode
      changed_when: false
      ignore_errors: true

    - name: Wait for Tier 3 LXCs to stop
      ansible.builtin.shell: >
        pvesh get /nodes/proxmox/lxc/{{ item }}/status/current --output-format json |
        python3 -c "import sys,json; print(json.load(sys.stdin).get('status','unknown'))"
      register: t3_lxc_status
      until: t3_lxc_status.stdout.strip() == "stopped"
      retries: 12
      delay: 5
      loop: "{{ tier3_lxcs }}"
      when: not ansible_check_mode
      changed_when: false
      ignore_errors: true

- name: "Shutdown Tier 2 — Infrastructure"
  hosts: pve-node
  gather_facts: false
  tags: [shutdown]

  vars:
    tier2_vms: [106, 116]
    tier2_lxcs: [225, 210, 227]

  tasks:
    - name: Shutdown Tier 2 VMs
      ansible.builtin.command: pvesh create /nodes/proxmox/qemu/{{ item }}/status/shutdown
      loop: "{{ tier2_vms }}"
      ignore_errors: true

    - name: Shutdown Tier 2 LXCs
      ansible.builtin.command: pvesh create /nodes/proxmox/lxc/{{ item }}/status/shutdown
      loop: "{{ tier2_lxcs }}"
      ignore_errors: true

    - name: Wait for Tier 2 VMs to stop
      ansible.builtin.shell: >
        pvesh get /nodes/proxmox/qemu/{{ item }}/status/current --output-format json |
        python3 -c "import sys,json; print(json.load(sys.stdin).get('status','unknown'))"
      register: t2_vm_status
      until: t2_vm_status.stdout.strip() == "stopped"
      retries: 12
      delay: 5
      loop: "{{ tier2_vms }}"
      when: not ansible_check_mode
      changed_when: false
      ignore_errors: true

    - name: Wait for Tier 2 LXCs to stop
      ansible.builtin.shell: >
        pvesh get /nodes/proxmox/lxc/{{ item }}/status/current --output-format json |
        python3 -c "import sys,json; print(json.load(sys.stdin).get('status','unknown'))"
      register: t2_lxc_status
      until: t2_lxc_status.stdout.strip() == "stopped"
      retries: 12
      delay: 5
      loop: "{{ tier2_lxcs }}"
      when: not ansible_check_mode
      changed_when: false
      ignore_errors: true

- name: "Shutdown Tier 1 — Databases"
  hosts: pve-node
  gather_facts: false
  tags: [shutdown]

  vars:
    tier1_vms: [112]

  tasks:
    - name: Shutdown database VMs
      ansible.builtin.command: pvesh create /nodes/proxmox/qemu/{{ item }}/status/shutdown
      loop: "{{ tier1_vms }}"
      ignore_errors: true

    # 18 x 5 s = 90 s: a longer grace period than the other tiers.
    - name: Wait for database VMs to stop (up to 90s)
      ansible.builtin.shell: >
        pvesh get /nodes/proxmox/qemu/{{ item }}/status/current --output-format json |
        python3 -c "import sys,json; print(json.load(sys.stdin).get('status','unknown'))"
      register: t1_vm_status
      until: t1_vm_status.stdout.strip() == "stopped"
      retries: 18
      delay: 5
      loop: "{{ tier1_vms }}"
      when: not ansible_check_mode
      changed_when: false
      ignore_errors: true

    # Hard stop for any database VM that is still "running" after the
    # grace period. The script echoes "Force stopped ..." only when it
    # actually issued the stop, which drives changed_when below.
    - name: Force stop database VMs if still running
      ansible.builtin.shell: >
        status=$(pvesh get /nodes/proxmox/qemu/{{ item }}/status/current --output-format json |
        python3 -c "import sys,json; print(json.load(sys.stdin).get('status','unknown'))");
        if [ "$status" = "running" ]; then
        pvesh create /nodes/proxmox/qemu/{{ item }}/status/stop;
        echo "Force stopped VM {{ item }}";
        else
        echo "VM {{ item }} already stopped";
        fi
      loop: "{{ tier1_vms }}"
      register: force_stop_result
      # changed_when is evaluated once per loop item, where the registered
      # variable is that item's own result (it has no .results list yet),
      # so the item's stdout is inspected directly.
      changed_when: "'Force stopped' in (force_stop_result.stdout | default(''))"

# Play 6: refuse to reboot while any guest other than the controller
# (LXC 304) is still running, then schedule the host reboot.
- name: "Verify and reboot Proxmox host"
  hosts: pve-node
  gather_facts: false
  tags: [reboot]

  tasks:
    # Exits 1 (failing the play and therefore preventing the reboot) when
    # any VM, or any LXC other than 304, is still running. If pvesh or
    # python fails, the counters are empty, which also differs from "0".
    # The vmid is normalised with int() so the exclusion of 304 works
    # whether pvesh reports the id as a number or as a string.
    - name: Verify all guests are stopped (excluding LXC 304)
      ansible.builtin.shell: >
        running_vms=$(pvesh get /nodes/proxmox/qemu --output-format json |
        python3 -c "import sys,json;
        vms=[v for v in json.load(sys.stdin) if v.get('status')=='running'];
        print(len(vms))");
        running_lxcs=$(pvesh get /nodes/proxmox/lxc --output-format json |
        python3 -c "import sys,json;
        cts=[c for c in json.load(sys.stdin)
        if c.get('status')=='running' and int(c['vmid']) != 304];
        print(len(cts))");
        echo "Running VMs: $running_vms, Running LXCs: $running_lxcs";
        if [ "$running_vms" != "0" ] || [ "$running_lxcs" != "0" ]; then exit 1; fi
      register: verify_stopped
      changed_when: false

    # The reboot is detached and delayed so this task can return before
    # the host goes down. Redirection uses the POSIX form
    # (>/dev/null 2>&1): the shell module runs /bin/sh by default, and
    # '&>' is a bash extension that a POSIX sh parses as "background the
    # command, then an empty redirect", leaving the job's output attached
    # to Ansible's pipes.
    - name: Issue fire-and-forget reboot (controller will be killed)
      ansible.builtin.shell: >
        nohup bash -c 'sleep 10 && reboot' >/dev/null 2>&1 &
        echo "Reboot scheduled in 10 seconds"
      register: reboot_issued
      when: not ansible_check_mode

    # Only log when the reboot task actually ran; in check mode it is
    # skipped and reboot_issued carries no stdout to print.
    - name: Log reboot issued
      ansible.builtin.debug:
        msg: "{{ reboot_issued.stdout }} — Ansible process will terminate when host reboots. Post-reboot startup handled by ansible-post-reboot.service on LXC 304."
      when: reboot_issued is not skipped