Establishes a first-Sunday-of-the-month maintenance window orchestrated by Ansible on LXC 304. Split into two playbooks to handle the self-reboot paradox (the controller is a guest on the host being rebooted): - monthly-reboot.yml: snapshots, tiered shutdown with per-guest polling, fire-and-forget host reboot - post-reboot-startup.yml: controlled tiered startup with staggered delays, Pi-hole UDP DNS fix, validation, and snapshot cleanup Also fixes onboot:1 on VM 109, LXC 221, LXC 223 and creates a recurring Google Calendar event for the maintenance window. Closes #26 Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
266 lines
9.6 KiB
YAML
266 lines
9.6 KiB
YAML
---
|
|
# Monthly Proxmox Maintenance Reboot — Shutdown & Reboot
|
|
#
|
|
# Orchestrates a graceful shutdown of all guests in dependency order,
|
|
# then issues a fire-and-forget reboot to the Proxmox host.
|
|
#
|
|
# After the host reboots, LXC 304 auto-starts via onboot:1 and the
|
|
# post-reboot-startup.yml playbook runs automatically via the
|
|
# ansible-post-reboot.service systemd unit (triggered by @reboot).
|
|
#
|
|
# Schedule: 1st Sunday of each month, 08:00 UTC (3 AM EST / 4 AM EDT)
|
|
# Controller: LXC 304 (ansible-controller) at 10.10.0.232
|
|
#
|
|
# Usage:
|
|
# # Dry run
|
|
# ansible-playbook /opt/ansible/playbooks/monthly-reboot.yml --check
|
|
#
|
|
# # Full execution
|
|
# ansible-playbook /opt/ansible/playbooks/monthly-reboot.yml
|
|
#
|
|
# # Shutdown only (skip the host reboot)
|
|
# ansible-playbook /opt/ansible/playbooks/monthly-reboot.yml --tags shutdown
|
|
#
|
|
# Note: VM 109 (homeassistant) is excluded from Ansible inventory
|
|
# (self-managed via HA Supervisor) but is included in pvesh start/stop.
|
|
|
|
# Play: list the guests that are currently running on the Proxmox node and
# take a dated snapshot of each one before any shutdown begins.
- name: Pre-reboot health check and snapshots
  hosts: pve-node
  gather_facts: false
  tags: [pre-reboot, shutdown]

  tasks:
    # Read-only query. The registered output is not referenced by any later
    # task in the visible part of this playbook, so this acts only as a gate:
    # the play stops here if pvesh itself exits non-zero.
    - name: Check Proxmox cluster health
      ansible.builtin.command: pvesh get /cluster/status --output-format json
      register: cluster_status
      changed_when: false
      # Read-only, so execute it for real during `--check` dry runs too.
      check_mode: false

    # Prints one VMID per line for every QEMU guest whose status is "running".
    - name: Get list of running QEMU VMs
      ansible.builtin.shell: >
        pvesh get /nodes/proxmox/qemu --output-format json |
        python3 -c "import sys,json; [print(vm['vmid']) for vm in json.load(sys.stdin) if vm.get('status')=='running']"
      register: running_vms
      changed_when: false
      # Without this the dry run would skip the query and the snapshot loop
      # below would have no guest list to iterate over.
      check_mode: false

    # Prints one VMID per line for every LXC guest whose status is "running".
    - name: Get list of running LXC containers
      ansible.builtin.shell: >
        pvesh get /nodes/proxmox/lxc --output-format json |
        python3 -c "import sys,json; [print(ct['vmid']) for ct in json.load(sys.stdin) if ct.get('status')=='running']"
      register: running_lxcs
      changed_when: false
      check_mode: false

    - name: Display running guests
      ansible.builtin.debug:
        msg: "Running VMs: {{ running_vms.stdout_lines }} | Running LXCs: {{ running_lxcs.stdout_lines }}"

    # Best effort: a failed snapshot (for example a name that already exists
    # from an earlier run on the same day) is reported but does not abort the
    # maintenance window. The date in the snapshot name comes from the pipe
    # lookup, which runs `date` on the Ansible controller.
    - name: Snapshot running VMs
      ansible.builtin.command: >
        pvesh create /nodes/proxmox/qemu/{{ item }}/snapshot
        --snapname pre-maintenance-{{ lookup('pipe', 'date +%Y-%m-%d') }}
        --description "Auto snapshot before monthly maintenance reboot"
      loop: "{{ running_vms.stdout_lines }}"
      when: running_vms.stdout_lines | length > 0
      ignore_errors: true

    - name: Snapshot running LXCs
      ansible.builtin.command: >
        pvesh create /nodes/proxmox/lxc/{{ item }}/snapshot
        --snapname pre-maintenance-{{ lookup('pipe', 'date +%Y-%m-%d') }}
        --description "Auto snapshot before monthly maintenance reboot"
      loop: "{{ running_lxcs.stdout_lines }}"
      when: running_lxcs.stdout_lines | length > 0
      ignore_errors: true
|
|
|
|
# Play: request a graceful shutdown of the Tier 4 guests, then poll each one
# until pvesh reports it as "stopped" (12 polls, 5 s apart, per guest).
- name: "Shutdown Tier 4 — Media & Others"
  hosts: pve-node
  gather_facts: false
  tags: [shutdown]

  vars:
    tier4_vms: [109]
    # LXC 303 (mcp-gateway) is onboot=0 and operator-managed — not included here
    tier4_lxcs: [221, 222, 223, 302]

  tasks:
    # Errors are ignored so one guest that refuses the request does not stop
    # the remaining guests from being shut down.
    - name: Shutdown Tier 4 VMs
      ansible.builtin.command: pvesh create /nodes/proxmox/qemu/{{ item }}/status/shutdown
      loop: "{{ tier4_vms }}"
      ignore_errors: true

    - name: Shutdown Tier 4 LXCs
      ansible.builtin.command: pvesh create /nodes/proxmox/lxc/{{ item }}/status/shutdown
      loop: "{{ tier4_lxcs }}"
      ignore_errors: true

    # Skipped in check mode: the shutdown commands above are not executed
    # during a dry run, so polling could only exhaust every retry.
    - name: Wait for Tier 4 VMs to stop
      ansible.builtin.shell: >
        pvesh get /nodes/proxmox/qemu/{{ item }}/status/current --output-format json |
        python3 -c "import sys,json; print(json.load(sys.stdin).get('status','unknown'))"
      register: t4_vm_status
      until: t4_vm_status.stdout.strip() == "stopped"
      retries: 12
      delay: 5
      loop: "{{ tier4_vms }}"
      when: not ansible_check_mode
      ignore_errors: true

    - name: Wait for Tier 4 LXCs to stop
      ansible.builtin.shell: >
        pvesh get /nodes/proxmox/lxc/{{ item }}/status/current --output-format json |
        python3 -c "import sys,json; print(json.load(sys.stdin).get('status','unknown'))"
      register: t4_lxc_status
      until: t4_lxc_status.stdout.strip() == "stopped"
      retries: 12
      delay: 5
      loop: "{{ tier4_lxcs }}"
      when: not ansible_check_mode
      ignore_errors: true
|
|
|
|
# Play: request a graceful shutdown of the Tier 3 guests, then poll each one
# until pvesh reports it as "stopped" (12 polls, 5 s apart, per guest).
- name: "Shutdown Tier 3 — Applications"
  hosts: pve-node
  gather_facts: false
  tags: [shutdown]

  vars:
    tier3_vms: [115, 110]
    tier3_lxcs: [301]

  tasks:
    # Errors are ignored so one guest that refuses the request does not stop
    # the remaining guests from being shut down.
    - name: Shutdown Tier 3 VMs
      ansible.builtin.command: pvesh create /nodes/proxmox/qemu/{{ item }}/status/shutdown
      loop: "{{ tier3_vms }}"
      ignore_errors: true

    - name: Shutdown Tier 3 LXCs
      ansible.builtin.command: pvesh create /nodes/proxmox/lxc/{{ item }}/status/shutdown
      loop: "{{ tier3_lxcs }}"
      ignore_errors: true

    # Skipped in check mode: the shutdown commands above are not executed
    # during a dry run, so polling could only exhaust every retry.
    - name: Wait for Tier 3 VMs to stop
      ansible.builtin.shell: >
        pvesh get /nodes/proxmox/qemu/{{ item }}/status/current --output-format json |
        python3 -c "import sys,json; print(json.load(sys.stdin).get('status','unknown'))"
      register: t3_vm_status
      until: t3_vm_status.stdout.strip() == "stopped"
      retries: 12
      delay: 5
      loop: "{{ tier3_vms }}"
      when: not ansible_check_mode
      ignore_errors: true

    - name: Wait for Tier 3 LXCs to stop
      ansible.builtin.shell: >
        pvesh get /nodes/proxmox/lxc/{{ item }}/status/current --output-format json |
        python3 -c "import sys,json; print(json.load(sys.stdin).get('status','unknown'))"
      register: t3_lxc_status
      until: t3_lxc_status.stdout.strip() == "stopped"
      retries: 12
      delay: 5
      loop: "{{ tier3_lxcs }}"
      when: not ansible_check_mode
      ignore_errors: true
|
|
|
|
# Play: request a graceful shutdown of the Tier 2 guests, then poll each one
# until pvesh reports it as "stopped" (12 polls, 5 s apart, per guest).
- name: "Shutdown Tier 2 — Infrastructure"
  hosts: pve-node
  gather_facts: false
  tags: [shutdown]

  vars:
    tier2_vms: [106, 116]
    tier2_lxcs: [225, 210, 227]

  tasks:
    # Errors are ignored so one guest that refuses the request does not stop
    # the remaining guests from being shut down.
    - name: Shutdown Tier 2 VMs
      ansible.builtin.command: pvesh create /nodes/proxmox/qemu/{{ item }}/status/shutdown
      loop: "{{ tier2_vms }}"
      ignore_errors: true

    - name: Shutdown Tier 2 LXCs
      ansible.builtin.command: pvesh create /nodes/proxmox/lxc/{{ item }}/status/shutdown
      loop: "{{ tier2_lxcs }}"
      ignore_errors: true

    # Skipped in check mode: the shutdown commands above are not executed
    # during a dry run, so polling could only exhaust every retry.
    - name: Wait for Tier 2 VMs to stop
      ansible.builtin.shell: >
        pvesh get /nodes/proxmox/qemu/{{ item }}/status/current --output-format json |
        python3 -c "import sys,json; print(json.load(sys.stdin).get('status','unknown'))"
      register: t2_vm_status
      until: t2_vm_status.stdout.strip() == "stopped"
      retries: 12
      delay: 5
      loop: "{{ tier2_vms }}"
      when: not ansible_check_mode
      ignore_errors: true

    - name: Wait for Tier 2 LXCs to stop
      ansible.builtin.shell: >
        pvesh get /nodes/proxmox/lxc/{{ item }}/status/current --output-format json |
        python3 -c "import sys,json; print(json.load(sys.stdin).get('status','unknown'))"
      register: t2_lxc_status
      until: t2_lxc_status.stdout.strip() == "stopped"
      retries: 12
      delay: 5
      loop: "{{ tier2_lxcs }}"
      when: not ansible_check_mode
      ignore_errors: true
|
|
|
|
# Play: shut down the database VMs last. They get a longer grace period
# (18 polls x 5 s = 90 s) and are hard-stopped if they are still running
# once that period has elapsed.
- name: "Shutdown Tier 1 — Databases"
  hosts: pve-node
  gather_facts: false
  tags: [shutdown]

  vars:
    tier1_vms: [112]

  tasks:
    - name: Shutdown database VMs
      ansible.builtin.command: pvesh create /nodes/proxmox/qemu/{{ item }}/status/shutdown
      loop: "{{ tier1_vms }}"
      ignore_errors: true

    # Skipped in check mode: the shutdown command above is not executed
    # during a dry run, so polling could only exhaust every retry.
    # A timeout here is ignored because the next task force-stops the VM.
    - name: Wait for database VMs to stop (up to 90s)
      ansible.builtin.shell: >
        pvesh get /nodes/proxmox/qemu/{{ item }}/status/current --output-format json |
        python3 -c "import sys,json; print(json.load(sys.stdin).get('status','unknown'))"
      register: t1_vm_status
      until: t1_vm_status.stdout.strip() == "stopped"
      retries: 18
      delay: 5
      loop: "{{ tier1_vms }}"
      when: not ansible_check_mode
      ignore_errors: true

    # Re-reads the VM status and issues a hard stop only when it is still
    # "running"; the echoed text records which branch was taken.
    - name: Force stop database VMs if still running
      ansible.builtin.shell: |
        status=$(pvesh get /nodes/proxmox/qemu/{{ item }}/status/current --output-format json |
          python3 -c "import sys,json; print(json.load(sys.stdin).get('status','unknown'))")
        if [ "$status" = "running" ]; then
          pvesh create /nodes/proxmox/qemu/{{ item }}/status/stop
          echo "Force stopped VM {{ item }}"
        else
          echo "VM {{ item }} already stopped"
        fi
      loop: "{{ tier1_vms }}"
      register: force_stop_result
      # changed_when is evaluated once per loop item, where the registered
      # variable holds that single item's result (there is no `.results` list
      # yet), so test this item's stdout directly. default('') covers results
      # that carry no stdout, e.g. when the task is skipped.
      changed_when: "'Force stopped' in (force_stop_result.stdout | default(''))"
|
|
|
|
# Play: confirm that nothing except the controller (LXC 304) is still
# running, then schedule a detached host reboot so the task can return before
# the host, and the controller running on it, goes down.
- name: "Verify and reboot Proxmox host"
  hosts: pve-node
  gather_facts: false
  tags: [reboot]

  tasks:
    # Counts running VMs and running LXCs (ignoring vmid 304) and exits 1 if
    # either count is non-zero, which fails the task and prevents the reboot.
    - name: Verify all guests are stopped (excluding LXC 304)
      ansible.builtin.shell: >
        running_vms=$(pvesh get /nodes/proxmox/qemu --output-format json |
        python3 -c "import sys,json; vms=[v for v in json.load(sys.stdin) if v.get('status')=='running']; print(len(vms))");
        running_lxcs=$(pvesh get /nodes/proxmox/lxc --output-format json |
        python3 -c "import sys,json; cts=[c for c in json.load(sys.stdin) if c.get('status')=='running' and c['vmid'] != 304]; print(len(cts))");
        echo "Running VMs: $running_vms, Running LXCs: $running_lxcs";
        if [ "$running_vms" != "0" ] || [ "$running_lxcs" != "0" ]; then exit 1; fi
      register: verify_stopped
      # Read-only check: never report a change.
      changed_when: false

    # The reboot is detached with nohup and delayed by 10 s so this task can
    # return first. Redirection is written in POSIX form (`>/dev/null 2>&1`)
    # because the shell module runs /bin/sh by default and `&>` is a bash
    # extension that other shells parse as "background, then redirect".
    - name: Issue fire-and-forget reboot (controller will be killed)
      ansible.builtin.shell: >
        nohup bash -c 'sleep 10 && reboot' >/dev/null 2>&1 &
        echo "Reboot scheduled in 10 seconds"
      register: reboot_issued
      when: not ansible_check_mode

    # Guarded like the reboot task: when that task is skipped, reboot_issued
    # has no stdout and templating the message would fail.
    - name: Log reboot issued
      ansible.builtin.debug:
        msg: "{{ reboot_issued.stdout }} — Ansible process will terminate when host reboots. Post-reboot startup handled by ansible-post-reboot.service on LXC 304."
      when: not ansible_check_mode
|