claude-home/ansible/playbooks/post-reboot-startup.yml
Cal Corum 7a0c264f27 feat: add monthly Proxmox maintenance reboot automation (#26)
Establishes a first-Sunday-of-the-month maintenance window orchestrated
by Ansible on LXC 304. Split into two playbooks to handle the self-reboot
paradox (the controller is a guest on the host being rebooted):

- monthly-reboot.yml: snapshots, tiered shutdown with per-guest polling,
  fire-and-forget host reboot
- post-reboot-startup.yml: controlled tiered startup with staggered delays,
  Pi-hole UDP DNS fix, validation, and snapshot cleanup

Also fixes onboot:1 on VM 109, LXC 221, LXC 223 and creates a recurring
Google Calendar event for the maintenance window.

Closes #26

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-03 16:17:55 -05:00

215 lines
7.3 KiB
YAML

---
# Post-Reboot Startup — Controlled Guest Startup After Proxmox Reboot
#
# Starts all guests in dependency order with staggered delays to avoid
# I/O storms. Runs automatically via ansible-post-reboot.service on
# LXC 304 after the Proxmox host reboots.
#
# Can also be run manually:
# ansible-playbook /opt/ansible/playbooks/post-reboot-startup.yml
#
# Note: VM 109 (homeassistant) is excluded from Ansible inventory
# (self-managed via HA Supervisor) but is included in pvesh start/stop.
# Play: block until the Proxmox API answers on pve-node, then report the
# version it returned. Fact gathering is disabled for this play.
- name: Wait for Proxmox API to be ready
  hosts: pve-node
  gather_facts: false
  tags: [startup]
  tasks:
    # Polls `pvesh get /version` up to 30 times, 10 seconds apart, until the
    # command exits 0. The call is read-only, so it never reports "changed".
    - name: Wait for Proxmox API
      ansible.builtin.command: pvesh get /version --output-format json
      register: pve_version
      until: pve_version.rc == 0
      retries: 30
      delay: 10
      changed_when: false

    # The version is read with plain attribute access on the parsed JSON
    # rather than json_query: that filter needs the jmespath library on the
    # controller and yields None for a missing key, which default() does not
    # replace. Attribute access yields Undefined, so the fallback applies.
    - name: Display Proxmox version
      ansible.builtin.debug:
        msg: "Proxmox API ready: {{ (pve_version.stdout | from_json).version | default('unknown') }}"
# Tier 1: request a start of VM 112, poll until Proxmox reports it as
# running, then pause before the next tier begins.
- name: "Startup Tier 1 — Databases"
  hosts: pve-node
  gather_facts: false
  tags:
    - startup
  tasks:
    # A failed start request is ignored here; the polling task below is what
    # decides whether the VM actually came up.
    - name: Start database VM (112)
      ansible.builtin.command:
        argv:
          - pvesh
          - create
          - /nodes/proxmox/qemu/112/status/start
      ignore_errors: true

    # Reads the VM's current status and prints its "status" field
    # ("unknown" when the field is absent). Retried 12 times, 5 seconds apart.
    - name: Wait for VM 112 to be running
      ansible.builtin.shell:
        cmd: >
          pvesh get /nodes/proxmox/qemu/112/status/current --output-format json |
          python3 -c "import sys,json; print(json.load(sys.stdin).get('status','unknown'))"
      register: db_status
      changed_when: false
      retries: 12
      delay: 5
      until: (db_status.stdout | trim) == "running"

    # Fixed 30-second settle time after the VM reports running.
    - name: Wait for database services to initialize
      ansible.builtin.pause:
        seconds: 30
# Tier 2: issue start requests for the VMs and LXCs listed in this play's
# vars, then pause for 30 seconds.
- name: "Startup Tier 2 — Infrastructure"
  hosts: pve-node
  gather_facts: false
  tags:
    - startup
  vars:
    # Guest IDs started by this tier, in the order they are requested.
    tier2_vms:
      - 106
      - 116
    tier2_lxcs:
      - 225
      - 210
      - 227
  tasks:
    # Start errors are ignored for each guest so one failure does not stop
    # the remaining start requests.
    - name: Start Tier 2 VMs
      ansible.builtin.command:
        argv:
          - pvesh
          - create
          - "/nodes/proxmox/qemu/{{ item }}/status/start"
      loop: "{{ tier2_vms }}"
      ignore_errors: true

    - name: Start Tier 2 LXCs
      ansible.builtin.command:
        argv:
          - pvesh
          - create
          - "/nodes/proxmox/lxc/{{ item }}/status/start"
      loop: "{{ tier2_lxcs }}"
      ignore_errors: true

    # Fixed 30-second delay before the next tier.
    - name: Wait for infrastructure to come up
      ansible.builtin.pause:
        seconds: 30
# Tier 3: issue start requests for the application guests, pause, then
# restart the pihole container over SSH and pause again.
- name: "Startup Tier 3 — Applications"
  hosts: pve-node
  gather_facts: false
  tags:
    - startup
  vars:
    # Guest IDs started by this tier, in the order they are requested.
    tier3_vms:
      - 115
      - 110
    tier3_lxcs:
      - 301
  tasks:
    # Start errors are ignored for each guest.
    - name: Start Tier 3 VMs
      ansible.builtin.command:
        argv:
          - pvesh
          - create
          - "/nodes/proxmox/qemu/{{ item }}/status/start"
      loop: "{{ tier3_vms }}"
      ignore_errors: true

    - name: Start Tier 3 LXCs
      ansible.builtin.command:
        argv:
          - pvesh
          - create
          - "/nodes/proxmox/lxc/{{ item }}/status/start"
      loop: "{{ tier3_lxcs }}"
      ignore_errors: true

    # Fixed 30-second delay before the Pi-hole restart.
    - name: Wait for applications to start
      ansible.builtin.pause:
        seconds: 30

    # Runs `docker restart pihole` on the SSH target "docker-home" from the
    # play host. The remote command is passed to ssh as a single argument.
    # A failure here is ignored.
    - name: Restart Pi-hole container via SSH (UDP DNS fix)
      ansible.builtin.command:
        argv:
          - ssh
          - docker-home
          - docker restart pihole
      ignore_errors: true

    # Fixed 10-second delay after the restart.
    - name: Wait for Pi-hole to stabilize
      ansible.builtin.pause:
        seconds: 10
# Tier 4: issue start requests for the remaining guests. There is no pause
# in this play.
- name: "Startup Tier 4 — Media & Others"
  hosts: pve-node
  gather_facts: false
  tags:
    - startup
  vars:
    # Guest IDs started by this tier, in the order they are requested.
    tier4_vms:
      - 109
    tier4_lxcs:
      - 221
      - 222
      - 223
      - 302
  tasks:
    # Start errors are ignored for each guest.
    - name: Start Tier 4 VMs
      ansible.builtin.command:
        argv:
          - pvesh
          - create
          - "/nodes/proxmox/qemu/{{ item }}/status/start"
      loop: "{{ tier4_vms }}"
      ignore_errors: true

    - name: Start Tier 4 LXCs
      ansible.builtin.command:
        argv:
          - pvesh
          - create
          - "/nodes/proxmox/lxc/{{ item }}/status/start"
      loop: "{{ tier4_lxcs }}"
      ignore_errors: true
# Validation play: after a fixed wait, compare the running VMs and LXCs
# against hard-coded expected ID sets, delete pre-maintenance snapshots older
# than 7 days, and print a summary. Every check is best-effort
# (ignore_errors), so a failure is reported in the summary instead of
# aborting the play.
- name: Post-reboot validation
  hosts: pve-node
  gather_facts: false
  tags: [startup, validate]
  tasks:
    - name: Wait for all services to initialize
      ansible.builtin.pause:
        seconds: 60

    # The scripts below use a literal block scalar (|), not a folded one (>).
    # Folding would join the Python statements onto one line, or leave them
    # with leading indentation; either way python3 rejects the program and
    # the check fails silently behind ignore_errors.
    #
    # The check exits 1 and prints a WARN line when any expected VM is not in
    # the "running" state. int() accepts vmid as a JSON number or a string.
    - name: Check all expected VMs are running
      ansible.builtin.shell: |
        pvesh get /nodes/proxmox/qemu --output-format json |
        python3 -c "
        import sys, json
        vms = json.load(sys.stdin)
        expected = {106, 109, 110, 112, 115, 116}
        running = {int(v['vmid']) for v in vms if v.get('status') == 'running'}
        missing = expected - running
        if missing:
            print(f'WARN: VMs not running: {missing}')
            sys.exit(1)
        print(f'All expected VMs running: {running & expected}')
        "
      register: vm_check
      changed_when: false
      ignore_errors: true

    # Same check for containers. Read-only, so it never reports "changed".
    - name: Check all expected LXCs are running
      ansible.builtin.shell: |
        pvesh get /nodes/proxmox/lxc --output-format json |
        python3 -c "
        import sys, json
        cts = json.load(sys.stdin)
        # LXC 303 (mcp-gateway) intentionally excluded — onboot=0, operator-managed
        expected = {210, 221, 222, 223, 225, 227, 301, 302, 304}
        running = {int(c['vmid']) for c in cts if c.get('status') == 'running'}
        missing = expected - running
        if missing:
            print(f'WARN: LXCs not running: {missing}')
            sys.exit(1)
        print(f'All expected LXCs running: {running & expected}')
        "
      register: lxc_check
      changed_when: false
      ignore_errors: true

    # For every VM and then every LXC: list snapshots whose name starts with
    # "pre-maintenance-", parse the remainder of the name with `date -d`, and
    # delete the snapshot when that date is before the 7-day cutoff. A name
    # that does not parse as a date only produces a WARN line.
    - name: Clean up old maintenance snapshots (older than 7 days)
      ansible.builtin.shell: |
        cutoff=$(date -d '7 days ago' +%s);
        for vmid in $(pvesh get /nodes/proxmox/qemu --output-format json |
            python3 -c "import sys,json; [print(v['vmid']) for v in json.load(sys.stdin)]"); do
          for snap in $(pvesh get /nodes/proxmox/qemu/$vmid/snapshot --output-format json |
              python3 -c "import sys,json; [print(s['name']) for s in json.load(sys.stdin) if s['name'].startswith('pre-maintenance-')]" 2>/dev/null); do
            snap_date=$(echo $snap | sed 's/pre-maintenance-//');
            snap_epoch=$(date -d "$snap_date" +%s 2>/dev/null);
            if [ -z "$snap_epoch" ]; then
              echo "WARN: could not parse date for snapshot $snap on VM $vmid";
            elif [ "$snap_epoch" -lt "$cutoff" ]; then
              pvesh delete /nodes/proxmox/qemu/$vmid/snapshot/$snap && echo "Deleted $snap from VM $vmid";
            fi
          done
        done;
        for ctid in $(pvesh get /nodes/proxmox/lxc --output-format json |
            python3 -c "import sys,json; [print(c['vmid']) for c in json.load(sys.stdin)]"); do
          for snap in $(pvesh get /nodes/proxmox/lxc/$ctid/snapshot --output-format json |
              python3 -c "import sys,json; [print(s['name']) for s in json.load(sys.stdin) if s['name'].startswith('pre-maintenance-')]" 2>/dev/null); do
            snap_date=$(echo $snap | sed 's/pre-maintenance-//');
            snap_epoch=$(date -d "$snap_date" +%s 2>/dev/null);
            if [ -z "$snap_epoch" ]; then
              echo "WARN: could not parse date for snapshot $snap on LXC $ctid";
            elif [ "$snap_epoch" -lt "$cutoff" ]; then
              pvesh delete /nodes/proxmox/lxc/$ctid/snapshot/$snap && echo "Deleted $snap from LXC $ctid";
            fi
          done
        done;
        echo "Snapshot cleanup complete"
      ignore_errors: true

    # Prints the stdout captured from the two checks above.
    - name: Display validation results
      ansible.builtin.debug:
        msg:
          - "VM status: {{ vm_check.stdout }}"
          - "LXC status: {{ lxc_check.stdout }}"
          - "Maintenance reboot complete — post-reboot startup finished"