All checks were successful
Reindex Knowledge Base / reindex (push) Successful in 2s
Establishes a first-Sunday-of-the-month maintenance window orchestrated by Ansible on LXC 304. Split into two playbooks to handle the self-reboot paradox (the controller is a guest on the host being rebooted): - monthly-reboot.yml: snapshots, tiered shutdown with per-guest polling, fire-and-forget host reboot - post-reboot-startup.yml: controlled tiered startup with staggered delays, Pi-hole UDP DNS fix, validation, and snapshot cleanup Also fixes onboot:1 on VM 109, LXC 221, LXC 223 and creates a recurring Google Calendar event for the maintenance window. Closes #26 Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
215 lines
7.3 KiB
YAML
215 lines
7.3 KiB
YAML
---
# Post-Reboot Startup — Controlled Guest Startup After Proxmox Reboot
#
# Starts all guests in dependency order with staggered delays to avoid
# I/O storms. Runs automatically via ansible-post-reboot.service on
# LXC 304 after the Proxmox host reboots.
#
# Can also be run manually:
#   ansible-playbook /opt/ansible/playbooks/post-reboot-startup.yml
#
# Note: VM 109 (homeassistant) is excluded from Ansible inventory
# (self-managed via HA Supervisor) but is included in pvesh start/stop.
|
|
|
|
# Play: block until the Proxmox API answers on pve-node.
# Polls `pvesh get /version` up to 30 times, 10 seconds apart, then
# prints the version string taken from the JSON response.
- name: Wait for Proxmox API to be ready
  hosts: pve-node
  gather_facts: false
  tags: [startup]

  tasks:
    # Read-only probe, so it never reports "changed".
    - name: Wait for Proxmox API
      ansible.builtin.command: pvesh get /version --output-format json
      register: pve_version
      until: pve_version.rc == 0
      retries: 30
      delay: 10
      changed_when: false

    # Plain attribute access instead of json_query: json_query needs the
    # community.general collection plus the jmespath Python library, and a
    # failure in this purely informational task would drop pve-node from
    # every later play in the playbook.
    - name: Display Proxmox version
      ansible.builtin.debug:
        msg: "Proxmox API ready: {{ (pve_version.stdout | from_json).version | default('unknown') }}"
|
|
|
|
# Tier 1 (databases): request a start of VM 112 on node "proxmox", poll
# its status until it reports "running" (12 attempts, 5 seconds apart),
# then hold for 30 seconds before the next play runs.
- name: "Startup Tier 1 — Databases"
  hosts: pve-node
  gather_facts: false
  tags:
    - startup

  tasks:
    # A failed start request is ignored here; the polling task below is
    # what actually gates on the VM state.
    # NOTE(review): presumably tolerates "already running" — confirm.
    - name: Start database VM (112)
      ansible.builtin.command:
        argv:
          - pvesh
          - create
          - /nodes/proxmox/qemu/112/status/start
      ignore_errors: true

    # Extracts the "status" field from the API JSON; prints "unknown"
    # when the field is absent.
    - name: Wait for VM 112 to be running
      ansible.builtin.shell:
        cmd: >
          pvesh get /nodes/proxmox/qemu/112/status/current --output-format json |
          python3 -c "import sys,json; print(json.load(sys.stdin).get('status','unknown'))"
      register: db_status
      changed_when: false
      retries: 12
      delay: 5
      until: db_status.stdout.strip() == "running"

    - name: Wait for database services to initialize
      ansible.builtin.pause:
        seconds: 30
|
|
|
|
# Tier 2 (infrastructure): issue start requests for VMs 106 and 116 and
# LXCs 225, 210 and 227 on node "proxmox", then hold for 30 seconds
# before the next play runs. Individual start failures are ignored.
- name: "Startup Tier 2 — Infrastructure"
  hosts: pve-node
  gather_facts: false
  tags:
    - startup

  vars:
    tier2_vms:
      - 106
      - 116
    tier2_lxcs:
      - 225
      - 210
      - 227

  tasks:
    # One pvesh call per VM id, in list order.
    - name: Start Tier 2 VMs
      ansible.builtin.command:
        cmd: "pvesh create /nodes/proxmox/qemu/{{ item }}/status/start"
      loop: "{{ tier2_vms }}"
      ignore_errors: true

    # One pvesh call per container id, in list order.
    - name: Start Tier 2 LXCs
      ansible.builtin.command:
        cmd: "pvesh create /nodes/proxmox/lxc/{{ item }}/status/start"
      loop: "{{ tier2_lxcs }}"
      ignore_errors: true

    - name: Wait for infrastructure to come up
      ansible.builtin.pause:
        seconds: 30
|
|
|
|
# Tier 3 (applications): issue start requests for VMs 115 and 110 and
# LXC 301 on node "proxmox", wait 30 seconds, restart the Pi-hole
# container over SSH, then wait another 10 seconds.
- name: "Startup Tier 3 — Applications"
  hosts: pve-node
  gather_facts: false
  tags: [startup]

  vars:
    tier3_vms: [115, 110]
    tier3_lxcs: [301]

  tasks:
    # Individual start failures are ignored so one guest cannot block
    # the rest of the tier.
    - name: Start Tier 3 VMs
      ansible.builtin.command: pvesh create /nodes/proxmox/qemu/{{ item }}/status/start
      loop: "{{ tier3_vms }}"
      ignore_errors: true

    - name: Start Tier 3 LXCs
      ansible.builtin.command: pvesh create /nodes/proxmox/lxc/{{ item }}/status/start
      loop: "{{ tier3_lxcs }}"
      ignore_errors: true

    - name: Wait for applications to start
      ansible.builtin.pause:
        seconds: 30

    # This runs unattended, so SSH must never wait on a prompt or an
    # unbounded connect: BatchMode disables interactive prompts and
    # ConnectTimeout caps each attempt. The restart is retried a few
    # times in case the Docker host is not reachable yet, and stays
    # best-effort (ignore_errors) once the retries are exhausted.
    - name: Restart Pi-hole container via SSH (UDP DNS fix)
      ansible.builtin.command: >-
        ssh -o BatchMode=yes -o ConnectTimeout=10
        docker-home "docker restart pihole"
      register: pihole_restart
      until: pihole_restart.rc == 0
      retries: 3
      delay: 10
      ignore_errors: true

    - name: Wait for Pi-hole to stabilize
      ansible.builtin.pause:
        seconds: 10
|
|
|
|
# Tier 4 (media and remaining guests): issue start requests for VM 109
# and LXCs 221, 222, 223 and 302 on node "proxmox". Individual start
# failures are ignored. This play has no trailing pause of its own.
- name: "Startup Tier 4 — Media & Others"
  hosts: pve-node
  gather_facts: false
  tags:
    - startup

  vars:
    tier4_vms:
      - 109
    tier4_lxcs:
      - 221
      - 222
      - 223
      - 302

  tasks:
    # One pvesh call per VM id, in list order.
    - name: Start Tier 4 VMs
      ansible.builtin.command:
        cmd: "pvesh create /nodes/proxmox/qemu/{{ item }}/status/start"
      loop: "{{ tier4_vms }}"
      ignore_errors: true

    # One pvesh call per container id, in list order.
    - name: Start Tier 4 LXCs
      ansible.builtin.command:
        cmd: "pvesh create /nodes/proxmox/lxc/{{ item }}/status/start"
      loop: "{{ tier4_lxcs }}"
      ignore_errors: true
|
|
|
|
# Play: validate the result of the staged startup and tidy up.
# Waits 60 seconds, checks that the expected VM and LXC ids report
# status "running", deletes pre-maintenance-* snapshots whose date
# suffix is more than 7 days old, and prints a summary. Every check is
# best-effort (ignore_errors) so the summary is always displayed.
- name: Post-reboot validation
  hosts: pve-node
  gather_facts: false
  tags: [startup, validate]

  tasks:
    - name: Wait for all services to initialize
      ansible.builtin.pause:
        seconds: 60

    # Literal block (|) so the embedded Python keeps its line breaks and
    # indentation. vmid is coerced with int() so the set comparison works
    # whether the API returns it as a number or as a string.
    # The script exits 1 when any expected VM is not running.
    - name: Check all expected VMs are running
      ansible.builtin.shell: |
        pvesh get /nodes/proxmox/qemu --output-format json |
        python3 -c "
        import sys, json
        vms = json.load(sys.stdin)
        expected = {106, 109, 110, 112, 115, 116}
        running = {int(v['vmid']) for v in vms if v.get('status') == 'running'}
        missing = expected - running
        if missing:
            print(f'WARN: VMs not running: {missing}')
            sys.exit(1)
        print(f'All expected VMs running: {running & expected}')
        "
      register: vm_check
      changed_when: false
      ignore_errors: true

    # LXC 303 (mcp-gateway) intentionally excluded — onboot=0,
    # operator-managed.
    # Same approach as the VM check: literal block, int() coercion,
    # exit 1 when any expected container is not running.
    - name: Check all expected LXCs are running
      ansible.builtin.shell: |
        pvesh get /nodes/proxmox/lxc --output-format json |
        python3 -c "
        import sys, json
        cts = json.load(sys.stdin)
        expected = {210, 221, 222, 223, 225, 227, 301, 302, 304}
        running = {int(c['vmid']) for c in cts if c.get('status') == 'running'}
        missing = expected - running
        if missing:
            print(f'WARN: LXCs not running: {missing}')
            sys.exit(1)
        print(f'All expected LXCs running: {running & expected}')
        "
      register: lxc_check
      changed_when: false
      ignore_errors: true

    # For every guest of each type, list snapshots named
    # pre-maintenance-<date>, parse <date> with `date -d`, and delete
    # those older than the 7-day cutoff. Names whose suffix cannot be
    # parsed are reported and left alone. The task reports "changed"
    # only when at least one snapshot was deleted.
    - name: Clean up old maintenance snapshots (older than 7 days)
      ansible.builtin.shell: |
        cutoff=$(date -d '7 days ago' +%s)

        # prune_snapshots <api-type> <label>
        #   api-type: qemu or lxc (path segment under /nodes/proxmox)
        #   label:    VM or LXC (used in the messages only)
        prune_snapshots() {
          kind=$1
          label=$2
          for guest in $(pvesh get /nodes/proxmox/$kind --output-format json |
              python3 -c "import sys,json; [print(g['vmid']) for g in json.load(sys.stdin)]"); do
            for snap in $(pvesh get /nodes/proxmox/$kind/$guest/snapshot --output-format json |
                python3 -c "import sys,json; [print(s['name']) for s in json.load(sys.stdin) if s['name'].startswith('pre-maintenance-')]" 2>/dev/null); do
              snap_date=${snap#pre-maintenance-}
              snap_epoch=$(date -d "$snap_date" +%s 2>/dev/null)
              if [ -z "$snap_epoch" ]; then
                echo "WARN: could not parse date for snapshot $snap on $label $guest"
              elif [ "$snap_epoch" -lt "$cutoff" ]; then
                pvesh delete /nodes/proxmox/$kind/$guest/snapshot/$snap &&
                  echo "Deleted $snap from $label $guest"
              fi
            done
          done
        }

        prune_snapshots qemu VM
        prune_snapshots lxc LXC
        echo "Snapshot cleanup complete"
      register: snapshot_cleanup
      changed_when: "'Deleted ' in snapshot_cleanup.stdout"
      ignore_errors: true

    - name: Display validation results
      ansible.builtin.debug:
        msg:
          - "VM status: {{ vm_check.stdout }}"
          - "LXC status: {{ lxc_check.stdout }}"
          - "Maintenance reboot complete — post-reboot startup finished"
|