Compare commits
1 Commits
main
...
test/auto-
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
5d1b640b0b |
@ -39,7 +39,7 @@ jobs:
|
||||
|
||||
echo "docs_only=${DOCS_ONLY}" >> "$GITHUB_OUTPUT"
|
||||
|
||||
- name: Auto-merge
|
||||
- name: Approve and merge
|
||||
if: steps.check.outputs.docs_only == 'true'
|
||||
env:
|
||||
GITEA_TOKEN: ${{ secrets.AUTO_MERGE_TOKEN }}
|
||||
@ -47,9 +47,16 @@ jobs:
|
||||
PR_NUMBER=${{ github.event.pull_request.number }}
|
||||
API_BASE="${{ github.server_url }}/api/v1/repos/${{ github.repository }}/pulls/${PR_NUMBER}"
|
||||
|
||||
# Approve the PR
|
||||
curl -sf -X POST "${API_BASE}/reviews" \
|
||||
-H "Authorization: token ${GITEA_TOKEN}" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"event": "APPROVED", "body": "Auto-approved: docs-only changes (all .md files)."}'
|
||||
|
||||
# Merge the PR
|
||||
curl -sf -X POST "${API_BASE}/merge" \
|
||||
-H "Authorization: token ${GITEA_TOKEN}" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"Do": "merge", "merge_message_field": "Auto-merge: docs-only PR #'"${PR_NUMBER}"'"}'
|
||||
|
||||
echo "PR #${PR_NUMBER} auto-merged."
|
||||
echo "PR #${PR_NUMBER} auto-approved and merged."
|
||||
|
||||
@ -1,43 +0,0 @@
|
||||
---
|
||||
# Mask avahi-daemon on all Ubuntu hosts
|
||||
#
|
||||
# Avahi (mDNS/Bonjour) is not needed in a static-IP homelab with Pi-hole DNS.
|
||||
# A kernel busy-loop bug in avahi-daemon was found consuming ~1.7 CPU cores
|
||||
# across 5 VMs. Masking prevents it from ever starting again, surviving reboots.
|
||||
#
|
||||
# Targets: vms + physical (all Ubuntu QEMU VMs and ubuntu-manticore)
|
||||
# Controller: ansible-controller (LXC 304 at 10.10.0.232)
|
||||
#
|
||||
# Usage:
|
||||
# # Dry run
|
||||
# ansible-playbook /opt/ansible/playbooks/mask-avahi.yml --check
|
||||
#
|
||||
# # Test on a single host first
|
||||
# ansible-playbook /opt/ansible/playbooks/mask-avahi.yml --limit discord-bots
|
||||
#
|
||||
# # Roll out to all Ubuntu hosts
|
||||
# ansible-playbook /opt/ansible/playbooks/mask-avahi.yml
|
||||
#
|
||||
# To undo: systemctl unmask avahi-daemon
|
||||
|
||||
- name: Mask avahi-daemon on all Ubuntu hosts
|
||||
hosts: vms:physical
|
||||
become: true
|
||||
|
||||
tasks:
|
||||
- name: Stop avahi-daemon
|
||||
ansible.builtin.systemd:
|
||||
name: avahi-daemon
|
||||
state: stopped
|
||||
ignore_errors: true
|
||||
|
||||
- name: Mask avahi-daemon
|
||||
ansible.builtin.systemd:
|
||||
name: avahi-daemon
|
||||
masked: true
|
||||
|
||||
- name: Verify avahi is masked
|
||||
ansible.builtin.command: systemctl is-enabled avahi-daemon
|
||||
register: avahi_status
|
||||
changed_when: false
|
||||
failed_when: avahi_status.stdout | trim != 'masked'
|
||||
@ -1,265 +0,0 @@
|
||||
---
|
||||
# Monthly Proxmox Maintenance Reboot — Shutdown & Reboot
|
||||
#
|
||||
# Orchestrates a graceful shutdown of all guests in dependency order,
|
||||
# then issues a fire-and-forget reboot to the Proxmox host.
|
||||
#
|
||||
# After the host reboots, LXC 304 auto-starts via onboot:1 and the
|
||||
# post-reboot-startup.yml playbook runs automatically via the
|
||||
# ansible-post-reboot.service systemd unit (triggered by @reboot).
|
||||
#
|
||||
# Schedule: 1st Sunday of each month, 08:00 UTC (3 AM ET)
|
||||
# Controller: LXC 304 (ansible-controller) at 10.10.0.232
|
||||
#
|
||||
# Usage:
|
||||
# # Dry run
|
||||
# ansible-playbook /opt/ansible/playbooks/monthly-reboot.yml --check
|
||||
#
|
||||
# # Full execution
|
||||
# ansible-playbook /opt/ansible/playbooks/monthly-reboot.yml
|
||||
#
|
||||
# # Shutdown only (skip the host reboot)
|
||||
# ansible-playbook /opt/ansible/playbooks/monthly-reboot.yml --tags shutdown
|
||||
#
|
||||
# Note: VM 109 (homeassistant) is excluded from Ansible inventory
|
||||
# (self-managed via HA Supervisor) but is included in pvesh start/stop.
|
||||
|
||||
- name: Pre-reboot health check and snapshots
|
||||
hosts: pve-node
|
||||
gather_facts: false
|
||||
tags: [pre-reboot, shutdown]
|
||||
|
||||
tasks:
|
||||
- name: Check Proxmox cluster health
|
||||
ansible.builtin.command: pvesh get /cluster/status --output-format json
|
||||
register: cluster_status
|
||||
changed_when: false
|
||||
|
||||
- name: Get list of running QEMU VMs
|
||||
ansible.builtin.shell: >
|
||||
pvesh get /nodes/proxmox/qemu --output-format json |
|
||||
python3 -c "import sys,json; [print(vm['vmid']) for vm in json.load(sys.stdin) if vm.get('status')=='running']"
|
||||
register: running_vms
|
||||
changed_when: false
|
||||
|
||||
- name: Get list of running LXC containers
|
||||
ansible.builtin.shell: >
|
||||
pvesh get /nodes/proxmox/lxc --output-format json |
|
||||
python3 -c "import sys,json; [print(ct['vmid']) for ct in json.load(sys.stdin) if ct.get('status')=='running']"
|
||||
register: running_lxcs
|
||||
changed_when: false
|
||||
|
||||
- name: Display running guests
|
||||
ansible.builtin.debug:
|
||||
msg: "Running VMs: {{ running_vms.stdout_lines }} | Running LXCs: {{ running_lxcs.stdout_lines }}"
|
||||
|
||||
- name: Snapshot running VMs
|
||||
ansible.builtin.command: >
|
||||
pvesh create /nodes/proxmox/qemu/{{ item }}/snapshot
|
||||
--snapname pre-maintenance-{{ lookup('pipe', 'date +%Y-%m-%d') }}
|
||||
--description "Auto snapshot before monthly maintenance reboot"
|
||||
loop: "{{ running_vms.stdout_lines }}"
|
||||
when: running_vms.stdout_lines | length > 0
|
||||
ignore_errors: true
|
||||
|
||||
- name: Snapshot running LXCs
|
||||
ansible.builtin.command: >
|
||||
pvesh create /nodes/proxmox/lxc/{{ item }}/snapshot
|
||||
--snapname pre-maintenance-{{ lookup('pipe', 'date +%Y-%m-%d') }}
|
||||
--description "Auto snapshot before monthly maintenance reboot"
|
||||
loop: "{{ running_lxcs.stdout_lines }}"
|
||||
when: running_lxcs.stdout_lines | length > 0
|
||||
ignore_errors: true
|
||||
|
||||
- name: "Shutdown Tier 4 — Media & Others"
|
||||
hosts: pve-node
|
||||
gather_facts: false
|
||||
tags: [shutdown]
|
||||
|
||||
vars:
|
||||
tier4_vms: [109]
|
||||
# LXC 303 (mcp-gateway) is onboot=0 and operator-managed — not included here
|
||||
tier4_lxcs: [221, 222, 223, 302]
|
||||
|
||||
tasks:
|
||||
- name: Shutdown Tier 4 VMs
|
||||
ansible.builtin.command: pvesh create /nodes/proxmox/qemu/{{ item }}/status/shutdown
|
||||
loop: "{{ tier4_vms }}"
|
||||
ignore_errors: true
|
||||
|
||||
- name: Shutdown Tier 4 LXCs
|
||||
ansible.builtin.command: pvesh create /nodes/proxmox/lxc/{{ item }}/status/shutdown
|
||||
loop: "{{ tier4_lxcs }}"
|
||||
ignore_errors: true
|
||||
|
||||
- name: Wait for Tier 4 VMs to stop
|
||||
ansible.builtin.shell: >
|
||||
pvesh get /nodes/proxmox/qemu/{{ item }}/status/current --output-format json |
|
||||
python3 -c "import sys,json; print(json.load(sys.stdin).get('status','unknown'))"
|
||||
register: t4_vm_status
|
||||
until: t4_vm_status.stdout.strip() == "stopped"
|
||||
retries: 12
|
||||
delay: 5
|
||||
loop: "{{ tier4_vms }}"
|
||||
ignore_errors: true
|
||||
|
||||
- name: Wait for Tier 4 LXCs to stop
|
||||
ansible.builtin.shell: >
|
||||
pvesh get /nodes/proxmox/lxc/{{ item }}/status/current --output-format json |
|
||||
python3 -c "import sys,json; print(json.load(sys.stdin).get('status','unknown'))"
|
||||
register: t4_lxc_status
|
||||
until: t4_lxc_status.stdout.strip() == "stopped"
|
||||
retries: 12
|
||||
delay: 5
|
||||
loop: "{{ tier4_lxcs }}"
|
||||
ignore_errors: true
|
||||
|
||||
- name: "Shutdown Tier 3 — Applications"
|
||||
hosts: pve-node
|
||||
gather_facts: false
|
||||
tags: [shutdown]
|
||||
|
||||
vars:
|
||||
tier3_vms: [115, 110]
|
||||
tier3_lxcs: [301]
|
||||
|
||||
tasks:
|
||||
- name: Shutdown Tier 3 VMs
|
||||
ansible.builtin.command: pvesh create /nodes/proxmox/qemu/{{ item }}/status/shutdown
|
||||
loop: "{{ tier3_vms }}"
|
||||
ignore_errors: true
|
||||
|
||||
- name: Shutdown Tier 3 LXCs
|
||||
ansible.builtin.command: pvesh create /nodes/proxmox/lxc/{{ item }}/status/shutdown
|
||||
loop: "{{ tier3_lxcs }}"
|
||||
ignore_errors: true
|
||||
|
||||
- name: Wait for Tier 3 VMs to stop
|
||||
ansible.builtin.shell: >
|
||||
pvesh get /nodes/proxmox/qemu/{{ item }}/status/current --output-format json |
|
||||
python3 -c "import sys,json; print(json.load(sys.stdin).get('status','unknown'))"
|
||||
register: t3_vm_status
|
||||
until: t3_vm_status.stdout.strip() == "stopped"
|
||||
retries: 12
|
||||
delay: 5
|
||||
loop: "{{ tier3_vms }}"
|
||||
ignore_errors: true
|
||||
|
||||
- name: Wait for Tier 3 LXCs to stop
|
||||
ansible.builtin.shell: >
|
||||
pvesh get /nodes/proxmox/lxc/{{ item }}/status/current --output-format json |
|
||||
python3 -c "import sys,json; print(json.load(sys.stdin).get('status','unknown'))"
|
||||
register: t3_lxc_status
|
||||
until: t3_lxc_status.stdout.strip() == "stopped"
|
||||
retries: 12
|
||||
delay: 5
|
||||
loop: "{{ tier3_lxcs }}"
|
||||
ignore_errors: true
|
||||
|
||||
- name: "Shutdown Tier 2 — Infrastructure"
|
||||
hosts: pve-node
|
||||
gather_facts: false
|
||||
tags: [shutdown]
|
||||
|
||||
vars:
|
||||
tier2_vms: [106, 116]
|
||||
tier2_lxcs: [225, 210, 227]
|
||||
|
||||
tasks:
|
||||
- name: Shutdown Tier 2 VMs
|
||||
ansible.builtin.command: pvesh create /nodes/proxmox/qemu/{{ item }}/status/shutdown
|
||||
loop: "{{ tier2_vms }}"
|
||||
ignore_errors: true
|
||||
|
||||
- name: Shutdown Tier 2 LXCs
|
||||
ansible.builtin.command: pvesh create /nodes/proxmox/lxc/{{ item }}/status/shutdown
|
||||
loop: "{{ tier2_lxcs }}"
|
||||
ignore_errors: true
|
||||
|
||||
- name: Wait for Tier 2 VMs to stop
|
||||
ansible.builtin.shell: >
|
||||
pvesh get /nodes/proxmox/qemu/{{ item }}/status/current --output-format json |
|
||||
python3 -c "import sys,json; print(json.load(sys.stdin).get('status','unknown'))"
|
||||
register: t2_vm_status
|
||||
until: t2_vm_status.stdout.strip() == "stopped"
|
||||
retries: 12
|
||||
delay: 5
|
||||
loop: "{{ tier2_vms }}"
|
||||
ignore_errors: true
|
||||
|
||||
- name: Wait for Tier 2 LXCs to stop
|
||||
ansible.builtin.shell: >
|
||||
pvesh get /nodes/proxmox/lxc/{{ item }}/status/current --output-format json |
|
||||
python3 -c "import sys,json; print(json.load(sys.stdin).get('status','unknown'))"
|
||||
register: t2_lxc_status
|
||||
until: t2_lxc_status.stdout.strip() == "stopped"
|
||||
retries: 12
|
||||
delay: 5
|
||||
loop: "{{ tier2_lxcs }}"
|
||||
ignore_errors: true
|
||||
|
||||
- name: "Shutdown Tier 1 — Databases"
|
||||
hosts: pve-node
|
||||
gather_facts: false
|
||||
tags: [shutdown]
|
||||
|
||||
vars:
|
||||
tier1_vms: [112]
|
||||
|
||||
tasks:
|
||||
- name: Shutdown database VMs
|
||||
ansible.builtin.command: pvesh create /nodes/proxmox/qemu/{{ item }}/status/shutdown
|
||||
loop: "{{ tier1_vms }}"
|
||||
ignore_errors: true
|
||||
|
||||
- name: Wait for database VMs to stop (up to 90s)
|
||||
ansible.builtin.shell: >
|
||||
pvesh get /nodes/proxmox/qemu/{{ item }}/status/current --output-format json |
|
||||
python3 -c "import sys,json; print(json.load(sys.stdin).get('status','unknown'))"
|
||||
register: t1_vm_status
|
||||
until: t1_vm_status.stdout.strip() == "stopped"
|
||||
retries: 18
|
||||
delay: 5
|
||||
loop: "{{ tier1_vms }}"
|
||||
ignore_errors: true
|
||||
|
||||
- name: Force stop database VMs if still running
|
||||
ansible.builtin.shell: >
|
||||
status=$(pvesh get /nodes/proxmox/qemu/{{ item }}/status/current --output-format json |
|
||||
python3 -c "import sys,json; print(json.load(sys.stdin).get('status','unknown'))");
|
||||
if [ "$status" = "running" ]; then
|
||||
pvesh create /nodes/proxmox/qemu/{{ item }}/status/stop;
|
||||
echo "Force stopped VM {{ item }}";
|
||||
else
|
||||
echo "VM {{ item }} already stopped";
|
||||
fi
|
||||
loop: "{{ tier1_vms }}"
|
||||
register: force_stop_result
|
||||
changed_when: force_stop_result.results | default([]) | selectattr('stdout', 'defined') | selectattr('stdout', 'search', 'Force stopped') | list | length > 0
|
||||
|
||||
- name: "Verify and reboot Proxmox host"
|
||||
hosts: pve-node
|
||||
gather_facts: false
|
||||
tags: [reboot]
|
||||
|
||||
tasks:
|
||||
- name: Verify all guests are stopped (excluding LXC 304)
|
||||
ansible.builtin.shell: >
|
||||
running_vms=$(pvesh get /nodes/proxmox/qemu --output-format json |
|
||||
python3 -c "import sys,json; vms=[v for v in json.load(sys.stdin) if v.get('status')=='running']; print(len(vms))");
|
||||
running_lxcs=$(pvesh get /nodes/proxmox/lxc --output-format json |
|
||||
python3 -c "import sys,json; cts=[c for c in json.load(sys.stdin) if c.get('status')=='running' and c['vmid'] != 304]; print(len(cts))");
|
||||
echo "Running VMs: $running_vms, Running LXCs: $running_lxcs";
|
||||
if [ "$running_vms" != "0" ] || [ "$running_lxcs" != "0" ]; then exit 1; fi
|
||||
register: verify_stopped
|
||||
|
||||
- name: Issue fire-and-forget reboot (controller will be killed)
|
||||
ansible.builtin.shell: >
|
||||
nohup bash -c 'sleep 10 && reboot' &>/dev/null &
|
||||
echo "Reboot scheduled in 10 seconds"
|
||||
register: reboot_issued
|
||||
when: not ansible_check_mode
|
||||
|
||||
- name: Log reboot issued
|
||||
ansible.builtin.debug:
|
||||
msg: "{{ reboot_issued.stdout }} — Ansible process will terminate when host reboots. Post-reboot startup handled by ansible-post-reboot.service on LXC 304."
|
||||
@ -1,214 +0,0 @@
|
||||
---
|
||||
# Post-Reboot Startup — Controlled Guest Startup After Proxmox Reboot
|
||||
#
|
||||
# Starts all guests in dependency order with staggered delays to avoid
|
||||
# I/O storms. Runs automatically via ansible-post-reboot.service on
|
||||
# LXC 304 after the Proxmox host reboots.
|
||||
#
|
||||
# Can also be run manually:
|
||||
# ansible-playbook /opt/ansible/playbooks/post-reboot-startup.yml
|
||||
#
|
||||
# Note: VM 109 (homeassistant) is excluded from Ansible inventory
|
||||
# (self-managed via HA Supervisor) but is included in pvesh start/stop.
|
||||
|
||||
- name: Wait for Proxmox API to be ready
|
||||
hosts: pve-node
|
||||
gather_facts: false
|
||||
tags: [startup]
|
||||
|
||||
tasks:
|
||||
- name: Wait for Proxmox API
|
||||
ansible.builtin.command: pvesh get /version --output-format json
|
||||
register: pve_version
|
||||
until: pve_version.rc == 0
|
||||
retries: 30
|
||||
delay: 10
|
||||
changed_when: false
|
||||
|
||||
- name: Display Proxmox version
|
||||
ansible.builtin.debug:
|
||||
msg: "Proxmox API ready: {{ pve_version.stdout | from_json | json_query('version') | default('unknown') }}"
|
||||
|
||||
- name: "Startup Tier 1 — Databases"
|
||||
hosts: pve-node
|
||||
gather_facts: false
|
||||
tags: [startup]
|
||||
|
||||
tasks:
|
||||
- name: Start database VM (112)
|
||||
ansible.builtin.command: pvesh create /nodes/proxmox/qemu/112/status/start
|
||||
ignore_errors: true
|
||||
|
||||
- name: Wait for VM 112 to be running
|
||||
ansible.builtin.shell: >
|
||||
pvesh get /nodes/proxmox/qemu/112/status/current --output-format json |
|
||||
python3 -c "import sys,json; print(json.load(sys.stdin).get('status','unknown'))"
|
||||
register: db_status
|
||||
until: db_status.stdout.strip() == "running"
|
||||
retries: 12
|
||||
delay: 5
|
||||
changed_when: false
|
||||
|
||||
- name: Wait for database services to initialize
|
||||
ansible.builtin.pause:
|
||||
seconds: 30
|
||||
|
||||
- name: "Startup Tier 2 — Infrastructure"
|
||||
hosts: pve-node
|
||||
gather_facts: false
|
||||
tags: [startup]
|
||||
|
||||
vars:
|
||||
tier2_vms: [106, 116]
|
||||
tier2_lxcs: [225, 210, 227]
|
||||
|
||||
tasks:
|
||||
- name: Start Tier 2 VMs
|
||||
ansible.builtin.command: pvesh create /nodes/proxmox/qemu/{{ item }}/status/start
|
||||
loop: "{{ tier2_vms }}"
|
||||
ignore_errors: true
|
||||
|
||||
- name: Start Tier 2 LXCs
|
||||
ansible.builtin.command: pvesh create /nodes/proxmox/lxc/{{ item }}/status/start
|
||||
loop: "{{ tier2_lxcs }}"
|
||||
ignore_errors: true
|
||||
|
||||
- name: Wait for infrastructure to come up
|
||||
ansible.builtin.pause:
|
||||
seconds: 30
|
||||
|
||||
- name: "Startup Tier 3 — Applications"
|
||||
hosts: pve-node
|
||||
gather_facts: false
|
||||
tags: [startup]
|
||||
|
||||
vars:
|
||||
tier3_vms: [115, 110]
|
||||
tier3_lxcs: [301]
|
||||
|
||||
tasks:
|
||||
- name: Start Tier 3 VMs
|
||||
ansible.builtin.command: pvesh create /nodes/proxmox/qemu/{{ item }}/status/start
|
||||
loop: "{{ tier3_vms }}"
|
||||
ignore_errors: true
|
||||
|
||||
- name: Start Tier 3 LXCs
|
||||
ansible.builtin.command: pvesh create /nodes/proxmox/lxc/{{ item }}/status/start
|
||||
loop: "{{ tier3_lxcs }}"
|
||||
ignore_errors: true
|
||||
|
||||
- name: Wait for applications to start
|
||||
ansible.builtin.pause:
|
||||
seconds: 30
|
||||
|
||||
- name: Restart Pi-hole container via SSH (UDP DNS fix)
|
||||
ansible.builtin.command: ssh docker-home "docker restart pihole"
|
||||
ignore_errors: true
|
||||
|
||||
- name: Wait for Pi-hole to stabilize
|
||||
ansible.builtin.pause:
|
||||
seconds: 10
|
||||
|
||||
- name: "Startup Tier 4 — Media & Others"
|
||||
hosts: pve-node
|
||||
gather_facts: false
|
||||
tags: [startup]
|
||||
|
||||
vars:
|
||||
tier4_vms: [109]
|
||||
tier4_lxcs: [221, 222, 223, 302]
|
||||
|
||||
tasks:
|
||||
- name: Start Tier 4 VMs
|
||||
ansible.builtin.command: pvesh create /nodes/proxmox/qemu/{{ item }}/status/start
|
||||
loop: "{{ tier4_vms }}"
|
||||
ignore_errors: true
|
||||
|
||||
- name: Start Tier 4 LXCs
|
||||
ansible.builtin.command: pvesh create /nodes/proxmox/lxc/{{ item }}/status/start
|
||||
loop: "{{ tier4_lxcs }}"
|
||||
ignore_errors: true
|
||||
|
||||
- name: Post-reboot validation
|
||||
hosts: pve-node
|
||||
gather_facts: false
|
||||
tags: [startup, validate]
|
||||
|
||||
tasks:
|
||||
- name: Wait for all services to initialize
|
||||
ansible.builtin.pause:
|
||||
seconds: 60
|
||||
|
||||
- name: Check all expected VMs are running
|
||||
ansible.builtin.shell: >
|
||||
pvesh get /nodes/proxmox/qemu --output-format json |
|
||||
python3 -c "
|
||||
import sys, json
|
||||
vms = json.load(sys.stdin)
|
||||
expected = {106, 109, 110, 112, 115, 116}
|
||||
running = {v['vmid'] for v in vms if v.get('status') == 'running'}
|
||||
missing = expected - running
|
||||
if missing:
|
||||
print(f'WARN: VMs not running: {missing}')
|
||||
sys.exit(1)
|
||||
print(f'All expected VMs running: {running & expected}')
|
||||
"
|
||||
register: vm_check
|
||||
ignore_errors: true
|
||||
|
||||
- name: Check all expected LXCs are running
|
||||
ansible.builtin.shell: >
|
||||
pvesh get /nodes/proxmox/lxc --output-format json |
|
||||
python3 -c "
|
||||
import sys, json
|
||||
cts = json.load(sys.stdin)
|
||||
# LXC 303 (mcp-gateway) intentionally excluded — onboot=0, operator-managed
|
||||
expected = {210, 221, 222, 223, 225, 227, 301, 302, 304}
|
||||
running = {c['vmid'] for c in cts if c.get('status') == 'running'}
|
||||
missing = expected - running
|
||||
if missing:
|
||||
print(f'WARN: LXCs not running: {missing}')
|
||||
sys.exit(1)
|
||||
print(f'All expected LXCs running: {running & expected}')
|
||||
"
|
||||
register: lxc_check
|
||||
ignore_errors: true
|
||||
|
||||
- name: Clean up old maintenance snapshots (older than 7 days)
|
||||
ansible.builtin.shell: >
|
||||
cutoff=$(date -d '7 days ago' +%s);
|
||||
for vmid in $(pvesh get /nodes/proxmox/qemu --output-format json |
|
||||
python3 -c "import sys,json; [print(v['vmid']) for v in json.load(sys.stdin)]"); do
|
||||
for snap in $(pvesh get /nodes/proxmox/qemu/$vmid/snapshot --output-format json |
|
||||
python3 -c "import sys,json; [print(s['name']) for s in json.load(sys.stdin) if s['name'].startswith('pre-maintenance-')]" 2>/dev/null); do
|
||||
snap_date=$(echo $snap | sed 's/pre-maintenance-//');
|
||||
snap_epoch=$(date -d "$snap_date" +%s 2>/dev/null);
|
||||
if [ -z "$snap_epoch" ]; then
|
||||
echo "WARN: could not parse date for snapshot $snap on VM $vmid";
|
||||
elif [ "$snap_epoch" -lt "$cutoff" ]; then
|
||||
pvesh delete /nodes/proxmox/qemu/$vmid/snapshot/$snap && echo "Deleted $snap from VM $vmid";
|
||||
fi
|
||||
done
|
||||
done;
|
||||
for ctid in $(pvesh get /nodes/proxmox/lxc --output-format json |
|
||||
python3 -c "import sys,json; [print(c['vmid']) for c in json.load(sys.stdin)]"); do
|
||||
for snap in $(pvesh get /nodes/proxmox/lxc/$ctid/snapshot --output-format json |
|
||||
python3 -c "import sys,json; [print(s['name']) for s in json.load(sys.stdin) if s['name'].startswith('pre-maintenance-')]" 2>/dev/null); do
|
||||
snap_date=$(echo $snap | sed 's/pre-maintenance-//');
|
||||
snap_epoch=$(date -d "$snap_date" +%s 2>/dev/null);
|
||||
if [ -z "$snap_epoch" ]; then
|
||||
echo "WARN: could not parse date for snapshot $snap on LXC $ctid";
|
||||
elif [ "$snap_epoch" -lt "$cutoff" ]; then
|
||||
pvesh delete /nodes/proxmox/lxc/$ctid/snapshot/$snap && echo "Deleted $snap from LXC $ctid";
|
||||
fi
|
||||
done
|
||||
done;
|
||||
echo "Snapshot cleanup complete"
|
||||
ignore_errors: true
|
||||
|
||||
- name: Display validation results
|
||||
ansible.builtin.debug:
|
||||
msg:
|
||||
- "VM status: {{ vm_check.stdout }}"
|
||||
- "LXC status: {{ lxc_check.stdout }}"
|
||||
- "Maintenance reboot complete — post-reboot startup finished"
|
||||
@ -1,15 +0,0 @@
|
||||
[Unit]
|
||||
Description=Monthly Proxmox maintenance reboot (Ansible)
|
||||
After=network-online.target
|
||||
Wants=network-online.target
|
||||
|
||||
[Service]
|
||||
Type=oneshot
|
||||
User=cal
|
||||
WorkingDirectory=/opt/ansible
|
||||
ExecStart=/usr/bin/ansible-playbook /opt/ansible/playbooks/monthly-reboot.yml
|
||||
StandardOutput=append:/opt/ansible/logs/monthly-reboot.log
|
||||
StandardError=append:/opt/ansible/logs/monthly-reboot.log
|
||||
TimeoutStartSec=900
|
||||
|
||||
# No [Install] section — this service is activated exclusively by ansible-monthly-reboot.timer
|
||||
@ -1,13 +0,0 @@
|
||||
[Unit]
|
||||
Description=Monthly Proxmox maintenance reboot timer
|
||||
Documentation=https://git.manticorum.com/cal/claude-home/src/branch/main/server-configs/proxmox/maintenance-reboot.md
|
||||
|
||||
[Timer]
|
||||
# First Sunday of the month at 08:00 UTC (3:00 AM ET during EDT)
|
||||
# Day range 01-07 ensures it's always the first occurrence of that weekday
|
||||
OnCalendar=Sun *-*-01..07 08:00:00
|
||||
Persistent=true
|
||||
RandomizedDelaySec=600
|
||||
|
||||
[Install]
|
||||
WantedBy=timers.target
|
||||
@ -1,21 +0,0 @@
|
||||
[Unit]
|
||||
Description=Post-reboot controlled guest startup (Ansible)
|
||||
After=network-online.target
|
||||
Wants=network-online.target
|
||||
# Only run after a fresh boot — not on service restart
|
||||
ConditionUpTimeSec=600
|
||||
|
||||
[Service]
|
||||
Type=oneshot
|
||||
User=cal
|
||||
WorkingDirectory=/opt/ansible
|
||||
# Delay 120s to let Proxmox API stabilize and onboot guests settle
|
||||
ExecStartPre=/bin/sleep 120
|
||||
ExecStart=/usr/bin/ansible-playbook /opt/ansible/playbooks/post-reboot-startup.yml
|
||||
StandardOutput=append:/opt/ansible/logs/post-reboot-startup.log
|
||||
StandardError=append:/opt/ansible/logs/post-reboot-startup.log
|
||||
TimeoutStartSec=1800
|
||||
|
||||
[Install]
|
||||
# Runs automatically on every boot of LXC 304
|
||||
WantedBy=multi-user.target
|
||||
@ -1,62 +0,0 @@
|
||||
---
|
||||
title: "ESB-56: MDR Object Handler Simplification & Cleanup"
|
||||
description: "Code review and cleanup of MDR (Master Data Registry) object handler integration — added client singletons, hardened SQL, cached env vars, and filed follow-up issues."
|
||||
type: context
|
||||
domain: development
|
||||
tags: [esb-monorepo, python, cloud-functions, mdr, code-review, refactoring, gitea]
|
||||
---
|
||||
|
||||
# ESB-56: MDR Object Handler Simplification & Cleanup
|
||||
|
||||
**Date:** 2026-03-20
|
||||
**Branch:** `feature/ESB-56-mdr-object-handler`
|
||||
**Repo:** esb-monorepo
|
||||
|
||||
## What Was Done
|
||||
|
||||
Ran a three-agent parallel code review (reuse, quality, efficiency) on the MDR-to-object-handler integration, then applied fixes directly.
|
||||
|
||||
### Fixes Applied (commit `13f78f3`)
|
||||
|
||||
1. **Pub/Sub client singleton** — `output_utils.publish_to_pubsub` was creating a new `PublisherClient` per call. Added module-level singleton matching the pattern already used in `mdr-api/event_publisher.py`.
|
||||
|
||||
2. **Firestore client singleton** — `write_to_firestore` was creating a new `firestore.Client` (gRPC channel + auth) per call. Added keyed singleton dict `_firestore_clients` by `(project_id, database)`.
|
||||
|
||||
3. **SQL injection hardening** — `write_to_cloudsql` built the Postgres function name via f-string (`f"data.fn_upsert_{object_type.lower()}"`). While guarded by an allowlist check, the pattern is fragile. Replaced with a pre-built lookup dict `_UPSERT_FUNC_MAP` so SQL identifiers are never derived by string interpolation.
|
||||
|
||||
4. **Removed verbose payload logging** — `logging.info(f"Silver data: {silver_data}")` logged full business object payloads at INFO level. Replaced with identity-only log line.
|
||||
|
||||
5. **Dead code removal** — Two `if not project_id` guards in `send_to_downstream()` could never trigger because `os.environ.get("GCP_PROJECT_ID", "amenity-integrat-dev-svc")` always returns a non-None default.
|
||||
|
||||
6. **Env var caching** — `_PROJECT_ID`, `_FIRESTORE_ENABLED`, `_CLOUDSQL_ENABLED` moved to module-level constants instead of being read from `os.environ` on every request.
|
||||
|
||||
7. **Double `list_sources()` call** — `process_object_request` called `registry.list_sources()` twice (once for membership check, once for the error response). Assigned to local variable.
|
||||
|
||||
8. **Test singleton isolation** — Added `autouse` pytest fixture to reset module-level singletons between tests, fixing 3 test failures caused by cross-test state leakage.
|
||||
|
||||
### Decision: No Connection Pool for CloudSQL
|
||||
|
||||
Discussed whether a SQLAlchemy connection pool made sense. Decided **no** because:
|
||||
- Cloud Run functions default to single concurrent request per instance
|
||||
- A pool of size 1 just adds wrapper overhead
|
||||
- The `Connector` singleton (which caches IAM auth) is already the expensive part, and it's cached
|
||||
- Pool would only matter with concurrency > 1, which would need thread-safety work first
|
||||
|
||||
## Follow-Up Issues Created (Gitea)
|
||||
|
||||
| # | Title | Priority |
|
||||
|---|-------|----------|
|
||||
| [#1](https://git.manticorum.com/cal/esb-monorepo/issues/1) | Extract shared Pub/Sub publisher to py-esb-integrations | High |
|
||||
| [#2](https://git.manticorum.com/cal/esb-monorepo/issues/2) | Extract shared CloudSQL connector to py-esb-integrations | High |
|
||||
| [#3](https://git.manticorum.com/cal/esb-monorepo/issues/3) | Reduce duplication in MDR API route handlers | Medium |
|
||||
| [#4](https://git.manticorum.com/cal/esb-monorepo/issues/4) | Parallelize independent downstream calls | Low |
|
||||
|
||||
### Key Finding: Three-Way Duplication
|
||||
|
||||
Both Pub/Sub publishing and CloudSQL connection logic are copy-pasted identically across three functions (`object-handler`, `outbound-event-handler`, `outbound-object-router`). The shared `packages/py-esb-integrations` package has no GCP utility module yet — that's the natural home for extraction.
|
||||
|
||||
## Files Changed
|
||||
|
||||
- `functions/tac/object-handler/main.py` — env var caching, dead code removal, list_sources dedup
|
||||
- `functions/tac/object-handler/output_utils.py` — singletons, SQL hardening, logging cleanup
|
||||
- `functions/tac/object-handler/tests/test_output_utils.py` — singleton reset fixture
|
||||
@ -1,95 +0,0 @@
|
||||
---
|
||||
title: "ACE-Step 1.5 — Local Network Setup Guide"
|
||||
description: "How to run ACE-Step AI music generator on the local network via Gradio UI or REST API, including .env configuration and startup notes."
|
||||
type: guide
|
||||
domain: development
|
||||
tags: [ace-step, ai, music-generation, gradio, gpu, cuda]
|
||||
---
|
||||
|
||||
# ACE-Step 1.5 — Local Network Setup
|
||||
|
||||
ACE-Step is an open-source AI music generation model. This guide covers running it on the workstation and serving the Gradio web UI to the local network.
|
||||
|
||||
## Location
|
||||
|
||||
```
|
||||
/mnt/NV2/Development/ACE-Step-1.5/
|
||||
```
|
||||
|
||||
Cloned from GitHub. Uses `uv` for dependency management — the `.venv` is created automatically on first run.
|
||||
|
||||
## Quick Start (Gradio UI)
|
||||
|
||||
```bash
|
||||
cd /mnt/NV2/Development/ACE-Step-1.5
|
||||
./start_gradio_ui.sh
|
||||
```
|
||||
|
||||
Accessible from any device on the network at **http://10.10.0.41:7860** (or whatever the workstation IP is).
|
||||
|
||||
## .env Configuration
|
||||
|
||||
The `.env` file in the project root persists settings across git updates. Current config:
|
||||
|
||||
```env
|
||||
SERVER_NAME=0.0.0.0
|
||||
PORT=7860
|
||||
LANGUAGE=en
|
||||
```
|
||||
|
||||
### Key Settings
|
||||
|
||||
| Variable | Default | Description |
|
||||
|----------|---------|-------------|
|
||||
| `SERVER_NAME` | `127.0.0.1` | Set to `0.0.0.0` for LAN access |
|
||||
| `PORT` | `7860` | Gradio UI port |
|
||||
| `LANGUAGE` | `en` | UI language (`en`, `zh`, `he`, `ja`). **Must be set** — empty value causes `unbound variable` error with the launcher's `set -u` |
|
||||
| `ACESTEP_CONFIG_PATH` | `acestep-v15-turbo` | DiT model variant |
|
||||
| `ACESTEP_LM_MODEL_PATH` | `acestep-5Hz-lm-0.6B` | Language model for lyrics/prompts |
|
||||
| `ACESTEP_INIT_LLM` | `auto` | `auto` / `true` / `false` — auto detects based on VRAM |
|
||||
| `CHECK_UPDATE` | `true` | Set to `false` to skip interactive update prompt (useful for background/automated starts) |
|
||||
|
||||
See `.env.example` for the full list.
|
||||
|
||||
## REST API Server (Alternative)
|
||||
|
||||
For programmatic access instead of the web UI:
|
||||
|
||||
```bash
|
||||
cd /mnt/NV2/Development/ACE-Step-1.5
|
||||
./start_api_server.sh
|
||||
```
|
||||
|
||||
Default: `http://127.0.0.1:8001`. To serve on LAN, edit `start_api_server.sh` line 12:
|
||||
|
||||
```bash
|
||||
HOST="0.0.0.0"
|
||||
```
|
||||
|
||||
API docs available at `http://<ip>:8001/docs`.
|
||||
|
||||
## Hardware Profile (Workstation)
|
||||
|
||||
- **GPU**: NVIDIA RTX 4080 SUPER (16 GB VRAM)
|
||||
- **Tier**: 16GB class — auto-enables CPU offload, INT8 quantization, LLM
|
||||
- **Max batch (with LM)**: 4
|
||||
- **Max batch (without LM)**: 8
|
||||
- **Max duration (with LM)**: 480s (8 min)
|
||||
- **Max duration (without LM)**: 600s (10 min)
|
||||
|
||||
## Startup Behavior
|
||||
|
||||
1. Loads `.env` configuration
|
||||
2. Checks for git updates (interactive prompt — set `CHECK_UPDATE=false` to skip)
|
||||
3. Creates `.venv` via `uv sync` if missing (slow on first run)
|
||||
4. Runs legacy NVIDIA torch compatibility check
|
||||
5. Loads DiT model → quantizes to INT8 → loads LM → allocates KV cache
|
||||
6. Launches Gradio with queue for multi-user support
|
||||
|
||||
Full startup takes ~30-40 seconds after first run.
|
||||
|
||||
## Gotchas
|
||||
|
||||
- **LANGUAGE must be set in `.env`**: The system `$LANGUAGE` locale variable can be empty, causing the launcher to crash with `unbound variable` due to `set -u`. Always include `LANGUAGE=en` in `.env`.
|
||||
- **Update prompt blocks background execution**: If running headlessly or from a script, set `CHECK_UPDATE=false` to avoid the interactive Y/N prompt.
|
||||
- **Model downloads**: First run downloads ~4-5 GB of model weights from HuggingFace. Subsequent runs use cached checkpoints in `./checkpoints/`.
|
||||
@ -72,24 +72,12 @@ Each plugin lives under `plugins/<name>/` with this layout:
|
||||
plugins/<name>/
|
||||
.claude-plugin/
|
||||
plugin.json # name, description, version
|
||||
commands/ # user-facing slash commands (preferred)
|
||||
<verb>.md # shows as /plugin-name:verb in autocomplete
|
||||
skills/<verb>/ # alternative to commands (legacy)
|
||||
skills/<name>/ # for skill plugins
|
||||
SKILL.md # skill definition with frontmatter
|
||||
scripts/ # helper scripts referenced by commands/skills
|
||||
tool.py
|
||||
agents/ # for agent plugins
|
||||
<name>.md # agent definition
|
||||
```
|
||||
|
||||
### Commands vs Skills
|
||||
|
||||
**Commands** (`.md` files in `commands/`) are preferred for user-facing slash commands. They appear in autocomplete as `/plugin-name:command-name`, giving a clean grouped prefix (e.g., `/json-pretty:format`).
|
||||
|
||||
**Skills** (`SKILL.md` in `skills/<name>/`) are the older pattern. They work but don't group as cleanly in autocomplete.
|
||||
|
||||
Commands reference scripts via `${CLAUDE_PLUGIN_ROOT}/scripts/` — no symlinks needed.
|
||||
|
||||
### plugin.json
|
||||
|
||||
```json
|
||||
@ -100,16 +88,7 @@ Commands reference scripts via `${CLAUDE_PLUGIN_ROOT}/scripts/` — no symlinks
|
||||
}
|
||||
```
|
||||
|
||||
### Command Frontmatter
|
||||
|
||||
```yaml
|
||||
---
|
||||
description: "What this command does"
|
||||
allowed-tools: Bash
|
||||
---
|
||||
```
|
||||
|
||||
### SKILL.md Frontmatter (legacy)
|
||||
### SKILL.md Frontmatter
|
||||
|
||||
```yaml
|
||||
---
|
||||
@ -144,7 +123,7 @@ Then update the cache: `claude plugin marketplace update cal-claude-plugins`
|
||||
|
||||
## Important Notes
|
||||
|
||||
- **Commands use `${CLAUDE_PLUGIN_ROOT}`** — reference scripts via `${CLAUDE_PLUGIN_ROOT}/scripts/` instead of absolute paths or symlinks
|
||||
- **SKILL.md `name:` must match directory name** — a mismatch causes the skill to not load (e.g., the `optimise-claude` plugin had `name: claude-optimised` which broke it)
|
||||
- **SSH remote** — uses `git@git.manticorum.com:cal/claude-plugins.git` via SSH config alias (`~/.ssh/config` maps `git.manticorum.com` → `10.10.0.225` as user `git`)
|
||||
- **No secrets** — repo is public; plugins are just prompt definitions, no credentials
|
||||
- **Plugins requiring MCP servers** — `backlog`, `issue-worker`, and `pr-reviewer` require `gitea-mcp`; they install fine without it but those tool calls won't execute
|
||||
|
||||
@ -1,116 +0,0 @@
|
||||
---
|
||||
title: "Fix: Docker buildx cache 400 error — migrated to local volume cache"
|
||||
description: "Registry buildx cache caused 400 errors; permanent fix is local volume cache on the Gitea Actions runner."
|
||||
type: troubleshooting
|
||||
domain: development
|
||||
tags: [troubleshooting, docker, gitea, ci]
|
||||
---
|
||||
|
||||
# Fix: Docker buildx cache 400 error on CI builds
|
||||
|
||||
**Date:** 2026-03-23
|
||||
**Severity:** Medium — blocks CI/CD Docker image builds, requires manual intervention to retrigger
|
||||
|
||||
## Problem
|
||||
|
||||
Gitea Actions Docker build workflow fails at the "exporting cache to registry" step with:
|
||||
|
||||
```
|
||||
error writing layer blob: failed to copy: unexpected status from PUT request to
|
||||
https://registry-1.docker.io/v2/.../blobs/uploads/...: 400 Bad request
|
||||
```
|
||||
|
||||
The image never gets pushed to Docker Hub. Seen on both Paper Dynasty and Major Domo repos.
|
||||
|
||||
## Root Cause
|
||||
|
||||
Stale `buildx_buildkit_builder-*` containers accumulate on the Gitea Actions runner host. Each CI build creates a new buildx builder instance but doesn't always clean up. Over time, these stale builders corrupt the registry cache state, causing Docker Hub to reject cache export PUT requests with 400.
|
||||
|
||||
## Fix
|
||||
|
||||
Kill all stale buildx builder containers on the runner, then retrigger the build:
|
||||
|
||||
```bash
|
||||
# Kill stale builders
|
||||
ssh gitea "docker rm -f \$(docker ps -a --format '{{.Names}}' | grep buildx_buildkit_builder)"
|
||||
|
||||
# Retrigger by deleting and re-pushing the tag
|
||||
git push origin :refs/tags/<tag> && git push origin <tag>
|
||||
```
|
||||
|
||||
## Lessons
|
||||
|
||||
- `type=registry` cache is unreliable on a single-runner setup — stale builders accumulate and corrupt cache state
|
||||
- Killing stale builders is a temporary fix only
|
||||
|
||||
---
|
||||
|
||||
## Permanent Fix: Local Volume Buildx Cache (2026-03-24)
|
||||
|
||||
**Severity:** N/A — preventive infrastructure change
|
||||
|
||||
**Problem:** The `type=registry` cache kept failing with 400 errors. Killing stale builders was a manual band-aid.
|
||||
|
||||
**Root Cause:** Each CI build creates a new buildx builder container. On a single persistent runner (`gitea/act_runner`, `--restart unless-stopped`), these accumulate and corrupt the Docker Hub registry cache.
|
||||
|
||||
**Fix:** Switched all workflows from `type=registry` to `type=local` backed by a named Docker volume.
|
||||
|
||||
### Setup (one-time, on gitea runner host)
|
||||
|
||||
```bash
|
||||
# Create named volume
|
||||
docker volume create pd-buildx-cache
|
||||
|
||||
# Update /etc/gitea/runner-config.yaml
|
||||
# valid_volumes:
|
||||
# - pd-buildx-cache
|
||||
|
||||
# Recreate runner container with new volume mount
|
||||
docker run -d --name gitea-runner --restart unless-stopped \
|
||||
-v /etc/gitea/runner-config.yaml:/config.yaml:ro \
|
||||
-v /var/run/docker.sock:/var/run/docker.sock \
|
||||
-v gitea-runner-data:/data \
|
||||
-v pd-buildx-cache:/opt/buildx-cache \
|
||||
gitea/act_runner:latest
|
||||
```
|
||||
|
||||
### Workflow changes
|
||||
|
||||
1. Add `container.volumes` to mount the named volume into job containers:
|
||||
```yaml
|
||||
jobs:
|
||||
build:
|
||||
runs-on: ubuntu-latest
|
||||
container:
|
||||
volumes:
|
||||
- pd-buildx-cache:/opt/buildx-cache
|
||||
```
|
||||
|
||||
2. Replace cache directives (each repo uses its own subdirectory):
|
||||
```yaml
|
||||
cache-from: type=local,src=/opt/buildx-cache/<repo-name>
|
||||
cache-to: type=local,dest=/opt/buildx-cache/<repo-name>-new,mode=max
|
||||
```
|
||||
|
||||
3. Add cache rotation step (prevents unbounded growth):
|
||||
```yaml
|
||||
- name: Rotate cache
|
||||
run: |
|
||||
rm -rf /opt/buildx-cache/<repo-name>
|
||||
mv /opt/buildx-cache/<repo-name>-new /opt/buildx-cache/<repo-name>
|
||||
```
|
||||
|
||||
### Key details
|
||||
|
||||
- `type=gha` does NOT work on Gitea act_runner (requires GitHub's cache service API)
|
||||
- Named volumes (not bind mounts) are required because job containers are sibling containers spawned via Docker socket
|
||||
- `mode=max` caches all intermediate layers, not just final — important for multi-stage builds
|
||||
- First build after migration is cold; subsequent builds hit local cache
|
||||
- Cache size is bounded by the rotation step (~200-600MB per repo)
|
||||
- Applied to: Paper Dynasty database, Paper Dynasty discord. Major Domo repos still use registry cache (follow-up)
|
||||
|
||||
### Repos using local cache
|
||||
| Repo | Cache subdirectory |
|
||||
|---|---|
|
||||
| paper-dynasty-database | `/opt/buildx-cache/pd-database` |
|
||||
| paper-dynasty-discord | `/opt/buildx-cache/pd-discord` |
|
||||
@ -1,33 +0,0 @@
|
||||
---
|
||||
title: "Cookbook Release — 2026.3.21"
|
||||
description: "Initial project setup: recipe format, 5 starter recipes, and interactive grocery list generator."
|
||||
type: reference
|
||||
domain: development
|
||||
tags: [release-notes, deployment, cookbook]
|
||||
---
|
||||
|
||||
# Cookbook Release — 2026.3.21
|
||||
|
||||
**Date:** 2026-03-21
|
||||
**Repo:** `cal/cookbook` on git.manticorum.com
|
||||
|
||||
## Release Summary
|
||||
|
||||
New personal cookbook project using markdown files with YAML frontmatter. Designed around Cal's preference for low-effort meals and short ingredient lists. Includes an interactive grocery list generator served as a single HTML page.
|
||||
|
||||
## Changes
|
||||
|
||||
### New Features
|
||||
|
||||
- **Recipe format** — markdown with YAML frontmatter using three-tier ingredient system: `ingredients` (must buy), `optional` (nice to have), `pantry` (already on hand)
|
||||
- **5 starter recipes** — spaghetti, chicken stir fry, breakfast for dinner, boxed pancakes, upgraded ramen
|
||||
- **Grocery list generator** (`grocery.html`) — single-page HTML app that combines ingredients across selected recipes, renders interactive checklist with localStorage persistence, mobile-friendly
|
||||
- **Serve script** (`serve.sh`) — quick bash script to spin up HTTP server for network access to the grocery list
|
||||
- **Dietary tagging** — `contains-lactose` tag rule for household lactose intolerance; recipes with dairy allowed but must be tagged
|
||||
|
||||
### Format Decisions
|
||||
|
||||
- `effort` field (low/medium/high) for filtering by cooking motivation
|
||||
- `optional` section keeps core ingredient list short while preserving variation ideas
|
||||
- `pantry` section separates staples (salt, pepper, oil) from shopping items
|
||||
- Kebab-case filenames, concise step instructions
|
||||
@ -1,80 +0,0 @@
|
||||
---
|
||||
title: "Fix: Subagent Write/Edit tools blocked by permission mode mismatch"
|
||||
description: "Claude Code subagents cannot use Write or Edit tools unless spawned with mode: acceptEdits — other permission modes (dontAsk, auto, bypassPermissions) do not grant file-write capability."
|
||||
type: troubleshooting
|
||||
domain: development
|
||||
tags: [troubleshooting, claude-code, permissions, agents, subagents]
|
||||
---
|
||||
|
||||
# Fix: Subagent Write/Edit tools blocked by permission mode mismatch
|
||||
|
||||
**Date:** 2026-03-28
|
||||
**Severity:** Medium — blocks all agent-driven code generation workflows until identified
|
||||
|
||||
## Problem
|
||||
|
||||
When orchestrating multi-agent code generation (spawning engineer agents to write code in parallel), all subagents could Read/Glob/Grep files but Write and Edit tool calls were silently denied. Agents would complete their analysis, prepare the full file content, then report "blocked on Write/Edit permission."
|
||||
|
||||
This happened across **every** permission mode tried:
|
||||
- `mode: bypassPermissions` — denied (with worktree isolation)
|
||||
- `mode: auto` — denied (with and without worktree isolation)
|
||||
- `mode: dontAsk` — denied (with and without worktree isolation)
|
||||
|
||||
## Root Cause
|
||||
|
||||
Claude Code's Agent tool has multiple permission modes that control different things:
|
||||
|
||||
| Mode | What it controls | Grants Write/Edit? |
|
||||
|------|-----------------|-------------------|
|
||||
| `default` | User prompted for each tool call | No — user must approve each |
|
||||
| `dontAsk` | Suppresses user prompts | **No** — suppresses prompts but doesn't grant capability |
|
||||
| `auto` | Auto-approves based on context | **No** — same issue |
|
||||
| `bypassPermissions` | Skips permission-manager hooks | **No** — only bypasses plugin hooks, not tool-level gates |
|
||||
| `acceptEdits` | Grants file modification capability | **Yes** — this is the correct mode |
|
||||
|
||||
The key distinction: `dontAsk`/`auto`/`bypassPermissions` control the **user-facing permission prompt** (whether the user gets asked to approve). But Write/Edit tools have an **internal capability gate** that checks whether the agent was explicitly authorized to modify files. Only `acceptEdits` provides that authorization.
|
||||
|
||||
## Additional Complication: permission-manager plugin
|
||||
|
||||
The `permission-manager@agent-toolkit` plugin (`cmd-gate` PreToolUse hook) adds a second layer that blocks Bash-based file writes (output redirection `>`, `tee`, etc.). When agents fell back to Bash after Write/Edit failed, the plugin caught those too.
|
||||
|
||||
- `bypassPermissions` mode is documented to skip cmd-gate entirely, but this didn't work reliably in worktree isolation
|
||||
- Disabling the plugin (`/plugin` → toggle off `permission-manager@agent-toolkit`, then `/reload-plugins`) removed the Bash-level blocks but did NOT fix Write/Edit
|
||||
|
||||
## Fix
|
||||
|
||||
**Use `mode: acceptEdits`** when spawning any agent that needs to create or modify files:
|
||||
|
||||
```
|
||||
Agent(
|
||||
subagent_type="engineer",
|
||||
mode="acceptEdits", # <-- This is the critical setting
|
||||
prompt="..."
|
||||
)
|
||||
```
|
||||
|
||||
**Additional recommendations:**
|
||||
- Worktree isolation (`isolation: "worktree"`) may compound permission issues — avoid it unless the agents genuinely need isolation (e.g., conflicting file edits)
|
||||
- For agents that only read (reviewers, validators), any mode works
|
||||
- If the permission-manager plugin is also blocking Bash fallbacks, disable it temporarily or add classifiers for the specific commands needed
|
||||
|
||||
## Reproduction
|
||||
|
||||
1. Spawn an engineer agent with `mode: dontAsk` and a prompt to create a new file
|
||||
2. Agent will Read reference files successfully, prepare content, then report Write tool denied
|
||||
3. Change to `mode: acceptEdits` — same prompt succeeds immediately
|
||||
|
||||
## Environment
|
||||
|
||||
- Claude Code CLI on Linux (Nobara/Fedora)
|
||||
- Plugins: permission-manager@agent-toolkit (St0nefish/agent-toolkit)
|
||||
- Agent types tested: engineer, general-purpose
|
||||
- Models tested: sonnet subagents
|
||||
|
||||
## Lessons
|
||||
|
||||
- **Always use `acceptEdits` for code-writing agents.** The mode name is the clue — it's not just "accepting" edits from the user, it's granting the agent the capability to make edits.
|
||||
- **`dontAsk` ≠ "can do anything."** It means "don't prompt the user" — but the capability to write files is a separate authorization layer.
|
||||
- **Test agent permissions early.** When building a multi-agent orchestration workflow, verify the first agent can actually write before launching a full wave. A quick single-file test agent saves time.
|
||||
- **Worktree isolation adds complexity.** Only use it when agents would genuinely conflict on the same files. For non-overlapping file changes, skip isolation.
|
||||
- **The permission-manager plugin is a separate concern.** It blocks Bash file-write commands (>, tee, cat heredoc). Disabling it fixes Bash fallbacks but not Write/Edit tool calls. Both layers must be addressed independently.
|
||||
@ -1,254 +0,0 @@
|
||||
---
|
||||
title: "Tag-Triggered Release and Deploy Guide"
|
||||
description: "CalVer tag-triggered CI/CD workflow: push a git tag to build Docker images, then deploy with a script. Reference implementation from Major Domo Discord bot."
|
||||
type: guide
|
||||
domain: development
|
||||
tags: [docker, gitea, deployment, ci, calver, scripts, bash]
|
||||
---
|
||||
|
||||
# Tag-Triggered Release and Deploy Guide
|
||||
|
||||
Standard release workflow for Dockerized applications using CalVer git tags to trigger CI builds and a deploy script for production rollout. Decouples code merging from releasing — merges to `main` are free, releases are intentional.
|
||||
|
||||
## Overview
|
||||
|
||||
```
|
||||
merge PR to main → code lands, nothing builds
|
||||
.scripts/release.sh → creates CalVer tag, pushes to trigger CI
|
||||
CI builds → Docker image tagged with version + "production"
|
||||
.scripts/deploy.sh → pulls image on production host, restarts container
|
||||
```
|
||||
|
||||
## CalVer Format
|
||||
|
||||
`YYYY.M.BUILD` — year, month (no leading zero), incrementing build number within that month.
|
||||
|
||||
Examples: `2026.3.10`, `2026.3.11`, `2026.4.1`
|
||||
|
||||
## Release Script
|
||||
|
||||
`.scripts/release.sh` — creates a git tag and pushes it to trigger CI.
|
||||
|
||||
```bash
|
||||
# Auto-generate next version
|
||||
.scripts/release.sh
|
||||
|
||||
# Explicit version
|
||||
.scripts/release.sh 2026.3.11
|
||||
|
||||
# Skip confirmation
|
||||
.scripts/release.sh -y
|
||||
|
||||
# Both
|
||||
.scripts/release.sh 2026.3.11 -y
|
||||
```
|
||||
|
||||
### What it does
|
||||
|
||||
1. Verifies you're on `main` and in sync with origin
|
||||
2. Auto-generates next CalVer build number from existing tags (or uses the one you passed)
|
||||
3. Validates version format and checks for duplicate tags
|
||||
4. Shows commits since last tag for review
|
||||
5. Confirms, then tags and pushes
|
||||
|
||||
### Reference implementation
|
||||
|
||||
```bash
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
SKIP_CONFIRM=false
|
||||
VERSION=""
|
||||
|
||||
for arg in "$@"; do
|
||||
case "$arg" in
|
||||
-y) SKIP_CONFIRM=true ;;
|
||||
*) VERSION="$arg" ;;
|
||||
esac
|
||||
done
|
||||
|
||||
# Ensure we're on main and up to date
|
||||
BRANCH=$(git rev-parse --abbrev-ref HEAD)
|
||||
if [[ "$BRANCH" != "main" ]]; then
|
||||
echo "ERROR: Must be on main branch (currently on ${BRANCH})"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
git fetch origin main --tags --quiet
|
||||
LOCAL=$(git rev-parse HEAD)
|
||||
REMOTE=$(git rev-parse origin/main)
|
||||
if [[ "$LOCAL" != "$REMOTE" ]]; then
|
||||
echo "ERROR: Local main is not up to date with origin. Run: git pull"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Determine version
|
||||
YEAR=$(date +%Y)
|
||||
MONTH=$(date +%-m)
|
||||
|
||||
if [[ -z "$VERSION" ]]; then
|
||||
LAST_BUILD=$(git tag --list "${YEAR}.${MONTH}.*" --sort=-v:refname | head -1 | awk -F. '{print $3}')
|
||||
NEXT_BUILD=$(( ${LAST_BUILD:-0} + 1 ))
|
||||
VERSION="${YEAR}.${MONTH}.${NEXT_BUILD}"
|
||||
fi
|
||||
|
||||
# Validate
|
||||
if [[ ! "$VERSION" =~ ^20[0-9]{2}\.[0-9]+\.[0-9]+$ ]]; then
|
||||
echo "ERROR: Invalid version format '${VERSION}'. Expected YYYY.M.BUILD"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if git rev-parse "refs/tags/${VERSION}" &>/dev/null; then
|
||||
echo "ERROR: Tag ${VERSION} already exists"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Show what's being released
|
||||
LAST_TAG=$(git tag --sort=-v:refname | head -1)
|
||||
echo "Version: ${VERSION}"
|
||||
echo "Previous: ${LAST_TAG:-none}"
|
||||
echo "Commit: $(git log -1 --format='%h %s')"
|
||||
|
||||
if [[ -n "$LAST_TAG" ]]; then
|
||||
echo "Changes since ${LAST_TAG}:"
|
||||
git log "${LAST_TAG}..HEAD" --oneline --no-merges
|
||||
fi
|
||||
|
||||
# Confirm
|
||||
if [[ "$SKIP_CONFIRM" != true ]]; then
|
||||
read -rp "Create tag ${VERSION} and trigger release? [y/N] " answer
|
||||
[[ "$answer" =~ ^[Yy]$ ]] || { echo "Aborted."; exit 0; }
|
||||
fi
|
||||
|
||||
# Tag and push
|
||||
git tag "$VERSION"
|
||||
git push origin tag "$VERSION"
|
||||
|
||||
echo "==> Tag ${VERSION} pushed. CI will build the image."
|
||||
echo "Deploy with: .scripts/deploy.sh"
|
||||
```
|
||||
|
||||
## CI Workflow (Gitea Actions)
|
||||
|
||||
`.gitea/workflows/docker-build.yml` — triggered by tag push, builds and pushes Docker image.
|
||||
|
||||
```yaml
|
||||
name: Build Docker Image
|
||||
|
||||
on:
|
||||
push:
|
||||
tags:
|
||||
- '20*' # matches CalVer tags like 2026.3.11
|
||||
|
||||
jobs:
|
||||
build:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Extract version from tag
|
||||
id: version
|
||||
run: |
|
||||
VERSION=${GITHUB_REF#refs/tags/}
|
||||
echo "version=$VERSION" >> $GITHUB_OUTPUT
|
||||
echo "sha_short=$(git rev-parse --short HEAD)" >> $GITHUB_OUTPUT
|
||||
|
||||
- uses: docker/setup-buildx-action@v3
|
||||
- uses: docker/login-action@v3
|
||||
with:
|
||||
username: ${{ secrets.DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.DOCKERHUB_TOKEN }}
|
||||
|
||||
- uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: .
|
||||
push: true
|
||||
tags: |
|
||||
myorg/myapp:${{ steps.version.outputs.version }}
|
||||
myorg/myapp:production
|
||||
cache-from: type=registry,ref=myorg/myapp:buildcache
|
||||
cache-to: type=registry,ref=myorg/myapp:buildcache,mode=max
|
||||
```
|
||||
|
||||
### Docker image tags
|
||||
|
||||
| Tag | Type | Purpose |
|
||||
|-----|------|---------|
|
||||
| `2026.3.10` | Immutable | Pinpoints exact version, rollback target |
|
||||
| `production` | Floating | Always the latest release, used in docker-compose |
|
||||
|
||||
## Deploy Script
|
||||
|
||||
`.scripts/deploy.sh` — pulls the latest `production` image and restarts the container.
|
||||
|
||||
```bash
|
||||
# Interactive (confirms before deploying)
|
||||
.scripts/deploy.sh
|
||||
|
||||
# Non-interactive
|
||||
.scripts/deploy.sh -y
|
||||
```
|
||||
|
||||
### What it does
|
||||
|
||||
1. Shows current branch, commit, and target
|
||||
2. Saves previous image digest (for rollback)
|
||||
3. Pulls latest image via `docker compose pull`
|
||||
4. Restarts container via `docker compose up -d`
|
||||
5. Waits 5 seconds, shows container status and logs
|
||||
6. Prints rollback command if image changed
|
||||
|
||||
### Key details
|
||||
|
||||
- Uses SSH alias (`ssh akamai`) — never hardcode `ssh -i` paths
|
||||
- Image tag is `production` (floating) — compose always pulls the latest release
|
||||
- Rollback uses the saved digest, not a version tag, so it's exact
|
||||
|
||||
### Production docker-compose
|
||||
|
||||
```yaml
|
||||
services:
|
||||
myapp:
|
||||
image: myorg/myapp:production
|
||||
restart: unless-stopped
|
||||
volumes:
|
||||
- ./credentials.json:/app/credentials.json:ro # secrets read-only
|
||||
- ./storage:/app/storage:rw # state files writable
|
||||
- ./logs:/app/logs
|
||||
```
|
||||
|
||||
## Hotfix Workflow
|
||||
|
||||
When production is broken and you need to fix fast:
|
||||
|
||||
1. Create a `hotfix/` branch from `main`
|
||||
2. Fix, test, push, open PR
|
||||
3. Merge PR to `main`
|
||||
4. Delete the current tag and re-release on the new HEAD:
|
||||
|
||||
```bash
|
||||
git tag -d YYYY.M.BUILD
|
||||
git push origin :refs/tags/YYYY.M.BUILD
|
||||
.scripts/release.sh YYYY.M.BUILD -y
|
||||
# wait for CI...
|
||||
.scripts/deploy.sh -y
|
||||
```
|
||||
|
||||
Re-using the same version number is fine for hotfixes — it keeps the version meaningful ("this is what's deployed") rather than burning a new number for a 1-line fix.
|
||||
|
||||
## Why This Pattern
|
||||
|
||||
| Alternative | Downside |
|
||||
|-------------|----------|
|
||||
| Branch-push trigger | Every merge = build = deploy pressure. Can't batch changes. |
|
||||
| `next-release` staging branch | Extra ceremony, merge conflicts, easy to forget to promote |
|
||||
| `workflow_dispatch` (manual UI button) | Less scriptable, no git tag trail |
|
||||
| GitHub Releases UI | Heavier than needed for Docker-only deploys |
|
||||
|
||||
Tag-triggered releases give you:
|
||||
- Clear audit trail (`git tag` history)
|
||||
- Easy rollbacks (`docker pull myorg/myapp:2026.3.9`)
|
||||
- Scriptable (`release.sh` + `deploy.sh`)
|
||||
- Decoupled merge and release cycles
|
||||
@ -3,7 +3,7 @@ title: "Docker Container Technology Context"
|
||||
description: "Architecture patterns, GPU acceleration, performance optimization, and security practices for Docker/Podman containerization in homelab environments."
|
||||
type: context
|
||||
domain: docker
|
||||
tags: [docker, podman, containers, gpu, nvidia, architecture, security, compose, homelab]
|
||||
tags: [docker, podman, containers, gpu, nvidia, architecture, security, compose]
|
||||
---
|
||||
|
||||
# Docker Container Technology - Technology Context
|
||||
|
||||
@ -1,66 +0,0 @@
|
||||
---
|
||||
title: "Fix: kb-search MCP server 'needs authentication' after server restart"
|
||||
description: "Claude Code shows OAuth errors connecting to kb-search MCP after ubuntu-manticore restart. Fix involves reconfiguring with bearer token headers and clearing stale OAuth credentials."
|
||||
type: troubleshooting
|
||||
domain: docker
|
||||
tags: [troubleshooting, kb-rag, claude-code, docker, manticore]
|
||||
---
|
||||
|
||||
# Fix: kb-search MCP server 'needs authentication' after server restart
|
||||
|
||||
**Date:** 2026-03-25
|
||||
**Severity:** Medium — kb-search MCP unavailable across all Claude Code sessions
|
||||
|
||||
## Problem
|
||||
|
||||
After restarting ubuntu-manticore (crash recovery), the kb-search MCP server showed "needs authentication" in Claude Code's `/mcp` panel. Error message:
|
||||
|
||||
```
|
||||
Error: HTTP 404: Invalid OAuth error response: SyntaxError: JSON Parse error: Unexpected EOF. Raw body:
|
||||
```
|
||||
|
||||
The server was healthy (`/health` returned OK) but Claude Code was attempting OAuth discovery against a server that only supports static bearer token auth.
|
||||
|
||||
## Root Cause
|
||||
|
||||
Two issues compounded:
|
||||
|
||||
1. **Stale MCP session:** The server restart invalidated all existing MCP sessions. Claude Code clients got "Session not found" errors on reconnect.
|
||||
|
||||
2. **Stale OAuth credential:** Claude Code had a cached OAuth entry in `~/.claude/.credentials.json` under the `mcpOAuth` key (`kb-search|120dc71b28e46913`). This entry caused Claude Code to attempt OAuth discovery (hitting `/.well-known/oauth-authorization-server`) instead of using the static `Authorization: Bearer` header from the MCP config. The server returned 404 on the OAuth endpoint, which Claude Code couldn't parse.
|
||||
|
||||
The stale OAuth entry persisted even after reconfiguring the MCP server with correct `headers` config — **`mcpOAuth` credentials override static headers**.
|
||||
|
||||
## Fix
|
||||
|
||||
1. **Reconfigure MCP with bearer token header** (user scope so it applies globally):
|
||||
```bash
|
||||
claude mcp remove kb-search
|
||||
claude mcp add-json kb-search \
|
||||
'{"type":"http","url":"http://10.10.0.226:8001/mcp","headers":{"Authorization":"Bearer <token>"}}' \
|
||||
--scope user
|
||||
```
|
||||
Token is in `~/docker/md-kb-rag/.env` on manticore (`MCP_BEARER_TOKEN` value).
|
||||
|
||||
2. **Remove stale OAuth credential** from `~/.claude/.credentials.json`:
|
||||
```python
|
||||
import json
|
||||
f = '/home/cal/.claude/.credentials.json'
|
||||
d = json.load(open(f))
|
||||
oauth = d.get('mcpOAuth', {})
|
||||
keys = [k for k in oauth if 'kb-search' in k]
|
||||
for k in keys:
|
||||
del oauth[k]
|
||||
d['mcpOAuth'] = oauth
|
||||
with open(f, 'w') as fh:
|
||||
json.dump(d, fh, indent=2)
|
||||
```
|
||||
|
||||
3. **Restart Claude Code** to establish a fresh MCP connection.
|
||||
|
||||
## Lessons
|
||||
|
||||
- Stale `mcpOAuth` entries in `.credentials.json` take priority over static `headers` config — always check and clear these when MCP auth issues occur
|
||||
- After any server hosting MCP endpoints restarts, all Claude Code sessions need restart to reconnect
|
||||
- The `--scope user` flag on `claude mcp add-json` is essential — without it, config goes to project-local and won't appear in other projects
|
||||
- kb-rag uses bearer token auth, NOT OAuth — if Claude Code shows OAuth errors for this server, the config is wrong
|
||||
@ -1,50 +0,0 @@
|
||||
---
|
||||
title: "MLB The Show Grind — 2026.4.02"
|
||||
description: "Pack opening command, full cycle orchestrator, keyboard dismiss fix, package split."
|
||||
type: reference
|
||||
domain: gaming
|
||||
tags: [release-notes, deployment, mlb-the-show, automation]
|
||||
---
|
||||
|
||||
# MLB The Show Grind — 2026.4.02
|
||||
|
||||
**Date:** 2026-04-02
|
||||
**Project:** mlb-the-show (`/mnt/NV2/Development/mlb-the-show`)
|
||||
|
||||
## Release Summary
|
||||
|
||||
Added pack opening automation and a full buy→exchange→open cycle command. Fixed a critical bug where KEYCODE_BACK was closing the buy order modal instead of dismissing the keyboard, preventing all order placement. Split the 1600-line single-file script into a proper Python package.
|
||||
|
||||
## Changes
|
||||
|
||||
### New Features
|
||||
- **`open-packs` command** — navigates to My Packs, finds the target pack by name (default: Exchange - Live Series Gold), rapid-taps Open Next at ~0.3s/pack with periodic verification
|
||||
- **`cycle` command** — full orchestrated flow: buy silvers for specified OVR tiers → exchange all dupes into gold packs → open all gold packs
|
||||
- **`DEFAULT_PACK_NAME` constant** — `"Exchange - Live Series Gold"` extracted from inline strings
|
||||
|
||||
### Bug Fixes
|
||||
- **Keyboard dismiss fix** — `KEYCODE_BACK` was closing the entire buy order modal instead of just dismissing the numeric keyboard. Replaced with `tap(540, 900)` to tap a neutral area. This was the root cause of all buy orders silently failing (0 orders placed despite cards having room).
|
||||
- **`full_cycle` passed no args to `open_packs()`** — now passes `packs_exchanged` count to bound the open loop
|
||||
- **`isinstance(result, dict)` dead code** removed from `full_cycle` — `grind_exchange` always returns `int`
|
||||
- **`_find_nearest_open_button`** — added x-column constraint (200px) and zero-width element filtering to prevent matching ghost buttons from collapsed packs
|
||||
|
||||
### Refactoring
|
||||
- **Package split** — `scripts/grind.py` (1611 lines) → `scripts/grind/` package:
|
||||
- `constants.py` (104 lines) — coordinates, price gates, UI element maps
|
||||
- `adb_utils.py` (125 lines) — ADB shell, tap, swipe, dump_ui, element finders
|
||||
- `navigation.py` (107 lines) — screen navigation (nav_to, nav_tab, FAB)
|
||||
- `exchange.py` (283 lines) — gold exchange logic
|
||||
- `market.py` (469 lines) — market scanning and buy order placement
|
||||
- `packs.py` (131 lines) — pack opening
|
||||
- `__main__.py` (390 lines) — CLI entry point and orchestrators (grind_loop, full_cycle)
|
||||
- `scripts/grind.py` retained as a thin wrapper for `uv run` backward compatibility
|
||||
- Invocation changed from `uv run scripts/grind.py` to `PYTHONPATH=scripts python3 -m grind`
|
||||
- Raw `adb("input swipe ...")` calls replaced with `swipe()` helper
|
||||
|
||||
## Session Stats
|
||||
|
||||
- **Buy orders placed:** 532 orders across two runs (474 + 58)
|
||||
- **Stubs spent:** ~63,655
|
||||
- **Gold packs exchanged:** 155 (94 + 61)
|
||||
- **Gold packs opened:** 275
|
||||
- **OVR tiers worked:** 77 (primary), 78 (all above max price)
|
||||
@ -214,58 +214,6 @@ For full HDR setup (vk-hdr-layer, KDE config, per-API env vars), see the **steam
|
||||
|
||||
**Diagnostic tip**: Look for rapid retry patterns in Pi-hole logs (same domain queried every 1-3s from the Xbox IP) — this signals a blocked domain causing timeout loops.
|
||||
|
||||
## Gray Zone Warfare — EAC Failures on Proton (2026-03-31) [RESOLVED]
|
||||
|
||||
**Severity:** High — game unplayable online
|
||||
**Status:** RESOLVED — corrupted prebuild world cache file
|
||||
|
||||
**Problem:** EAC errors when connecting to servers on Linux/Proton. Three error codes observed across attempts:
|
||||
- `0x0002000A` — "The client failed an anti-cheat client runtime check" (the actual root cause)
|
||||
- `0x0002000F` — "The client failed to register in time" (downstream timeout)
|
||||
- `0x00020011` — "The client failed to start the session" (downstream session failure)
|
||||
|
||||
Game launches fine, EAC bootstrapper reports success, but fails when joining a server at "Synchronizing Live Data".
|
||||
|
||||
**Root Cause:** A corrupted/stale prebuild world cache file that EAC flagged during runtime checks:
|
||||
```
|
||||
LogEOSAntiCheat: [AntiCheatClient] [PollStatusInternal] Client Violation with Type: 5
|
||||
Message: Unknown file version (GZW/Content/SKALLA/PrebuildWorldData/World/cache/0xb9af63cee2e43b6c_0x3cb3b3354fb31606.dat)
|
||||
```
|
||||
EAC scanned this file, found an unrecognized version, and flagged a client violation. The other errors (`0x0002000F`, `0x00020011`) were downstream consequences — EAC couldn't complete session registration after the violation.
|
||||
|
||||
Compounding factors that made diagnosis harder:
|
||||
- Epic EOS scheduled maintenance (Fortnite v40.10, Apr 1 08:00-09:30 UTC) returned 503s from `api.epicgames.dev/auth/v1/oauth/token`, masking the real issue
|
||||
- `steam_api64.dll` EOS SDK errors at startup are **benign noise** under Proton — red herring
|
||||
- Nuking the compatdata prefix and upgrading Proton happened concurrently, adding confusion
|
||||
|
||||
**Fix:**
|
||||
1. Delete the specific cache file: `rm "GZW/Content/SKALLA/PrebuildWorldData/World/cache/0xb9af63cee2e43b6c_0x3cb3b3354fb31606.dat"`
|
||||
2. Verify game files in Steam — Steam redownloads a fresh copy with different hash
|
||||
3. Launch game — clean logs, no EAC errors
|
||||
|
||||
Key detail: the file was the same size (60.7MB) before and after, but different md5 hash — Steam's verify replaced it with a corrected version.
|
||||
|
||||
**Log locations:**
|
||||
- EAC bootstrapper: `compatdata/2479810/pfx/drive_c/users/steamuser/AppData/Roaming/EasyAntiCheat/.../anticheatlauncher.log`
|
||||
- Game log: `compatdata/2479810/pfx/drive_c/users/steamuser/AppData/Local/GZW/Saved/Logs/GZW.log`
|
||||
- STL launch log: `~/.config/steamtinkerlaunch/logs/gamelaunch/id/2479810.log`
|
||||
|
||||
**What did NOT fix it (for reference):**
|
||||
1. Installing Proton EasyAntiCheat Runtime (AppID 1826330) — good to have but not the issue
|
||||
2. Deleting the entire cache directory without re-verifying — Steam verify re-downloaded the same bad file the first time (20 files fixed); needed a second targeted delete + verify
|
||||
3. Nuking compatdata prefix for clean rebuild
|
||||
4. Switching Proton versions (GE-Proton9-25 ↔ GE-Proton10-25)
|
||||
|
||||
**Lessons:**
|
||||
- When EAC logs show "Unknown file version" for a specific `.dat` file, delete that file and verify — don't nuke the whole cache or prefix
|
||||
- `steam_api64.dll` EOS errors are benign under Proton and not related to EAC failures
|
||||
- Check Epic's status page for scheduled maintenance before deep-diving Proton issues
|
||||
- Multiple verify-and-fix cycles may be needed — the first verify can redownload a stale cached version from Steam's CDN
|
||||
|
||||
**Game version:** 0.4.0.0-231948-H (EA Pre-Alpha)
|
||||
**Working Proton:** GE-Proton10-25
|
||||
**STL config:** `~/.config/steamtinkerlaunch/gamecfgs/id/2479810.conf`
|
||||
|
||||
## Useful Commands
|
||||
|
||||
### Check Running Game Process
|
||||
|
||||
@ -21,7 +21,7 @@
|
||||
{
|
||||
"parameters": {
|
||||
"operation": "executeCommand",
|
||||
"command": "/root/.local/bin/claude -p \"Run python3 ~/.claude/skills/server-diagnostics/client.py health paper-dynasty and analyze the results. If any containers are not running or there are critical issues, summarize them. Otherwise just say 'All systems healthy'.\" --output-format json --json-schema '{\"type\":\"object\",\"properties\":{\"status\":{\"type\":\"string\",\"enum\":[\"healthy\",\"issues_found\"]},\"summary\":{\"type\":\"string\"},\"root_cause\":{\"type\":\"string\"},\"severity\":{\"type\":\"string\",\"enum\":[\"low\",\"medium\",\"high\",\"critical\"]},\"affected_services\":{\"type\":\"array\",\"items\":{\"type\":\"string\"}},\"actions_taken\":{\"type\":\"array\",\"items\":{\"type\":\"string\"}}},\"required\":[\"status\",\"severity\",\"summary\"]}' --allowedTools \"Read,Grep,Glob,Bash(python3 ~/.claude/skills/server-diagnostics/client.py *)\" --append-system-prompt \"You are a server diagnostics agent. Use the server-diagnostics skill client.py for all operations. Never run destructive commands.\"",
|
||||
"command": "/root/.local/bin/claude -p \"Run python3 ~/.claude/skills/server-diagnostics/client.py health paper-dynasty and analyze the results. If any containers are not running or there are critical issues, summarize them. Otherwise just say 'All systems healthy'.\" --output-format json --json-schema '{\"type\":\"object\",\"properties\":{\"status\":{\"type\":\"string\",\"enum\":[\"healthy\",\"issues_found\"]},\"summary\":{\"type\":\"string\"},\"root_cause\":{\"type\":\"string\"},\"severity\":{\"type\":\"string\",\"enum\":[\"low\",\"medium\",\"high\",\"critical\"]},\"affected_services\":{\"type\":\"array\",\"items\":{\"type\":\"string\"}},\"actions_taken\":{\"type\":\"array\",\"items\":{\"type\":\"string\"}}},\"required\":[\"status\",\"severity\",\"summary\"]}' --allowedTools \"Read,Grep,Glob,Bash(python3 ~/.claude/skills/server-diagnostics/client.py *)\"",
|
||||
"options": {}
|
||||
},
|
||||
"id": "ssh-claude-code",
|
||||
@ -75,48 +75,20 @@
|
||||
"typeVersion": 2,
|
||||
"position": [660, 0]
|
||||
},
|
||||
{
|
||||
"parameters": {
|
||||
"operation": "executeCommand",
|
||||
"command": "=/root/.local/bin/claude -p \"The previous health check found issues. Investigate deeper: check container logs, resource usage, and recent events. Provide a detailed root cause analysis and recommended remediation steps.\" --resume \"{{ $json.session_id }}\" --output-format json --json-schema '{\"type\":\"object\",\"properties\":{\"root_cause_detail\":{\"type\":\"string\"},\"container_logs\":{\"type\":\"string\"},\"resource_status\":{\"type\":\"string\"},\"remediation_steps\":{\"type\":\"array\",\"items\":{\"type\":\"string\"}},\"requires_human\":{\"type\":\"boolean\"}},\"required\":[\"root_cause_detail\",\"remediation_steps\",\"requires_human\"]}' --allowedTools \"Read,Grep,Glob,Bash(python3 ~/.claude/skills/server-diagnostics/client.py *)\" --max-turns 15 --append-system-prompt \"You are a server diagnostics agent performing a follow-up investigation. The initial health check found issues. Dig deeper into logs and metrics. Never run destructive commands.\"",
|
||||
"options": {}
|
||||
},
|
||||
"id": "ssh-followup",
|
||||
"name": "Follow Up Diagnostics",
|
||||
"type": "n8n-nodes-base.ssh",
|
||||
"typeVersion": 1,
|
||||
"position": [880, -200],
|
||||
"credentials": {
|
||||
"sshPassword": {
|
||||
"id": "REPLACE_WITH_CREDENTIAL_ID",
|
||||
"name": "Claude Code LXC"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"parameters": {
|
||||
"jsCode": "// Parse follow-up diagnostics response\nconst stdout = $input.first().json.stdout || '';\nconst initial = $('Parse Claude Response').first().json;\n\ntry {\n const response = JSON.parse(stdout);\n const data = response.structured_output || JSON.parse(response.result || '{}');\n \n return [{\n json: {\n ...initial,\n followup: {\n root_cause_detail: data.root_cause_detail || 'No detail available',\n container_logs: data.container_logs || '',\n resource_status: data.resource_status || '',\n remediation_steps: data.remediation_steps || [],\n requires_human: data.requires_human || false,\n cost_usd: response.total_cost_usd,\n session_id: response.session_id\n },\n total_cost_usd: (initial.cost_usd || 0) + (response.total_cost_usd || 0)\n }\n }];\n} catch (e) {\n return [{\n json: {\n ...initial,\n followup: {\n error: e.message,\n root_cause_detail: 'Follow-up parse failed',\n remediation_steps: [],\n requires_human: true\n },\n total_cost_usd: initial.cost_usd || 0\n }\n }];\n}"
|
||||
},
|
||||
"id": "parse-followup",
|
||||
"name": "Parse Follow-up Response",
|
||||
"type": "n8n-nodes-base.code",
|
||||
"typeVersion": 2,
|
||||
"position": [1100, -200]
|
||||
},
|
||||
{
|
||||
"parameters": {
|
||||
"method": "POST",
|
||||
"url": "https://discord.com/api/webhooks/1451783909409816763/O9PMDiNt6ZIWRf8HKocIZ_E4vMGV_lEwq50aAiZ9HVFR2UGwO6J1N9_wOm82p0MetIqT",
|
||||
"sendBody": true,
|
||||
"specifyBody": "json",
|
||||
"jsonBody": "={\n \"embeds\": [{\n \"title\": \"{{ $json.severity === 'critical' ? '🔴' : $json.severity === 'high' ? '🟠' : '🟡' }} Server Alert\",\n \"description\": {{ JSON.stringify($json.summary) }},\n \"color\": {{ $json.severity === 'critical' ? 15158332 : $json.severity === 'high' ? 15105570 : 16776960 }},\n \"fields\": [\n {\n \"name\": \"Severity\",\n \"value\": \"{{ $json.severity.toUpperCase() }}\",\n \"inline\": true\n },\n {\n \"name\": \"Server\",\n \"value\": \"paper-dynasty (10.10.0.88)\",\n \"inline\": true\n },\n {\n \"name\": \"Cost\",\n \"value\": \"${{ $json.total_cost_usd ? $json.total_cost_usd.toFixed(4) : '0.0000' }}\",\n \"inline\": true\n },\n {\n \"name\": \"Root Cause\",\n \"value\": {{ JSON.stringify(($json.followup && $json.followup.root_cause_detail) || $json.root_cause || 'N/A') }},\n \"inline\": false\n },\n {\n \"name\": \"Affected Services\",\n \"value\": \"{{ $json.affected_services.length ? $json.affected_services.join(', ') : 'None' }}\",\n \"inline\": false\n },\n {\n \"name\": \"Remediation Steps\",\n \"value\": {{ JSON.stringify(($json.followup && $json.followup.remediation_steps.length) ? $json.followup.remediation_steps.map((s, i) => (i+1) + '. ' + s).join('\\n') : ($json.actions_taken.length ? $json.actions_taken.join('\\n') : 'None')) }},\n \"inline\": false\n },\n {\n \"name\": \"Requires Human?\",\n \"value\": \"{{ ($json.followup && $json.followup.requires_human) ? '⚠️ Yes' : '✅ No' }}\",\n \"inline\": true\n }\n ],\n \"timestamp\": \"{{ new Date().toISOString() }}\"\n }]\n}",
|
||||
"jsonBody": "={\n \"embeds\": [{\n \"title\": \"{{ $json.severity === 'critical' ? '🔴' : $json.severity === 'high' ? '🟠' : '🟡' }} Server Alert\",\n \"description\": {{ JSON.stringify($json.summary) }},\n \"color\": {{ $json.severity === 'critical' ? 15158332 : $json.severity === 'high' ? 15105570 : 16776960 }},\n \"fields\": [\n {\n \"name\": \"Severity\",\n \"value\": \"{{ $json.severity.toUpperCase() }}\",\n \"inline\": true\n },\n {\n \"name\": \"Server\",\n \"value\": \"paper-dynasty (10.10.0.88)\",\n \"inline\": true\n },\n {\n \"name\": \"Cost\",\n \"value\": \"${{ $json.cost_usd ? $json.cost_usd.toFixed(4) : '0.0000' }}\",\n \"inline\": true\n },\n {\n \"name\": \"Root Cause\",\n \"value\": \"{{ $json.root_cause || 'N/A' }}\",\n \"inline\": false\n },\n {\n \"name\": \"Affected Services\",\n \"value\": \"{{ $json.affected_services.length ? $json.affected_services.join(', ') : 'None' }}\",\n \"inline\": false\n },\n {\n \"name\": \"Actions Taken\",\n \"value\": \"{{ $json.actions_taken.length ? $json.actions_taken.join('\\n') : 'None' }}\",\n \"inline\": false\n }\n ],\n \"timestamp\": \"{{ new Date().toISOString() }}\"\n }]\n}",
|
||||
"options": {}
|
||||
},
|
||||
"id": "discord-alert",
|
||||
"name": "Discord Alert",
|
||||
"type": "n8n-nodes-base.httpRequest",
|
||||
"typeVersion": 4.2,
|
||||
"position": [1320, -200]
|
||||
"position": [880, -100]
|
||||
},
|
||||
{
|
||||
"parameters": {
|
||||
@ -173,7 +145,7 @@
|
||||
"main": [
|
||||
[
|
||||
{
|
||||
"node": "Follow Up Diagnostics",
|
||||
"node": "Discord Alert",
|
||||
"type": "main",
|
||||
"index": 0
|
||||
}
|
||||
@ -186,28 +158,6 @@
|
||||
}
|
||||
]
|
||||
]
|
||||
},
|
||||
"Follow Up Diagnostics": {
|
||||
"main": [
|
||||
[
|
||||
{
|
||||
"node": "Parse Follow-up Response",
|
||||
"type": "main",
|
||||
"index": 0
|
||||
}
|
||||
]
|
||||
]
|
||||
},
|
||||
"Parse Follow-up Response": {
|
||||
"main": [
|
||||
[
|
||||
{
|
||||
"node": "Discord Alert",
|
||||
"type": "main",
|
||||
"index": 0
|
||||
}
|
||||
]
|
||||
]
|
||||
}
|
||||
},
|
||||
"settings": {
|
||||
|
||||
@ -1,34 +0,0 @@
|
||||
---
|
||||
title: "Database API Release — 2026.4.1"
|
||||
description: "Query limit caps to prevent worker timeouts, plus hotfix to exempt /players endpoint."
|
||||
type: reference
|
||||
domain: major-domo
|
||||
tags: [release-notes, deployment, database, hotfix]
|
||||
---
|
||||
|
||||
# Database API Release — 2026.4.1
|
||||
|
||||
**Date:** 2026-04-01
|
||||
**Tag:** `2026.3.7` + 3 post-tag commits (CI auto-generates CalVer on merge)
|
||||
**Image:** `manticorum67/major-domo-database`
|
||||
**Server:** akamai (`~/container-data/sba-database`)
|
||||
**Deploy method:** `docker compose pull && docker compose down && docker compose up -d`
|
||||
|
||||
## Release Summary
|
||||
|
||||
Added bounded pagination (`MAX_LIMIT=500`, `DEFAULT_LIMIT=200`) to all list endpoints to prevent Gunicorn worker timeouts caused by unbounded queries. Two follow-up fixes corrected response `count` fields in fieldingstats that were computed after the limit was applied. A hotfix (PR #103) then removed the caps from the `/players` endpoint specifically, since the bot and website depend on fetching full player lists.
|
||||
|
||||
## Changes
|
||||
|
||||
### Bug Fixes
|
||||
- **PR #99** — Fix unbounded API queries causing Gunicorn worker timeouts. Added `MAX_LIMIT=500` and `DEFAULT_LIMIT=200` constants in `dependencies.py`, enforced `le=MAX_LIMIT` on all list endpoints. Added middleware to strip empty query params preventing validation bypass.
|
||||
- **PR #100** — Fix fieldingstats `get_fieldingstats` count: captured `total_count` before `.limit()` so the response reflects total rows, not page size.
|
||||
- **PR #101** — Fix fieldingstats `get_totalstats`: removed line that overwrote `count` with `len(page)` after it was correctly set from `total_count`.
|
||||
|
||||
### Hotfix
|
||||
- **PR #103** — Remove output caps from `GET /api/v3/players`. Reverted `limit` param to `Optional[int] = Query(default=None, ge=1)` (no ceiling). The `/players` table is a bounded dataset (~1500 rows/season) and consumers depend on uncapped results. All other endpoints retain their caps.
|
||||
|
||||
## Deployment Notes
|
||||
- No migrations required
|
||||
- No config changes
|
||||
- Rollback: `docker compose pull manticorum67/major-domo-database:<previous-tag> && docker compose down && docker compose up -d`
|
||||
@ -1,112 +0,0 @@
|
||||
---
|
||||
title: Major Domo v2 Release — 2026.3.20
|
||||
description: "Performance release: parallelized API calls, caching improvements, CI overhaul to tag-triggered releases, async hotfix, and chart path fix."
|
||||
type: reference
|
||||
domain: major-domo
|
||||
tags: [discord, major-domo, deployment, release-notes, docker, ci]
|
||||
---
|
||||
|
||||
# Major Domo v2 Release — 2026.3.20
|
||||
|
||||
**Date:** 2026-03-20
|
||||
**Tags:** `2026.3.10`, `2026.3.11` (bugfix)
|
||||
**Image:** `manticorum67/major-domo-discordapp:production`
|
||||
**Server:** akamai (`/root/container-data/major-domo`)
|
||||
**Deploy method:** `.scripts/release.sh` → CI → `.scripts/deploy.sh`
|
||||
|
||||
## Release Summary
|
||||
|
||||
Performance-focused release with 12 merged PRs covering parallelized API calls, caching improvements, CI workflow overhaul, and a production hotfix. Also retired the `next-release` staging branch in favor of direct-to-main merges with tag-triggered releases.
|
||||
|
||||
## Hotfix During Release
|
||||
|
||||
**PR #117** — ScorecardTracker async mismatch. PR #106 added `await` to all `scorecard_tracker` method calls across `scorebug.py`, `live_scorebug_tracker.py`, and `cleanup_service.py`, but the tracker methods themselves were still synchronous. This caused `TypeError: object NoneType can't be used in 'await' expression` on `/scorebug` and `TypeError: object list can't be used in 'await' expression` in the background scorebug update loop. Fixed by making all 6 public `ScorecardTracker` methods async and adding 5 missing `await`s in `cleanup_service.py`.
|
||||
|
||||
**Root cause:** PR #106 was created by an issue-worker agent that modified callers without modifying the tracker class. The async tracker conversion existed only in uncommitted working tree changes that were never included in any PR.
|
||||
|
||||
**Lesson:** Issue-worker agent PRs that add `await` to calls must verify the called methods are actually async — not just that the callers compile.
|
||||
|
||||
## Infrastructure Changes
|
||||
|
||||
### CI: Tag-triggered releases (PRs #110, #113)
|
||||
|
||||
Replaced branch-push CI with tag-push CI. Merging to `main` no longer triggers a Docker build.
|
||||
|
||||
- **Before:** Push to `main` or `next-release` → auto-build → auto-tag CalVer
|
||||
- **After:** Push CalVer tag (`git tag 2026.3.11 && git push --tags`) → build → Docker image tagged `:version` + `:production`
|
||||
|
||||
Also removed the `pull_request` trigger that was building Docker images on every PR branch push.
|
||||
|
||||
### Release and deploy scripts (PRs #114, #115)
|
||||
|
||||
- `.scripts/release.sh` — auto-generates next CalVer tag, shows changelog, confirms, pushes tag
|
||||
- `.scripts/deploy.sh` — updated to use SSH alias (`ssh akamai`) and `:production` image tag
|
||||
|
||||
### Docker volume split (PR #86)
|
||||
|
||||
Split the single `./storage:/app/data` volume into:
|
||||
- `./storage/major-domo-service-creds.json:/app/data/major-domo-service-creds.json:ro` (credentials)
|
||||
- `./storage:/app/storage:rw` (state files)
|
||||
|
||||
Production compose on akamai was updated manually before deploy. All 5 tracker default paths changed from `data/` to `storage/`.
|
||||
|
||||
### Retired `next-release` branch
|
||||
|
||||
All references to `next-release` removed from CLAUDE.md and CI workflow. New workflow: branch from `main` → PR to `main` → tag to release.
|
||||
|
||||
## Performance Changes
|
||||
|
||||
### Parallelized API calls
|
||||
|
||||
| PR | What | Impact |
|
||||
|----|------|--------|
|
||||
| #88 | `schedule_service`: `get_team_schedule`, `get_recent_games`, `get_upcoming_games` use `asyncio.gather()` | Up to 18 sequential HTTP requests → concurrent |
|
||||
| #90 | Team lookups in `/publish-scorecard`, `/scorebug`, `/injury`, trade validation | 2 sequential calls → concurrent per location |
|
||||
| #102 | `asyncio.gather()` across multiple command files | Broad latency reduction |
|
||||
|
||||
### Caching
|
||||
|
||||
| PR | What | Impact |
|
||||
|----|------|--------|
|
||||
| #99 | Cache user team lookup in `player_autocomplete` with 60s TTL, reduce Discord limit to 25 | Faster autocomplete on repeat use |
|
||||
| #98 | Replace Redis `KEYS` with `SCAN` for cache invalidation | Non-blocking invalidation |
|
||||
|
||||
### Micro-optimizations
|
||||
|
||||
| PR | What | Impact |
|
||||
|----|------|--------|
|
||||
| #93 | Use `channel.purge()` instead of per-message `message.delete()` loops | 1 API call vs up to 100 per channel clear |
|
||||
| #96 | Replace `json.dumps(value)` probe with `isinstance()` in JSON logger | Eliminates full serialization on every log call |
|
||||
| #97 | Cache `inspect.signature()` at decoration time in all 3 decorators | Introspection cost paid once, not per-call |
|
||||
|
||||
## Cleanup
|
||||
|
||||
| PR | What |
|
||||
|----|------|
|
||||
| #104 | Remove dead `@self.tree.interaction_check` decorator block and duplicate `self.maintenance_mode` assignment in `bot.py` |
|
||||
| #103 | Remove unused `weeks_ahead` parameter from `get_upcoming_games` |
|
||||
|
||||
## Test Coverage
|
||||
|
||||
- 16 new tests for `schedule_service` (`test_services_schedule.py` — first coverage for this service)
|
||||
- Tests use existing `GameFactory`/`TeamFactory` from `tests/factories.py`
|
||||
- 2 existing scorebug tests updated for async tracker methods
|
||||
- Full suite: 967+ tests passing
|
||||
|
||||
## Bugfix: 2026.3.11
|
||||
|
||||
**PR #119** — `chart_service.py` CHARTS_FILE path still pointed to `data/charts.json` after PR #86 moved state files to `storage/`. The `/charts` autocomplete returned no results because the file was at `/app/storage/charts.json` but the code read from `/app/data/charts.json`. One-line path fix.
|
||||
|
||||
**Root cause:** PR #86 updated the 5 tracker classes but missed `ChartService`, which uses a class-level `Path` constant instead of the `__init__` pattern used by the trackers.
|
||||
|
||||
**Lesson:** When moving file paths across volumes, grep for the old path across the entire codebase — not just the files being modified.
|
||||
|
||||
## Deployment Notes
|
||||
|
||||
- Production compose updated on akamai before deploy (volume split for PR #86)
|
||||
- Image tag changed from `:latest` to `:production`
|
||||
- Deployed three times total:
|
||||
1. `2026.3.10` — initial release (broken `/scorebug` and scorebug tracker)
|
||||
2. `2026.3.10` — re-tagged after hotfix #117 (async mismatch)
|
||||
3. `2026.3.11` — chart path fix (#119)
|
||||
- Final deploy confirmed healthy — all background tasks started, gateway connected
|
||||
@ -1,38 +0,0 @@
|
||||
---
|
||||
title: "Discord Bot Release — 2026.3.13"
|
||||
description: "Enforce free agency lock deadline — block /dropadd FA pickups after week 14, plus performance batch from backlog issues."
|
||||
type: reference
|
||||
domain: major-domo
|
||||
tags: [release-notes, deployment, discord, major-domo]
|
||||
---
|
||||
|
||||
# Discord Bot Release — 2026.3.13
|
||||
|
||||
**Date:** 2026-03-31
|
||||
**Tag:** `2026.3.13`
|
||||
**Image:** `manticorum67/major-domo-discordapp:2026.3.13` / `:production`
|
||||
**Server:** akamai (`~/container-data/major-domo`)
|
||||
**Deploy method:** `.scripts/deploy.sh -y` (docker compose pull + up)
|
||||
|
||||
## Release Summary
|
||||
|
||||
Enforces the previously unused `fa_lock_week` config (week 14) in the transaction builder. After the deadline, `/dropadd` blocks adding players FROM Free Agency while still allowing drops TO FA. Also includes a batch of performance PRs from the backlog that were merged between 2026.3.12 and this tag.
|
||||
|
||||
## Changes
|
||||
|
||||
### New Features
|
||||
- **Free agency lock enforcement** — `TransactionBuilder.add_move()` now checks `current_week >= fa_lock_week` and rejects FA pickups after the deadline. Dropping to FA remains allowed. Config already existed at `fa_lock_week = 14` but was never enforced. (PR #122)
|
||||
|
||||
### Performance
|
||||
- Eliminate redundant API calls in trade views (PR #116, issue #94)
|
||||
- Eliminate redundant GET after create/update and parallelize stats (PR #112, issue #95)
|
||||
- Parallelize N+1 player/creator lookups with `asyncio.gather()` (PR #118, issue #89)
|
||||
- Consolidate duplicate `league_service.get_current_state()` calls in `add_move()` into a single shared fetch (PR #122)
|
||||
|
||||
### Bug Fixes
|
||||
- Fix race condition: use per-user dict for `_checked_teams` in trade views (PR #116)
|
||||
|
||||
## Deployment Notes
|
||||
- No migrations required
|
||||
- No config changes needed — `fa_lock_week = 14` already existed in config
|
||||
- Rollback: `ssh akamai "cd ~/container-data/major-domo && docker pull manticorum67/major-domo-discordapp@sha256:94d59135f127d5863b142136aeeec9d63b06ee63e214ef59f803cedbd92b473e && docker tag manticorum67/major-domo-discordapp@sha256:94d59135f127d5863b142136aeeec9d63b06ee63e214ef59f803cedbd92b473e manticorum67/major-domo-discordapp:production && docker compose up -d discord-app"`
|
||||
@ -1,86 +0,0 @@
|
||||
---
|
||||
title: "Discord Bot Release — 2026.3.12"
|
||||
description: "Major catch-up release: trade deadline enforcement, performance parallelization, security fixes, CI/CD migration to CalVer, and 148 commits of accumulated improvements."
|
||||
type: reference
|
||||
domain: major-domo
|
||||
tags: [release-notes, deployment, discord, major-domo]
|
||||
---
|
||||
|
||||
# Discord Bot Release — 2026.3.12
|
||||
|
||||
**Date:** 2026-03-31
|
||||
**Tag:** `2026.3.12`
|
||||
**Image:** `manticorum67/major-domo-discordapp:2026.3.12` / `:production`
|
||||
**Server:** akamai (`~/container-data/major-domo`)
|
||||
**Deploy method:** `.scripts/deploy.sh -y` (docker compose pull + up)
|
||||
**Previous tag:** `v2.29.4` (148 commits behind)
|
||||
|
||||
## Release Summary
|
||||
|
||||
Large catch-up release covering months of accumulated work since the last tag. The headline feature is trade deadline enforcement — `/trade` commands are now blocked after the configured deadline week, with fail-closed behavior when API data is unavailable. Also includes significant performance improvements (parallelized API calls, cached signatures, Redis SCAN), security hardening, dependency pinning, and a full CI/CD migration from version-file bumps to CalVer tag-triggered builds.
|
||||
|
||||
## Changes
|
||||
|
||||
### New Features
|
||||
- **Trade deadline enforcement** — `is_past_trade_deadline` property on Current model; guards on `/trade initiate`, submit button, and `_finalize_trade`. Fail-closed when API returns no data. 4 new tests. (PR #121)
|
||||
- `is_admin()` helper in `utils/permissions.py` (#55)
|
||||
- Team ownership verification on `/injury set-new` and `/injury clear` (#18)
|
||||
- Current week number included in weekly-info channel posts
|
||||
- Local deploy script for production deploys
|
||||
|
||||
### Performance
|
||||
- Parallelize independent API calls with `asyncio.gather()` (#90)
|
||||
- Cache `inspect.signature()` at decoration time (#97)
|
||||
- Replace `json.dumps` serialization test with `isinstance` fast path (#96)
|
||||
- Use `channel.purge()` instead of per-message delete loops (#93)
|
||||
- Parallelize schedule_service week fetches (#88)
|
||||
- Replace Redis `KEYS` with `SCAN` in `clear_prefix` (#98)
|
||||
- Reuse persistent `aiohttp.ClientSession` in GiphyService (#26)
|
||||
- Cache user team lookup in player_autocomplete, reduce limit to 25
|
||||
|
||||
### Bug Fixes
|
||||
- Fix chart_service path from `data/` to `storage/`
|
||||
- Make ScorecardTracker methods async to match await callers
|
||||
- Prevent partial DB writes and show detailed errors on scorecard submission failure
|
||||
- Add trailing slashes to API URLs to prevent 307 redirects dropping POST bodies
|
||||
- Trade validation: check against next week's projected roster, include pending trades and org affiliate transactions
|
||||
- Prefix trade validation errors with team abbreviation
|
||||
- Auto-detect player roster type in trade commands instead of assuming ML
|
||||
- Fix key plays score text ("tied at X" instead of "Team up X-X") (#48)
|
||||
- Fix scorebug stale data, win probability parsing, and read-failure tolerance (#39, #40)
|
||||
- Batch quick-wins: 4 issues resolved (#37, #27, #25, #38)
|
||||
- Fix ContextualLogger crash when callers pass `exc_info=True`
|
||||
- Fix thaw report posting to use channel ID instead of channel names
|
||||
- Use explicit America/Chicago timezone for freeze/thaw scheduling
|
||||
- Replace broken `@self.tree.interaction_check` with MaintenanceAwareTree subclass
|
||||
- Implement actual maintenance mode flag in `/admin-maintenance` (#28)
|
||||
- Validate and sanitize pitching decision data from Google Sheets
|
||||
- Fix `/player` autocomplete timeout by using current season only
|
||||
- Split read-only data volume to allow state file writes (#85)
|
||||
- Update roster labels to use Minor League and Injured List (#59)
|
||||
|
||||
### Security
|
||||
- Address 7 security issues across the codebase
|
||||
- Remove 226 unused imports (#33)
|
||||
- Pin all Python dependency versions in `requirements.txt` (#76)
|
||||
|
||||
### Refactoring & Cleanup
|
||||
- Extract duplicate command hash logic into `_compute_command_hash` (#31)
|
||||
- Move 42 unnecessary lazy imports to top-level
|
||||
- Remove dead maintenance mode artifacts in bot.py (#104)
|
||||
- Remove unused `weeks_ahead` parameter from `get_upcoming_games`
|
||||
- Invalidate roster cache after submission instead of force-refreshing
|
||||
|
||||
## Infrastructure Changes
|
||||
- **CI/CD migration**: Switched from version-file bumps to CalVer tag-triggered Docker builds
|
||||
- Added `.scripts/release.sh` for creating CalVer tags
|
||||
- Updated `.scripts/deploy.sh` for tag-triggered releases
|
||||
- Docker build cache switched from `type=gha` to `type=registry`
|
||||
- Used `docker-tags` composite action for multi-channel release support
|
||||
- Fixed act_runner auth with short-form local actions + full GitHub URLs
|
||||
- Use Gitea API for tag creation to avoid branch protection failures
|
||||
|
||||
## Deployment Notes
|
||||
- No migrations required
|
||||
- No config changes needed
|
||||
- Rollback: `ssh akamai "cd ~/container-data/major-domo && docker pull manticorum67/major-domo-discordapp@<previous-digest> && docker tag <digest> manticorum67/major-domo-discordapp:production && docker compose up -d discord-app"`
|
||||
@ -1,59 +0,0 @@
|
||||
---
|
||||
title: "Fix: Gunicorn Worker Timeouts from Unbounded API Queries"
|
||||
description: "External clients sent limit=99999 and empty filter params through the reverse proxy, causing API workers to timeout and get killed."
|
||||
type: troubleshooting
|
||||
domain: major-domo
|
||||
tags: [troubleshooting, major-domo, database, deployment, docker]
|
||||
---
|
||||
|
||||
# Fix: Gunicorn Worker Timeouts from Unbounded API Queries
|
||||
|
||||
**Date:** 2026-04-01
|
||||
**PR:** cal/major-domo-database#99
|
||||
**Issues:** #98 (main), #100 (fieldingstats count bug), #101 (totalstats count overwrite, pre-existing)
|
||||
**Severity:** Critical — active production instability during Season 12, 12 worker timeouts in 2 days and accelerating
|
||||
|
||||
## Problem
|
||||
|
||||
The monitoring app kept flagging the SBA API container (`sba_db_api`) as unhealthy and restarting it. Container logs showed repeated `CRITICAL WORKER TIMEOUT` and `WARNING Worker was sent SIGABRT` messages from Gunicorn. The container itself wasn't restarting (0 Docker restarts, up 2 weeks), but individual workers were being killed and respawned, causing brief API unavailability windows.
|
||||
|
||||
## Root Cause
|
||||
|
||||
External clients (via nginx-proxy-manager at `172.25.0.3`) were sending requests with `limit=99999` and empty filter parameters (e.g., `?game_id=&pitcher_id=`). The API had no defenses:
|
||||
|
||||
- **No max limit cap** on any endpoint except `/players/search` (which had `le=50`). Clients could request 99,999 rows.
|
||||
- **Empty string params passed validation** — FastAPI parsed `game_id=` as `['']`, which passed `if param is not None` checks but generated wasteful full-table-scan queries.
|
||||
- **`/transactions` had no limit parameter at all** — always returned every matching row with recursive serialization (`model_to_dict(recurse=True)`).
|
||||
- **Recursive serialization amplified cost** — each row triggered additional DB lookups for FK relations (player, team, etc.).
|
||||
|
||||
Combined, these caused queries to exceed the 120-second Gunicorn timeout, killing the worker.
|
||||
|
||||
### IP Attribution Gotcha
|
||||
|
||||
Initial assumption was the Discord bot was the source (IP `172.25.0.3` was assumed to be the bot container). Docker IP mapping revealed `172.25.0.3` was actually **nginx-proxy-manager** — the queries came from external clients through the reverse proxy. The Discord bot is at `172.18.0.2` on a completely separate Docker network and generates none of these queries.
|
||||
|
||||
```bash
|
||||
# Command to map container IPs
|
||||
docker inspect --format='{{.Name}} {{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}' $(docker ps -q)
|
||||
```
|
||||
|
||||
## Fix
|
||||
|
||||
PR #99 merged into main with the following changes (27 files, 503 insertions):
|
||||
|
||||
1. **`MAX_LIMIT=500` and `DEFAULT_LIMIT=200` constants** in `app/dependencies.py`, enforced with `le=MAX_LIMIT` across all list endpoints
|
||||
2. **`strip_empty_query_params` middleware** in `app/main.py` — strips empty string values from query params before FastAPI parses them, so `?game_id=` is treated as absent
|
||||
3. **`limit`/`offset` added to `/transactions`** — previously returned all rows; now defaults to 200, max 500, with `total_count` computed before pagination
|
||||
4. **11 existing limit params capped** with `le=MAX_LIMIT`
|
||||
5. **13 endpoints with no limit** received `limit`/`offset` params
|
||||
6. **Manual `if limit < 1` guards removed** — now handled by FastAPI's `ge=1` validation
|
||||
7. **5 unit tests** covering limit validation (422 on exceeding max, zero, negative), transaction response shape, and empty string stripping
|
||||
8. **fieldingstats count bug fixed** — `.count()` was being called after `.limit()`, capping the reported count at the page size instead of total matching rows (#100)
|
||||
|
||||
## Lessons
|
||||
|
||||
- **Always verify container IP attribution** before investigating the wrong service. `docker inspect` with format string is the canonical way to map IPs to container names. Don't assume based on Docker network proximity.
|
||||
- **APIs should never trust client-provided limits** — enforce `le=MAX_LIMIT` on every list endpoint. The only safe endpoint was `/players/search` which had been properly capped at `le=50`.
|
||||
- **Empty string params are a silent danger** — FastAPI parses `?param=` as `['']`, not `None`. A global middleware is the right fix since it protects all endpoints including future ones.
|
||||
- **Recursive serialization (`model_to_dict(recurse=True)`) is O(n * related_objects)** — dangerous on unbounded queries. Consider forcing `short_output=True` for large result sets.
|
||||
- **Heavy reformatting mixed with functional changes obscures bugs** — the fieldingstats count bug was missed in review because the file had 262 lines of diff from quote/formatting changes. Separate cosmetic and functional changes into different commits.
|
||||
@ -548,34 +548,6 @@ tar -czf ~/jellyfin-config-backup-$(date +%Y%m%d).tar.gz ~/docker/jellyfin/confi
|
||||
- Test on non-production instance if possible
|
||||
- Document current working configuration
|
||||
|
||||
## Roku Buffering on Weak WiFi — Client Bitrate Cap (2026-03-26)
|
||||
|
||||
**Severity:** Low — single device, non-critical viewing location
|
||||
|
||||
**Problem:** Roku in a far corner of the house with poor WiFi signal was buffering/failing to play videos. Content was not being transcoded down to accommodate the limited bandwidth.
|
||||
|
||||
**Root Cause:** Jellyfin does not dynamically adapt bitrate mid-stream (no HLS ABR like Netflix). The server's `RemoteClientBitrateLimit` was set to `0` (unlimited), and LAN clients are treated as "local" anyway so that setting wouldn't apply. The Roku Jellyfin app was requesting full-quality streams that exceeded the WiFi throughput.
|
||||
|
||||
**Fix:** Set **Max Streaming Bitrate** in the Jellyfin Roku app settings (Settings > Playback) to a lower value (4-8 Mbps). This forces the server to transcode down via NVENC before sending. No server-side changes needed.
|
||||
|
||||
**Lesson:** For bandwidth-constrained clients, the client-side bitrate setting is the first lever to pull. For a server-enforced cap that survives app resets, create a dedicated Jellyfin user for that device and set a per-user bitrate limit in Dashboard > Users > Playback. The `RemoteClientBitrateLimit` in system.xml only applies to clients Jellyfin considers "remote" — LAN devices are always "local."
|
||||
|
||||
---
|
||||
|
||||
## PGS Subtitle Default Flags Causing Roku Playback Hang (2026-04-01)
|
||||
|
||||
**Severity:** Medium — affects all Roku/Apple TV clients attempting to play remuxes with PGS subtitles
|
||||
|
||||
**Problem:** Playback on Roku hangs at "Loading" and stops at 0 ms. Jellyfin logs show ffmpeg extracting all subtitle streams (including PGS) from the full-length movie before playback can begin. User Staci reported Jurassic Park (1993) taking forever to start on the living room Roku.
|
||||
|
||||
**Root Cause:** PGS (hdmv_pgs_subtitle) tracks flagged as `default` in MKV files cause the Roku client to auto-select them. Roku can't decode PGS natively, so Jellyfin must burn them in — triggering a full subtitle extraction pass and video transcode before any data reaches the client. 178 out of ~400 movies in the library had this flag set, mostly remuxes that predate the Tdarr `clrSubDef` flow plugin.
|
||||
|
||||
**Fix:**
|
||||
1. **Batch fix (existing library):** Wrote `fix-pgs-defaults.sh` — scans all MKVs with `mkvmerge -J`, finds PGS tracks with `default_track: true`, clears via `mkvpropedit --edit track:N --set flag-default=0`. Key gotcha: mkvpropedit uses 1-indexed track numbers (`track_id + 1`), NOT `track:=ID` (which matches by UID). Script is on manticore at `/tmp/fix-pgs-defaults.sh`. Fixed 178 files, no re-encoding needed.
|
||||
2. **Going forward (Tdarr):** The flow already has a "Clear Subtitle Default Flags" custom function plugin (`clrSubDef`) that clears default disposition on non-forced subtitle tracks during transcoding. New files processed by Tdarr are handled automatically.
|
||||
|
||||
**Lesson:** Remux files from automated downloaders almost always have PGS defaults set. Any bulk import of remuxes should be followed by a PGS default flag sweep. The CIFS media mount on manticore is read-only inside the Jellyfin container — mkvpropedit must run from the host against `/mnt/truenas/media/Movies`.
|
||||
|
||||
## Related Documentation
|
||||
- **Setup Guide**: `/media-servers/jellyfin-ubuntu-manticore.md`
|
||||
- **NVIDIA Driver Management**: See jellyfin-ubuntu-manticore.md
|
||||
|
||||
@ -1,37 +0,0 @@
|
||||
---
|
||||
title: "MLB The Show Market Tracker — 0.1.0"
|
||||
description: "Initial release of the CLI market scanner with flip scanning and exchange program support."
|
||||
type: reference
|
||||
domain: gaming
|
||||
tags: [release-notes, deployment, mlb-the-show, rust]
|
||||
---
|
||||
|
||||
# MLB The Show Market Tracker — 0.1.0
|
||||
|
||||
**Date:** 2026-03-28
|
||||
**Version:** `0.1.0`
|
||||
**Repo:** `cal/mlb-the-show-market-tracker` on Gitea
|
||||
**Deploy method:** Local CLI tool — `cargo build --release` on workstation
|
||||
|
||||
## Release Summary
|
||||
|
||||
Initial release of `showflip`, a Rust CLI tool for scanning the MLB The Show 26 Community Market. Supports finding profitable card flips and identifying silver cards at target buy-order prices for the gold pack exchange program.
|
||||
|
||||
## Changes
|
||||
|
||||
### New Features
|
||||
|
||||
- **`scan` command** — Concurrent market scanner that finds profitable flip opportunities. Supports filters for rarity, team, position, budget, and sorting by profit/margin. Includes watch mode for repeated scans and optional Discord webhook alerts.
|
||||
- **`exchange` command** — Scans for silver cards (OVR 77-79) priced within configurable buy-order gates for the gold pack exchange program. Tiers: 79 OVR (target 170/max 175), 78 OVR (target 140/max 145), 77 OVR (target 117/max 122). Groups results by OVR with color-coded target/OK status.
|
||||
- **`detail` command** — Shows price history and recent sales for a specific card by name or UUID.
|
||||
- **`meta` command** — Lists available series, brands, and sets for use as filter values.
|
||||
- OVR-based price floor calculation for live and non-live series cards
|
||||
- 10% Community Market tax built into all profit calculations
|
||||
- Handles API price format inconsistencies (integers vs comma-formatted strings)
|
||||
- HTTP client with 429 retry handling
|
||||
|
||||
## Deployment Notes
|
||||
|
||||
- No server deployment — runs locally via `cargo run -- <subcommand>`
|
||||
- API is public at `https://mlb26.theshow.com/apis/` — no auth required
|
||||
- No tests or CI configured yet
|
||||
@ -1,45 +0,0 @@
|
||||
---
|
||||
title: "MLB The Show Companion Automation — 2026.3.31"
|
||||
description: "Fix gold exchange navigation, add grind harness for automated buy→exchange loops, CLI cleanup."
|
||||
type: reference
|
||||
domain: gaming
|
||||
tags: [release-notes, deployment, mlb-the-show, python, automation]
|
||||
---
|
||||
|
||||
# MLB The Show Companion Automation — 2026.3.31
|
||||
|
||||
**Date:** 2026-03-31
|
||||
**Repo:** `cal/mlb-the-show-market-tracker` on Gitea
|
||||
**Branch:** `main` (merge commit `ea66e2c`)
|
||||
**Deploy method:** Local script — `uv run scripts/grind.py`
|
||||
|
||||
## Release Summary
|
||||
|
||||
Major fixes to the companion app automation (`grind.py`). The gold exchange navigation was broken — the script thought it had entered the card grid when it was still on the exchange selection list. Added a new `grind` command that orchestrates the full buy→exchange loop with multi-tier OVR rotation.
|
||||
|
||||
## Changes
|
||||
|
||||
### Bug Fixes
|
||||
- Fixed `_is_on_exchange_grid()` to require `Exchange Value` card labels, distinguishing the card grid from the Exchange Players list page (`d4c038b`)
|
||||
- Added retry loop (3 attempts, 2s apart) in `ensure_on_exchange_grid()` for variable load times
|
||||
- Added `time.sleep(2)` after tapping into the Gold Exchange grid
|
||||
- Removed low-OVR bail logic — the grid is sorted ascending, so bail fired on first screen before scrolling to profitable cards
|
||||
- Fixed buy-orders market scroll — retry loop attempts up to 10 scrolls before giving up (was 1) (`6912a7e`). Note: scroll method itself was still broken (KEYCODE_PAGE_DOWN); fixed in 2026.4.01 release.
|
||||
- Restored `_has_low_ovr_cards` fix lost during PR #2 merge (`c29af78`)
|
||||
|
||||
### New Features
|
||||
- **`grind` command** — automated buy→exchange loop with OVR tier rotation (`6912a7e`)
|
||||
- Rotates through OVR tiers in descending order (default: 79, 78, 77)
|
||||
- Buys 2 tiers per round, then exchanges all available dupes
|
||||
- Flags: `--ovrs`, `--rounds`, `--max-players`, `--max-price`, `--budget`, `--max-packs`
|
||||
- Per-round and cumulative summary output
|
||||
- Clean Ctrl+C handling with final totals
|
||||
|
||||
### CLI Changes
|
||||
- Renamed `grind` → `exchange` (bulk exchange command)
|
||||
- Removed redundant single-exchange command (use `exchange 1` instead)
|
||||
- `grind` now refers to the full buy→exchange orchestration loop
|
||||
|
||||
## Known Issues
|
||||
- Default price gates (`MAX_BUY_PRICES`) may be too low during market inflation periods. Current gates: 79→170, 78→140, 77→125. Use `--max-price` to override.
|
||||
- No order fulfillment polling — the grind loop relies on natural timing (2 buy rounds ≈ 2-5 min gives orders time to fill)
|
||||
@ -1,26 +0,0 @@
|
||||
---
|
||||
title: "MLB The Show Companion Automation — 2026.4.01"
|
||||
description: "Fix buy-orders scroll to use touch swipes, optimize exchange card selection."
|
||||
type: reference
|
||||
domain: gaming
|
||||
tags: [release-notes, deployment, mlb-the-show, python, automation]
|
||||
---
|
||||
|
||||
# MLB The Show Companion Automation — 2026.4.01
|
||||
|
||||
**Date:** 2026-04-01
|
||||
**Repo:** `cal/mlb-the-show-market-tracker` on Gitea
|
||||
**Branch:** `main` (latest `f15e98a`)
|
||||
**Deploy method:** Local script — `uv run scripts/grind.py`
|
||||
|
||||
## Release Summary
|
||||
|
||||
Two fixes to the companion app automation. The buy-orders command couldn't scroll through the market list because it used keyboard events instead of touch swipes. The exchange command now stops selecting cards once it has enough points for a pack.
|
||||
|
||||
## Changes
|
||||
|
||||
### Bug Fixes
|
||||
- **Fixed buy-orders market scrolling** — replaced `KEYCODE_PAGE_DOWN` (keyboard event ignored by WebView) with `scroll_load_jiggle()` which uses touch swipes + a reverse micro-swipe to trigger lazy loading. This matches the working exchange scroll strategy. (`49fe7b6`)
|
||||
|
||||
### Optimizations
|
||||
- **Early break in exchange card selection** — the selection loop now stops as soon as accumulated points meet the exchange threshold, avoiding unnecessary taps on additional card types the app won't consume. (`f15e98a`)
|
||||
@ -1,528 +0,0 @@
|
||||
#!/usr/bin/env bash
|
||||
# homelab-audit.sh — SSH-based homelab health audit
|
||||
#
|
||||
# Runs on the Proxmox host. Discovers running LXCs and VMs, SSHes into each
|
||||
# to collect system metrics, then generates a summary report.
|
||||
#
|
||||
# Usage:
|
||||
# homelab-audit.sh [--output-dir DIR] [--hosts label:ip,label:ip,...]
|
||||
#
|
||||
# Environment overrides:
|
||||
# STUCK_PROC_CPU_WARN CPU% at which a D-state process is flagged (default: 10)
|
||||
# REPORT_DIR Output directory for per-host reports and logs
|
||||
# SSH_USER Remote user (default: root)
|
||||
|
||||
# -e omitted intentionally — unreachable hosts should not abort the full audit
set -uo pipefail

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# Threshold for flagging D-state (uninterruptible sleep) processes; env-overridable.
STUCK_PROC_CPU_WARN="${STUCK_PROC_CPU_WARN:-10}"
REPORT_DIR="${REPORT_DIR:-/tmp/homelab-audit-$(date +%Y%m%d-%H%M%S)}"
SSH_USER="${SSH_USER:-root}"
# BatchMode=yes stops ssh from hanging on an interactive password prompt.
SSH_OPTS="-o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o BatchMode=yes"

# Fixed finding thresholds, consumed by parse_and_report.
DISK_WARN=80    # disk usage %  -> WARN
DISK_CRIT=90    # disk usage %  -> CRIT
LOAD_WARN=2.0   # load average  -> WARN (float; compared via awk)
MEM_WARN=85     # memory use %  -> WARN
ZOMBIE_WARN=1   # zombie count  -> WARN
SWAP_WARN=512   # swap use (MB) -> WARN
HOSTS_FILTER="" # comma-separated host list from --hosts; empty = audit all
JSON_OUTPUT=0 # set to 1 by --json
|
||||
|
||||
# Parse command-line options (see usage in the file header).
while [[ $# -gt 0 ]]; do
  case "$1" in
    --output-dir)
      if [[ $# -lt 2 ]]; then
        echo "Error: --output-dir requires an argument" >&2
        exit 1
      fi
      REPORT_DIR="$2"
      shift 2
      ;;
    --hosts)
      if [[ $# -lt 2 ]]; then
        echo "Error: --hosts requires an argument" >&2
        exit 1
      fi
      HOSTS_FILTER="$2"
      shift 2
      ;;
    --json)
      JSON_OUTPUT=1
      shift
      ;;
    *)
      echo "Unknown option: $1" >&2
      exit 1
      ;;
  esac
done

# Report directory must exist before any findings/log writes below.
mkdir -p "$REPORT_DIR"
SSH_FAILURES_LOG="$REPORT_DIR/ssh-failures.log"
FINDINGS_FILE="$REPORT_DIR/findings.txt"
AUDITED_HOSTS=() # populated in main; used by generate_summary for per-host counts
|
||||
|
||||
# ---------------------------------------------------------------------------
# Remote collector script
#
# Kept single-quoted so no local variables are interpolated into the heredoc.
# STUCK_PROC_CPU_WARN is passed as $1 when invoking the remote bash session,
# so the configurable threshold reaches the collector without escaping issues.
#
# Output format, parsed by parse_and_report: one KEY=value line per metric,
# followed by one "DISK <pct> <mountpoint>" line per mounted filesystem.
# ---------------------------------------------------------------------------
COLLECTOR_SCRIPT='#!/usr/bin/env bash
STUCK_PROC_CPU_WARN="${1:-10}"

cpu_load() {
  uptime | awk -F"load average:" "{print \$2}" | awk -F"[, ]+" "{print \$2}"
}

mem_pct() {
  free | awk "/^Mem:/ {printf \"%.0f\", \$3/\$2*100}"
}

disk_usage() {
  df --output=pcent,target -x tmpfs -x devtmpfs 2>/dev/null | tail -n +2 | \
    while read -r pct mnt; do echo "${pct%%%} $mnt"; done
}

zombie_count() {
  ps -eo stat= | grep -c "^Z" || true
}

stuck_procs() {
  ps -eo stat=,pcpu=,comm= | \
    awk -v t="$STUCK_PROC_CPU_WARN" '\''$1 ~ /^D/ && $2+0 >= t+0 {print $3}'\'' | \
    paste -sd,
}

zombie_parents() {
  ps -eo pid=,ppid=,stat= | awk '\''$3 ~ /^Z/ {print $2}'\'' | sort -u | \
    xargs -I{} ps -o comm= -p {} 2>/dev/null | paste -sd,
}

swap_mb() {
  free | awk '\''/^Swap:/ {printf "%.0f", $3/1024; found=1} END {if (!found) print "0"}'\''
}

oom_events() {
  local count
  count=$(journalctl -k --since "7 days ago" 2>/dev/null | grep -ci "out of memory") || true
  echo "${count:-0}"
}

io_wait_pct() {
  vmstat 1 2 2>/dev/null | tail -1 | awk '\''{print $16}'\''
}

echo "CPU_LOAD=$(cpu_load)"
echo "MEM_PCT=$(mem_pct)"
echo "ZOMBIES=$(zombie_count)"
echo "STUCK_PROCS=$(stuck_procs)"
echo "ZOMBIE_PARENTS=$(zombie_parents)"
echo "SWAP_MB=$(swap_mb)"
echo "OOM_EVENTS=$(oom_events)"
echo "IO_WAIT=$(io_wait_pct)"
disk_usage | while read -r pct mnt; do
  echo "DISK $pct $mnt"
done
'
|
||||
|
||||
# ---------------------------------------------------------------------------
# SSH helper — logs stderr to ssh-failures.log instead of silently discarding
# ---------------------------------------------------------------------------
# ssh_cmd HOST [CMD...] — run CMD on HOST as $SSH_USER using the shared SSH
# options. stderr is appended to $SSH_FAILURES_LOG; the ssh exit status is
# propagated to the caller.
ssh_cmd() {
  local host="$1"
  shift
  # SSH_OPTS is intentionally unquoted so each "-o key=value" word-splits.
  # shellcheck disable=SC2086
  ssh $SSH_OPTS "${SSH_USER}@${host}" "$@" 2>>"$SSH_FAILURES_LOG"
}
|
||||
|
||||
# ---------------------------------------------------------------------------
# LXC IP discovery
#
# lxc-info only reports IPs for containers on Proxmox-managed DHCP bridges;
# containers whose static IP lives in the container config return nothing.
# In that case, fall back to scraping the "ip=" field out of `pct config`.
# ---------------------------------------------------------------------------
# get_lxc_ip CTID — print the container's IPv4 address, or nothing if it
# cannot be discovered by either method.
get_lxc_ip() {
  local ctid="$1"
  local addr

  # Primary source: the container runtime's first reported IP.
  addr=$(lxc-info -n "$ctid" -iH 2>/dev/null | head -1)

  # Fallback: first "ip=..." entry in the Proxmox container config.
  [[ -z "$addr" ]] &&
    addr=$(pct config "$ctid" 2>/dev/null | grep -oP '(?<=ip=)[^/,]+' | head -1)

  echo "$addr"
}
|
||||
|
||||
# ---------------------------------------------------------------------------
# Inventory: running LXCs and VMs
# Returns lines of "label ip", e.g. "lxc-225 10.10.0.225" / "vm-115 10.10.0.88"
# ---------------------------------------------------------------------------
collect_inventory() {
  # LXCs — every running container with a discoverable IP. Guests whose IP
  # cannot be found are silently omitted from the inventory.
  pct list 2>/dev/null | tail -n +2 | while read -r ctid status _name; do
    [[ "$status" != "running" ]] && continue
    local ip
    ip=$(get_lxc_ip "$ctid")
    [[ -n "$ip" ]] && echo "lxc-${ctid} $ip"
  done

  # VMs — first non-loopback IPv4 reported by the QEMU guest agent. VMs
  # without a responsive agent are skipped (no qm-config fallback exists).
  qm list 2>/dev/null | tail -n +2 | while read -r vmid _name status _mem _bootdisk _pid; do
    [[ "$status" != "running" ]] && continue
    local ip
    ip=$(qm guest cmd "$vmid" network-get-interfaces 2>/dev/null |
      python3 -c "
import sys, json
try:
    data = json.load(sys.stdin)
    for iface in data:
        for addr in iface.get('ip-addresses', []):
            if addr['ip-address-type'] == 'ipv4' and not addr['ip-address'].startswith('127.'):
                print(addr['ip-address'])
                raise SystemExit
except Exception:
    pass
" 2>/dev/null)
    [[ -n "$ip" ]] && echo "vm-${vmid} $ip"
  done
}
|
||||
|
||||
# ---------------------------------------------------------------------------
# Collect metrics from one host and record findings
# ---------------------------------------------------------------------------
# parse_and_report LABEL ADDR — pipe the collector script to ADDR over SSH,
# then translate each "KEY=value" / "DISK pct mount" output line into
# WARN/CRIT/INFO lines appended to $FINDINGS_FILE. An SSH failure itself
# becomes a WARN finding plus an entry in $SSH_FAILURES_LOG.
parse_and_report() {
  local label="$1"
  local addr="$2"
  local raw

  # The threshold is passed as the remote script's $1 (COLLECTOR_SCRIPT is
  # single-quoted, so nothing interpolates locally).
  if ! raw=$(echo "$COLLECTOR_SCRIPT" | ssh_cmd "$addr" bash -s -- "$STUCK_PROC_CPU_WARN"); then
    echo "SSH_FAILURE $label $addr" >>"$SSH_FAILURES_LOG"
    echo "WARN $label: SSH connection failed" >>"$FINDINGS_FILE"
    return
  fi

  while IFS= read -r line; do
    case "$line" in
      CPU_LOAD=*)
        local load="${line#CPU_LOAD=}"
        # Float comparison is delegated to awk; bash arithmetic is integer-only.
        if [[ -n "$load" ]] && awk "BEGIN{exit !($load > $LOAD_WARN)}"; then
          echo "WARN $label: load average ${load} > ${LOAD_WARN}" >>"$FINDINGS_FILE"
        fi
        ;;
      MEM_PCT=*)
        local mem="${line#MEM_PCT=}"
        if [[ -n "$mem" ]] && ((mem >= MEM_WARN)); then
          echo "WARN $label: memory ${mem}% >= ${MEM_WARN}%" >>"$FINDINGS_FILE"
        fi
        ;;
      ZOMBIES=*)
        local zombies="${line#ZOMBIES=}"
        if [[ -n "$zombies" ]] && ((zombies >= ZOMBIE_WARN)); then
          echo "WARN $label: ${zombies} zombie process(es)" >>"$FINDINGS_FILE"
        fi
        ;;
      STUCK_PROCS=*)
        # Non-empty value is a comma-separated list of D-state process names.
        local procs="${line#STUCK_PROCS=}"
        if [[ -n "$procs" ]]; then
          echo "WARN $label: D-state procs with CPU>=${STUCK_PROC_CPU_WARN}%: ${procs}" >>"$FINDINGS_FILE"
        fi
        ;;
      ZOMBIE_PARENTS=*)
        # Informational only — helps identify which service leaks zombies.
        local zparents="${line#ZOMBIE_PARENTS=}"
        if [[ -n "$zparents" ]]; then
          echo "INFO $label: zombie parent process(es): ${zparents}" >>"$FINDINGS_FILE"
        fi
        ;;
      SWAP_MB=*)
        local swap="${line#SWAP_MB=}"
        if [[ -n "$swap" ]] && ((swap >= SWAP_WARN)); then
          echo "WARN $label: swap usage ${swap} MB >= ${SWAP_WARN} MB" >>"$FINDINGS_FILE"
        fi
        ;;
      OOM_EVENTS=*)
        local ooms="${line#OOM_EVENTS=}"
        if [[ -n "$ooms" ]] && ((ooms > 0)); then
          echo "WARN $label: ${ooms} OOM kill event(s) in last 7 days" >>"$FINDINGS_FILE"
        fi
        ;;
      IO_WAIT=*)
        local iowait="${line#IO_WAIT=}"
        if [[ -n "$iowait" ]] && ((iowait > 20)); then
          echo "WARN $label: I/O wait ${iowait}% > 20%" >>"$FINDINGS_FILE"
        fi
        ;;
      DISK\ *)
        # "DISK <pct> <mountpoint>" — one line per mounted filesystem.
        local pct mnt
        read -r _ pct mnt <<<"$line"
        if ((pct >= DISK_CRIT)); then
          echo "CRIT $label: disk ${mnt} at ${pct}% >= ${DISK_CRIT}%" >>"$FINDINGS_FILE"
        elif ((pct >= DISK_WARN)); then
          echo "WARN $label: disk ${mnt} at ${pct}% >= ${DISK_WARN}%" >>"$FINDINGS_FILE"
        fi
        ;;
    esac
  done <<<"$raw"
}
|
||||
|
||||
# ---------------------------------------------------------------------------
# Summary — driven by actual findings in findings.txt and ssh-failures.log
# ---------------------------------------------------------------------------
# generate_summary HOST_COUNT — print the human-readable audit summary:
# overall counters, a per-host table (only hosts with findings), the sorted
# findings list, and any SSH failures. Reads $FINDINGS_FILE,
# $SSH_FAILURES_LOG, and the AUDITED_HOSTS array populated by main.
generate_summary() {
  local host_count="$1"
  local ssh_failure_count=0
  local warn_count=0
  local crit_count=0

  # grep -c prints 0 and exits 1 on no match; "|| true" keeps the captured
  # "0" without the nonzero status leaking out of the substitution.
  [[ -f "$SSH_FAILURES_LOG" ]] &&
    ssh_failure_count=$(grep -c '^SSH_FAILURE' "$SSH_FAILURES_LOG" 2>/dev/null || true)
  [[ -f "$FINDINGS_FILE" ]] &&
    warn_count=$(grep -c '^WARN' "$FINDINGS_FILE" 2>/dev/null || true)
  [[ -f "$FINDINGS_FILE" ]] &&
    crit_count=$(grep -c '^CRIT' "$FINDINGS_FILE" 2>/dev/null || true)

  echo ""
  echo "=============================="
  echo " HOMELAB AUDIT SUMMARY"
  echo "=============================="
  printf " Hosts audited : %d\n" "$host_count"
  printf " SSH failures : %d\n" "$ssh_failure_count"
  printf " Warnings : %d\n" "$warn_count"
  printf " Critical : %d\n" "$crit_count"
  echo "=============================="

  # Per-host breakdown — only shown when something was actually found, and
  # only rows with at least one warning or critical.
  if [[ ${#AUDITED_HOSTS[@]} -gt 0 ]] && ((warn_count + crit_count > 0)); then
    echo ""
    printf " %-30s %8s %8s\n" "Host" "Warnings" "Critical"
    printf " %-30s %8s %8s\n" "----" "--------" "--------"
    for host in "${AUDITED_HOSTS[@]}"; do
      local hw hc
      hw=$(grep -c "^WARN ${host}:" "$FINDINGS_FILE" 2>/dev/null || true)
      hc=$(grep -c "^CRIT ${host}:" "$FINDINGS_FILE" 2>/dev/null || true)
      ((hw + hc > 0)) && printf " %-30s %8d %8d\n" "$host" "$hw" "$hc"
    done
  fi

  if ((warn_count + crit_count > 0)); then
    echo ""
    echo "Findings:"
    # Sorting groups each host's findings together (severity prefix first).
    sort "$FINDINGS_FILE"
  fi

  if ((ssh_failure_count > 0)); then
    echo ""
    echo "SSH failures (see $SSH_FAILURES_LOG for details):"
    grep '^SSH_FAILURE' "$SSH_FAILURES_LOG" | awk '{print " " $2 " (" $3 ")"}'
  fi

  echo ""
  printf "Total: %d warning(s), %d critical across %d host(s)\n" \
    "$warn_count" "$crit_count" "$host_count"
  echo ""
  echo "Reports: $REPORT_DIR"
}
|
||||
|
||||
# ---------------------------------------------------------------------------
# Proxmox backup recency — queries vzdump task history via pvesh (runs locally)
# ---------------------------------------------------------------------------
# check_backup_recency — for every running CT/VM, find its most recent
# successful vzdump task and append a WARN (last backup older than 7 days)
# or CRIT (no backup in the task history) finding. Runs on the Proxmox host
# itself — no SSH involved.
check_backup_recency() {
  local tasks_json_file="$REPORT_DIR/vzdump-tasks.json"
  # NOTE(review): --limit 50 caps the scanned task history; on a node with
  # many guests a recent backup could fall outside that window — confirm.
  pvesh get /nodes/proxmox/tasks --typefilter vzdump --limit 50 --output-format json \
    >"$tasks_json_file" 2>/dev/null || {
    echo "WARN proxmox: failed to query vzdump task history" >>"$FINDINGS_FILE"
    return
  }

  # Empty response — nothing to evaluate.
  [[ ! -s "$tasks_json_file" ]] && return

  # IDs of every running container and VM; only running guests are checked.
  local running_ids=()
  while read -r ctid; do
    running_ids+=("$ctid")
  done < <(pct list 2>/dev/null | awk 'NR>1 && $2=="running"{print $1}')
  while read -r vmid; do
    running_ids+=("$vmid")
  done < <(qm list 2>/dev/null | awk 'NR>1 && $3=="running"{print $1}')

  [[ ${#running_ids[@]} -eq 0 ]] && return

  local week_ago
  week_ago=$(($(date +%s) - 7 * 86400))

  # The embedded Python prints findings straight into $FINDINGS_FILE. Note
  # the "proxmox/vm-<id>" label is used for containers as well as VMs.
  python3 - "$tasks_json_file" "$week_ago" "${running_ids[@]}" <<'PYEOF' >>"$FINDINGS_FILE"
import sys, json, datetime

tasks_file, week_ago = sys.argv[1], int(sys.argv[2])
running_ids = set(sys.argv[3:])

try:
    tasks = json.load(open(tasks_file))
except Exception:
    sys.exit(0)

last_backup = {}
for task in tasks:
    if task.get("type") != "vzdump" or task.get("status") != "OK":
        continue
    vmid = str(task.get("id", ""))
    endtime = int(task.get("endtime", 0))
    if vmid and endtime and endtime > last_backup.get(vmid, 0):
        last_backup[vmid] = endtime

for vmid in sorted(running_ids):
    ts = last_backup.get(vmid)
    if ts and ts >= week_ago:
        pass
    elif ts:
        dt = datetime.datetime.fromtimestamp(ts).strftime("%Y-%m-%d")
        print(f"WARN proxmox/vm-{vmid}: last backup {dt} is older than 7 days")
    else:
        print(f"CRIT proxmox/vm-{vmid}: no backup found in task history")
PYEOF
}
|
||||
|
||||
# ---------------------------------------------------------------------------
# Certificate expiry check — runs from the audit host via openssl
# ---------------------------------------------------------------------------
# check_cert_expiry LABEL ADDR — probe ADDR on common HTTPS ports and record
# a WARN (<= 14 days left) or CRIT (<= 7 days left) finding for any served
# TLS certificate nearing expiry. Ports that do not answer TLS, or whose
# expiry date cannot be parsed, are skipped silently.
check_cert_expiry() {
  local name="$1"
  local target="$2"
  local now_epoch
  now_epoch=$(date +%s)

  local port not_after expires_at remaining
  for port in 443 8443; do
    # s_client with empty stdin closes the connection immediately after the
    # handshake; x509 extracts the "notAfter=<date>" line.
    not_after=$(echo | timeout 10 openssl s_client -connect "${target}:${port}" 2>/dev/null |
      openssl x509 -noout -enddate 2>/dev/null) || continue
    [[ -z "$not_after" ]] && continue

    expires_at=$(date -d "${not_after#notAfter=}" +%s 2>/dev/null) || continue
    remaining=$(((expires_at - now_epoch) / 86400))

    if ((remaining <= 7)); then
      echo "CRIT $name: TLS cert on :${port} expires in ${remaining} days" >>"$FINDINGS_FILE"
    elif ((remaining <= 14)); then
      echo "WARN $name: TLS cert on :${port} expires in ${remaining} days" >>"$FINDINGS_FILE"
    fi
  done
}
|
||||
|
||||
# ---------------------------------------------------------------------------
# JSON report — writes findings.json to $REPORT_DIR when --json is used
# ---------------------------------------------------------------------------
# write_json_report HOST_COUNT — serialize the audit counters plus every
# parsed finding ({severity, host, message}) to $REPORT_DIR/findings.json
# and print the report path. Reads $FINDINGS_FILE and $SSH_FAILURES_LOG.
write_json_report() {
  local host_count="$1"
  local json_file="$REPORT_DIR/findings.json"
  local ssh_failure_count=0
  local warn_count=0
  local crit_count=0

  # grep -c prints 0 and exits 1 on no match; "|| true" keeps the count.
  [[ -f "$SSH_FAILURES_LOG" ]] &&
    ssh_failure_count=$(grep -c '^SSH_FAILURE' "$SSH_FAILURES_LOG" 2>/dev/null || true)
  [[ -f "$FINDINGS_FILE" ]] &&
    warn_count=$(grep -c '^WARN' "$FINDINGS_FILE" 2>/dev/null || true)
  [[ -f "$FINDINGS_FILE" ]] &&
    crit_count=$(grep -c '^CRIT' "$FINDINGS_FILE" 2>/dev/null || true)

  python3 - "$json_file" "$host_count" "$ssh_failure_count" \
    "$warn_count" "$crit_count" "$FINDINGS_FILE" <<'PYEOF'
import sys, json, datetime

json_file = sys.argv[1]
host_count = int(sys.argv[2])
ssh_failure_count = int(sys.argv[3])
warn_count = int(sys.argv[4])
crit_count = int(sys.argv[5])
findings_file = sys.argv[6]

# Parse "SEVERITY host: message" lines; malformed/blank lines are skipped.
findings = []
try:
    with open(findings_file) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            parts = line.split(None, 2)
            if len(parts) < 3:
                continue
            severity, host_colon, message = parts[0], parts[1], parts[2]
            findings.append({
                "severity": severity,
                "host": host_colon.rstrip(":"),
                "message": message,
            })
except FileNotFoundError:
    pass

output = {
    # datetime.utcnow() is deprecated (Python 3.12+); build the identical
    # naive-UTC "...Z" timestamp from an aware datetime instead.
    "timestamp": datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None).isoformat() + "Z",
    "hosts_audited": host_count,
    "warnings": warn_count,
    "critical": crit_count,
    "ssh_failures": ssh_failure_count,
    "total_findings": warn_count + crit_count,
    "findings": findings,
}

with open(json_file, "w") as f:
    json.dump(output, f, indent=2)
print(f"JSON report: {json_file}")
PYEOF
}
|
||||
|
||||
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
# main [args] — print the run header, then either audit the hosts given via
# --hosts (entries are "label:ip"; see usage) or discover and audit every
# running LXC/VM from the Proxmox inventory. Always prints the summary;
# writes findings.json when --json was given.
main() {
  echo "Starting homelab audit — $(date)"
  echo "Report dir: $REPORT_DIR"
  echo "STUCK_PROC_CPU_WARN threshold: ${STUCK_PROC_CPU_WARN}%"
  [[ -n "$HOSTS_FILTER" ]] && echo "Host filter: $HOSTS_FILTER"
  echo ""

  # Truncate any findings file left over from a previous run into this dir.
  >"$FINDINGS_FILE"

  local host_count=0

  if [[ -n "$HOSTS_FILTER" ]]; then
    # --hosts mode: audit specified hosts directly, skip Proxmox inventory.
    # Each entry is "label:ip" (per the usage header and the parsing covered
    # by test-audit-collectors.sh). BUG FIX: previously the unsplit entry
    # was used as both label and SSH address, so ssh targeted
    # "vm-115:10.10.0.88" and always failed. A bare entry without a colon
    # still behaves as before (label == address).
    local check_proxmox=0
    local label addr
    IFS=',' read -ra filter_hosts <<<"$HOSTS_FILTER"
    for host in "${filter_hosts[@]}"; do
      [[ "${host%%:*}" == "proxmox" ]] && check_proxmox=1
    done
    if ((check_proxmox)); then
      echo " Checking Proxmox backup recency..."
      check_backup_recency
    fi
    for host in "${filter_hosts[@]}"; do
      label="${host%%:*}"
      addr="${host#*:}" # equals $host when no colon is present
      echo " Auditing $label ($addr)..."
      parse_and_report "$label" "$addr"
      check_cert_expiry "$label" "$addr"
      AUDITED_HOSTS+=("$label")
      ((host_count++)) || true
    done
  else
    echo " Checking Proxmox backup recency..."
    check_backup_recency

    while read -r label addr; do
      echo " Auditing $label ($addr)..."
      parse_and_report "$label" "$addr"
      check_cert_expiry "$label" "$addr"
      AUDITED_HOSTS+=("$label")
      ((host_count++)) || true
    done < <(collect_inventory)
  fi

  generate_summary "$host_count"
  # BUG FIX: this was "[[ ... ]] && write_json_report ..." as the last
  # command, making a successful run exit 1 whenever --json was not given.
  if [[ "$JSON_OUTPUT" -eq 1 ]]; then
    write_json_report "$host_count"
  fi
}

main "$@"
|
||||
@ -1,126 +0,0 @@
|
||||
#!/usr/bin/env bash
# test-audit-collectors.sh — validates homelab-audit.sh collector output format
#
# Re-implements each collector function inline and runs it locally, checking
# that output matches the expected format. Exits non-zero on any failure.

set -euo pipefail

# Running tallies, updated by pass()/fail() and reported at the end.
PASS=0
FAIL=0

# pass MESSAGE — record and print a passing check.
pass() {
  ((PASS++)) || true
  echo " PASS: $1"
}
# fail NAME DETAIL — record and print a failing check.
fail() {
  ((FAIL++)) || true
  echo " FAIL: $1 — $2"
}
|
||||
|
||||
echo "=== Collector output format tests ==="

# Run each collector function locally and validate output format
# These functions are designed to work on any Linux host

# --- cpu_load: expects a bare (possibly fractional) number ---
result=$(uptime | awk -F'load average:' '{print $2}' | awk -F'[, ]+' '{print $2}')
if [[ "$result" =~ ^[0-9]+\.?[0-9]*$ ]]; then
  pass "cpu_load returns numeric value: $result"
else
  fail "cpu_load" "expected numeric, got: '$result'"
fi

# --- mem_pct: expects an integer percentage in [0, 100] ---
result=$(free | awk '/^Mem:/ {printf "%.0f", $3/$2*100}')
if [[ "$result" =~ ^[0-9]+$ ]] && ((result >= 0 && result <= 100)); then
  pass "mem_pct returns percentage: $result"
else
  fail "mem_pct" "expected 0-100, got: '$result'"
fi

# --- zombie_count: expects a non-negative integer ---
result=$(ps -eo stat= | grep -c "^Z" || true)
if [[ "$result" =~ ^[0-9]+$ ]]; then
  pass "zombie_count returns integer: $result"
else
  fail "zombie_count" "expected integer, got: '$result'"
fi

# --- zombie_parents ---
# May be empty if no zombies — that's valid
result=$(ps -eo pid=,ppid=,stat= | awk '$3 ~ /^Z/ {print $2}' | sort -u |
  xargs -I{} ps -o comm= -p {} 2>/dev/null | paste -sd, || true)
if [[ -z "$result" || "$result" =~ ^[a-zA-Z0-9_.,/-]+$ ]]; then
  pass "zombie_parents returns csv or empty: '${result:-<empty>}'"
else
  fail "zombie_parents" "unexpected format: '$result'"
fi

# --- swap_mb: expects an integer number of megabytes ---
result=$(free | awk '/^Swap:/ {printf "%.0f", $3/1024}')
if [[ "$result" =~ ^[0-9]+$ ]]; then
  pass "swap_mb returns integer MB: $result"
else
  fail "swap_mb" "expected integer, got: '$result'"
fi

# --- oom_events: grep -c may exit 1 on zero matches, so default to 0 ---
result=$(journalctl -k --since "7 days ago" 2>/dev/null | grep -ci "out of memory") || true
result="${result:-0}"
if [[ "$result" =~ ^[0-9]+$ ]]; then
  pass "oom_events returns integer: $result"
else
  fail "oom_events" "expected integer, got: '$result'"
fi

# --- stuck_procs ---
# May be empty — that's valid
result=$(ps -eo stat=,pcpu=,comm= |
  awk '$1 ~ /^D/ && $2+0 >= 10 {print $3}' | paste -sd, || true)
if [[ -z "$result" || "$result" =~ ^[a-zA-Z0-9_.,/-]+$ ]]; then
  pass "stuck_procs returns csv or empty: '${result:-<empty>}'"
else
  fail "stuck_procs" "unexpected format: '$result'"
fi

# --- disk_usage format: first line should be "<pct> /<mount>" ---
result=$(df --output=pcent,target -x tmpfs -x devtmpfs 2>/dev/null | tail -n +2 | head -1 |
  while read -r pct mnt; do echo "${pct%%%} $mnt"; done)
if [[ "$result" =~ ^[0-9]+\ / ]]; then
  pass "disk_usage returns 'pct mount' format: $result"
else
  fail "disk_usage" "expected 'N /path', got: '$result'"
fi

# --- --hosts flag parsing ---
echo ""
echo "=== --hosts argument parsing tests ==="

# Single host — entries are "label:ip"; split on the first colon.
input="vm-115:10.10.0.88"
IFS=',' read -ra entries <<<"$input"
label="${entries[0]%%:*}"
addr="${entries[0]#*:}"
if [[ "$label" == "vm-115" && "$addr" == "10.10.0.88" ]]; then
  pass "--hosts single entry parsed: $label $addr"
else
  fail "--hosts single" "expected 'vm-115 10.10.0.88', got: '$label $addr'"
fi

# Multiple hosts — comma-separated list of "label:ip" entries.
input="vm-115:10.10.0.88,lxc-225:10.10.0.225"
IFS=',' read -ra entries <<<"$input"
label1="${entries[0]%%:*}"
addr1="${entries[0]#*:}"
label2="${entries[1]%%:*}"
addr2="${entries[1]#*:}"
if [[ "$label1" == "vm-115" && "$addr1" == "10.10.0.88" && "$label2" == "lxc-225" && "$addr2" == "10.10.0.225" ]]; then
  pass "--hosts multi entry parsed: $label1 $addr1, $label2 $addr2"
else
  fail "--hosts multi" "unexpected parse result"
fi

echo ""
echo "=== Results: $PASS passed, $FAIL failed ==="
# Exit status: 0 only when every check passed.
((FAIL == 0))
|
||||
@ -92,42 +92,6 @@ CT 302 does **not** have an SSH key registered with Gitea, so SSH git remotes wo
|
||||
3. Commit to Gitea, pull on CT 302
|
||||
4. Add Uptime Kuma monitors if desired
|
||||
|
||||
## Health Check Thresholds
|
||||
|
||||
Thresholds are evaluated in `health_check.py`. All load thresholds use **per-core** metrics
|
||||
to avoid false positives from LXC containers (which see the Proxmox host's aggregate load).
|
||||
|
||||
### Load Average
|
||||
|
||||
| Metric | Value | Rationale |
|
||||
|--------|-------|-----------|
|
||||
| `LOAD_WARN_PER_CORE` | `0.7` | Elevated — investigate if sustained |
|
||||
| `LOAD_CRIT_PER_CORE` | `1.0` | Saturated — CPU is a bottleneck |
|
||||
| Sample window | 5-minute | Filters transient spikes (not 1-minute) |
|
||||
|
||||
**Formula**: `load_per_core = load_5m / nproc`
|
||||
|
||||
**Why per-core?** Proxmox LXC containers see the host's aggregate load average via the
|
||||
shared kernel. A 32-core Proxmox host at load 9 is at 0.28/core (healthy), but a naive
|
||||
absolute threshold of 2× would trigger at 9 for a 4-core LXC. Using `load_5m / nproc`
|
||||
where `nproc` returns the host's visible core count gives the correct ratio.
|
||||
|
||||
**Validation examples**:
|
||||
- Proxmox host: load 9 / 32 cores = 0.28/core → no alert ✓
|
||||
- VM 116 at 0.75/core → warning ✓ (above 0.7 threshold)
|
||||
- VM at 1.1/core → critical ✓
|
||||
|
||||
### Other Thresholds
|
||||
|
||||
| Check | Threshold | Notes |
|
||||
|-------|-----------|-------|
|
||||
| Zombie processes | 5 | Single zombies are transient noise; alert only if ≥ 5 |
|
||||
| Swap usage | 30% of total swap | Percentage-based to handle varied swap sizes across hosts |
|
||||
| Disk warning | 85% | |
|
||||
| Disk critical | 95% | |
|
||||
| Memory | 90% | |
|
||||
| Uptime alert | Non-urgent Discord post | Not a page-level alert |
|
||||
|
||||
## Related
|
||||
|
||||
- [monitoring/CONTEXT.md](../CONTEXT.md) — Overall monitoring architecture
|
||||
|
||||
@ -3,7 +3,7 @@ title: "Networking Infrastructure Context"
|
||||
description: "Architecture patterns and best practices for homelab networking including reverse proxy, SSH key management, DNS, SSL/TLS, network segmentation, and CIFS mounts."
|
||||
type: context
|
||||
domain: networking
|
||||
tags: [nginx, ssh, dns, ssl, vlan, cifs, reverse-proxy, firewall]
|
||||
tags: [nginx, ssh, dns, ssl, vlan, cifs, reverse-proxy, firewall, networking]
|
||||
---
|
||||
|
||||
# Networking Infrastructure - Technology Context
|
||||
|
||||
@ -47,13 +47,12 @@ home_network:
|
||||
services: ["media", "transcoding"]
|
||||
description: "Tdarr media transcoding"
|
||||
|
||||
# DECOMMISSIONED: vpn_docker (10.10.0.121) - VM 105 destroyed 2026-04
|
||||
# vpn_docker:
|
||||
# hostname: "10.10.0.121"
|
||||
# port: 22
|
||||
# user: "cal"
|
||||
# services: ["vpn", "docker"]
|
||||
# description: "VPN and Docker services"
|
||||
vpn_docker:
|
||||
hostname: "10.10.0.121"
|
||||
port: 22
|
||||
user: "cal"
|
||||
services: ["vpn", "docker"]
|
||||
description: "VPN and Docker services"
|
||||
|
||||
remote_servers:
|
||||
akamai_nano:
|
||||
|
||||
@ -23,7 +23,7 @@ servers:
|
||||
pihole: 10.10.0.16 # Pi-hole DNS and ad blocking
|
||||
sba_pd_bots: 10.10.0.88 # SBa and PD bot services
|
||||
tdarr: 10.10.0.43 # Media transcoding
|
||||
# vpn_docker: 10.10.0.121 # DECOMMISSIONED — VM 105 destroyed, migrated to arr-stack LXC 221
|
||||
vpn_docker: 10.10.0.121 # VPN and Docker services
|
||||
```
|
||||
|
||||
### Cloud Servers
|
||||
@ -175,12 +175,11 @@ Host tdarr media
|
||||
Port 22
|
||||
IdentityFile ~/.ssh/homelab_rsa
|
||||
|
||||
# DECOMMISSIONED: docker-vpn (10.10.0.121) - VM 105 destroyed, migrated to arr-stack LXC 221
|
||||
# Host docker-vpn
|
||||
# HostName 10.10.0.121
|
||||
# User cal
|
||||
# Port 22
|
||||
# IdentityFile ~/.ssh/homelab_rsa
|
||||
Host docker-vpn
|
||||
HostName 10.10.0.121
|
||||
User cal
|
||||
Port 22
|
||||
IdentityFile ~/.ssh/homelab_rsa
|
||||
|
||||
# Remote Cloud Servers
|
||||
Host akamai-nano akamai
|
||||
|
||||
@ -1,96 +0,0 @@
|
||||
---
|
||||
title: "Paper Dynasty Ecosystem Organization & Growth Engine Foundation"
|
||||
description: "Major organizational session — built cross-project infrastructure, cleared PR backlog (21 merged, 15 closed), established PO agent hierarchy, initiative tracker, and Refractor naming."
|
||||
type: context
|
||||
domain: paper-dynasty
|
||||
tags: [paper-dynasty, organization, infrastructure, agents, roadmap]
|
||||
---
|
||||
|
||||
# Paper Dynasty Ecosystem Organization & Growth Engine Foundation
|
||||
|
||||
**Date:** 2026-03-22
|
||||
**Scope:** Cross-project (all Paper Dynasty repos)
|
||||
|
||||
## What Was Done
|
||||
|
||||
Transformed Paper Dynasty from a collection of loosely related repos into an orchestrated product ecosystem with specialized agents, a growth roadmap, and a local initiative tracker.
|
||||
|
||||
### Directory Cleanup
|
||||
1. **Archived stale files** — moved `pd_master.db`, `gameplay.db`, `discord.log`, `wpa_data.csv`, `PROJECT_PLAN.json`, `pr-card-run.md` to `.archive/`
|
||||
2. **Deleted dead projects** — `gameplay-webapp/` (superseded by multi-league project, remote up to date) and `gameplay-website/` (no git repo, abandoned predecessor)
|
||||
3. **Archived `database-checks/`** — one-off SQL scripts from June 2025 moved to `.archive/`
|
||||
4. **Created `.archive/` directory** — `dev-storage/` was previously misused for archival (it's a Docker shared volume mount)
|
||||
|
||||
### Ecosystem Documentation
|
||||
5. **Created parent `CLAUDE.md`** — full ecosystem map with architecture diagram, deployment topology, "which repo do I change?" routing table, cross-project conventions
|
||||
6. **Created `ROADMAP.md`** — 3-phase growth roadmap (Foundation → Engagement → Growth) with specific initiatives, sizing, impact categories, and engagement metrics framework. Grounded in actual Gitea issue numbers.
|
||||
7. **Fixed SQLite references** — updated all docs and agent definitions to reflect PostgreSQL (both prod and dev are now PG)
|
||||
|
||||
### Agent Architecture
|
||||
8. **Created 5 specialized agents** in `~/.claude/agents/`:
|
||||
- `pd-database` (Opus) — Product Owner for Database API
|
||||
- `pd-discord` (Opus) — Product Owner for Discord Bot
|
||||
- `pd-cards` (Opus) — Product Owner for Card Pipeline
|
||||
- `pd-growth` (Opus) — Growth Product Owner (cross-project strategy)
|
||||
- `pd-ops` (Sonnet) — Release operations (merge workflow, deploys, process)
|
||||
9. **PO agent hierarchy** — Opus agents decide what to build, `pd-ops` ensures it ships correctly, implementation delegated to existing `engineer`/`issue-worker`/`swarm-coder` agents
|
||||
|
||||
### PR Backlog Clearance
|
||||
10. **Merged 21 PRs across 3 repos:**
|
||||
- Card-creation: 8 PRs (#29 security, #38, #39, #37, #35, #40, #43, #45 — bug fixes)
|
||||
- Discord: 9 PRs (#86, #84, #85, #89 crash fixes; #104 dead code cleanup; #105 duplicate file removal; #106 gameplay fixes; #107 check-in + paperdex)
|
||||
- Database: 2 PRs (#103 paperdex timeout, #95 benchmark script)
|
||||
11. **Closed 15 PRs:**
|
||||
- Card-creation: 2 superseded (#42, #44 — covered by #29)
|
||||
- Discord: 13 stale (#61-#71 targeting dead `next-release` branch, #96 superseded by individual WP PRs)
|
||||
12. **Re-implemented 11 fixes fresh** — the stale Discord PRs had unrebaseable conflicts from a scouting refactor; dispatched issue-worker agents to re-implement against current `main` in 4 batched PRs
|
||||
13. **PR reviewer caught 2 real bugs** — rarity param silently ignored by API (#106), None crash in paperdex dupe detection (#107). Both fixed before merge.
|
||||
|
||||
### Skill & Tooling Upgrades
|
||||
14. **Upgraded `/paper-dynasty` skill** — added ecosystem dashboard script, growth roadmap reference, specialized agent dispatch table, initiative tracker documentation
|
||||
15. **Created `ecosystem_status.sh`** — cross-repo dashboard pulling issues/PRs/commits from all 6 Gitea repos
|
||||
16. **Built `pd-plan` initiative tracker** — SQLite-based CLI at `~/.claude/skills/paper-dynasty/plan/cli.py` with 17 seeded initiatives, activity logging, `--json` output, filtering by phase/repo/status/impact
|
||||
|
||||
### Issues Created
|
||||
17. **6 new Gitea issues:**
|
||||
- Database #122: Remove legacy SQLite compatibility code
|
||||
- Database #123: Update database CLAUDE.md
|
||||
- Database #124: Rewrite skill scripts to use API instead of SQLite
|
||||
- Discord #108: Resolve ruff pre-commit hook vs pre-existing violations
|
||||
|
||||
## Decisions
|
||||
|
||||
### PO Agent Model: Opus for Strategy, Sonnet for Execution
|
||||
PD-specific agents run on Opus for strategic reasoning (backlog prioritization, design decisions, cross-project coordination). Implementation and ops use Sonnet. This avoids wasting Opus tokens on mechanical work while ensuring product decisions get deep reasoning.
|
||||
|
||||
### "Refractor" Naming for Card Progression
|
||||
The card evolution system (internal codename "Evolution") is renamed to **Refractor** for all user-facing surfaces. Inspired by Topps Chrome card collecting terminology. Tier names: Base Chrome → Refractor → Gold Refractor → Superfractor. Chosen over alternatives like Breakout, Grade, Press, Callup after competitive analysis across 15+ games (MLB The Show, NBA 2K, Marvel Snap, etc.). "Evolution" was already used by NBA 2K and EA FC.
|
||||
|
||||
### next-release Branch Pattern Retired
|
||||
All work now targets `main` directly. The `next-release` staging branch pattern is no longer used. This simplified the PR backlog significantly — 12 stale PRs targeting `next-release` were closed and re-implemented against `main`.
|
||||
|
||||
### PR Review Process: Mandatory pr-reviewer Gate
|
||||
PRs must always go through a `pr-reviewer` agent before merging. Cal corrected the workflow when auto-approving was attempted. The reviewer caught 2 real bugs this session.
|
||||
|
||||
### Two-User Auth for Merges
|
||||
Claude-authored PRs → cal approves. Cal-authored PRs → Claude approves. Both tokens stored in `~/.claude/secrets/`. Documented in `pd-ops` agent.
|
||||
|
||||
### Local SQLite for Initiative Tracking
|
||||
Chose a local SQLite database over Gitea project boards (no API in Gitea 1.22.6), n8n datatables (external dependency), or structured JSON (merge conflict risk). SQLite gives full schema control, instant queries, no external deps, and agent-friendly access.
|
||||
|
||||
### dev-storage is Docker Volume, Not Archive
|
||||
`dev-storage/` is a mapped volume for local Docker containers — do not use for archival. Use `.archive/` instead.
|
||||
|
||||
## Follow-Up
|
||||
|
||||
| Item | Priority | Notes |
|
||||
|---|---|---|
|
||||
| Card-creation: 11 PRs need Cal's review | High | Logic/balance changes: #36, #34, #33, #41, #46, #49, #48, #47, #30, #31, #32 |
|
||||
| Discord #87: Refractor rename before merge | High | Rename cog, command group, tier badges |
|
||||
| Discord #88: Tier badge approach rethink | Medium | Comment left — reconsider API call per embed |
|
||||
| Discord #73: Ranked mode merge | Medium | Hold until after cleanup, needs rebase |
|
||||
| Discord #108: Ruff pre-commit resolution | Medium | Options documented in issue |
|
||||
| Database #122-124: SQLite cleanup | Medium | 3 issues created |
|
||||
| Database #84, #99, #96: Rebase onto correct branches | Low | Evolution migration + next-release PRs |
|
||||
| Delete `next-release` branch in Discord repo | Low | Pattern retired |
|
||||
| Rotate secrets exposed in git history | High | PR #29 (card-creation) removed hardcoded PD API token and Supabase JWT — both need rotation |
|
||||
@ -1,61 +0,0 @@
|
||||
---
|
||||
title: "PR Review Pipeline + Refractor Rename + CI/CD Modernization"
|
||||
description: "Reviewed 12 PRs across all PD repos, completed Refractor rename on Discord PRs, converted CI to tag-based builds, built /release skill, migrated pd-ops agent."
|
||||
type: context
|
||||
domain: paper-dynasty
|
||||
tags: [paper-dynasty, deployment, discord, database, card-creation, ci-cd, refractor]
|
||||
---
|
||||
|
||||
# PR Review Pipeline + Refractor Rename + CI/CD Modernization
|
||||
|
||||
**Date:** 2026-03-23
|
||||
**Repos:** paper-dynasty-umbrella, paper-dynasty-database, paper-dynasty-discord, paper-dynasty-card-creation
|
||||
|
||||
## What Was Done
|
||||
|
||||
Major cross-repo operations session covering PR review, feature work, and infrastructure modernization.
|
||||
|
||||
1. **Reviewed 12 PRs across all repos** — Ran pd-ops GO/NO-GO assessment on `ai-reviewed` PRs, then launched 11 parallel pr-reviewer agents against all `ai-reviewing` PRs. Results: 4 approved, 7 changes requested, 1 do-not-merge.
|
||||
|
||||
2. **Merged 3 clean PRs** — card-creation #30 (pytest.mark.asyncio), card-creation #41 (timeout fix), database #127 (SQLite refs cleanup). Closed database #99 as superseded by #89.
|
||||
|
||||
3. **Completed Refractor rename on Discord PRs #87 and #88** — Renamed Evolution system to Refractor across cog files, test files, tier names, and badge labels. Multiple review-fix-push cycles to nail down the final tier names.
|
||||
|
||||
4. **Finalized Refractor tier names** — Base Card (T0) → Base Chrome (T1) → Refractor (T2) → Gold Refractor (T3) → Superfractor (T4). Badge labels: [BC], [R], [GR], [SF]. No badge for T0.
|
||||
|
||||
5. **Closed PR #84 and created fresh migration PR #128** — Old migration had schema drift (single player_season_stats vs split batting/pitching). Validated dev database has all evolution tables already applied; prod does not. New migration SQL generated from actual dev schema.
|
||||
|
||||
6. **Converted CI/CD to tag-based Docker builds** — Replaced branch/PR-triggered builds with tag-only triggers matching Major Domo pattern. Created PRs: database #129, discord #110. Removed calver/docker-tags/gitea-tag reusable actions. Added build cache to database workflow.
|
||||
|
||||
7. **Built `/release` skill** — Shell script at `.claude/skills/release/release.sh` that validates repo state, auto-increments CalVer from existing git tags, creates tag, and pushes to trigger CI. Supports database, discord, and card-creation services.
|
||||
|
||||
8. **Migrated pd-ops agent to project-level** — Moved from `~/.claude/agents/pd-ops.md` to `.claude/agents/pd-ops.md` in the umbrella repo. Updated with `/release` skill reference and new CI/CD conventions.
|
||||
|
||||
9. **Fixed card-creation #47** — Pushed url_get return type fix (dict, not aiohttp.ClientResponse). PR reviewer found the function body also needs updating — still returns raw response object.
|
||||
|
||||
## Decisions
|
||||
|
||||
### Refractor Tier Names (Final)
|
||||
Settled on baseball card collecting terminology. T3 and T4 were both "Superfractor" initially — Cal corrected to shift all tiers down by one so each tier has a unique name. T4 ("Superfractor") is the only fully-evolved state.
|
||||
|
||||
### Tag-Based CI Over Auto-Tag
|
||||
Chose fully manual tagging (Major Domo pattern) over auto-tag-on-merge. Auto-tagging adds noise — not every merge is release-worthy. The `/release` skill provides the convenience layer without the automation overhead.
|
||||
|
||||
### pd-ops Stays One Agent (No Separate pd-ci)
|
||||
CI/CD knowledge belongs in pd-ops, not a separate agent. Release coordination is a natural extension of pd-ops' merge and deploy responsibilities. A `/release` skill gives the action without splitting the persona.
|
||||
|
||||
### context7 MCP Not Worth Adding Yet
|
||||
Researched context7 for live library docs. FastAPI and aiohttp are well-indexed, but discord.py and Peewee (the two most hallucination-prone libraries) are not. Coverage too patchy to justify the overhead.
|
||||
|
||||
## Follow-Up
|
||||
|
||||
| Item | Repo | Priority |
|
||||
|---|---|---|
|
||||
| Merge CI PRs #129 and #110 | database, discord | High |
|
||||
| Final review pass on Discord #87 and #88 | discord | High |
|
||||
| Fix card-creation #47 (url_get body + hardcoded token) | card-creation | Medium |
|
||||
| Address database #128 reviewer feedback (card_type uniqueness, rollback section) | database | Medium |
|
||||
| Fix database #125 (skill scripts pagination + false assertions) | database | Medium |
|
||||
| Fix database #126 (compose.production.yml + docs cleanup) | database | Low |
|
||||
| Rotate exposed secrets (card-creation #50, database #9) | card-creation, database | High |
|
||||
| Run evolution migration on prod when ready | database | Medium |
|
||||
@ -1,62 +0,0 @@
|
||||
---
|
||||
title: "Refractor Dev Deploy + Priority Council"
|
||||
description: "Deployed Refractor rename to dev (DB + Discord), created test plan, built council-meeting skill, ran first PO priority council."
|
||||
type: context
|
||||
domain: paper-dynasty
|
||||
tags: [paper-dynasty, database, discord, deployment, refractor]
|
||||
---
|
||||
|
||||
# Refractor Dev Deploy + Priority Council
|
||||
|
||||
**Date:** 2026-03-24
|
||||
**Repos:** paper-dynasty (orchestration), paper-dynasty-database, paper-dynasty-discord
|
||||
|
||||
## What Was Done
|
||||
|
||||
1. **Refractor dev deployment unblocked** — Discovered the DB API `dev` tag was behind the rename PR #131. pd-ops reconciled the DB repo (local main diverged from origin), merged CI catchup as PR #153, and retagged `dev` at `6a217f9`.
|
||||
|
||||
2. **Discord PR #114 merged** — pd-ops rebased the `evolution/cards` -> `refractor/cards` endpoint rename PR onto main, reviewed, merged, and retagged Discord `dev` at `55efdb3`.
|
||||
|
||||
3. **Refractor test plan created** — Comprehensive 5-phase manual test plan covering smoke tests, `/refractor status` filters, post-game evaluation, card embed badges, and error resilience. Saved as `discord-app/tests/refractor-preflight.sh` (scriptable pre-flight checks) and documented in plan file.
|
||||
|
||||
4. **Priority council conducted** — Spawned pd-database, pd-discord, and pd-cards as a team to deliberate on next priorities. Council converged on:
|
||||
- Merge pagination PRs #150-#152 (quick wins)
|
||||
- HTTPException fix (#4) — correctness bug affecting all API consumers
|
||||
- Pack-opening perf cluster (#98-#101) — event loop blocking bug
|
||||
- SQLite removal PR #126 + ranked mode PR #73
|
||||
- Content pipeline automation (#11)
|
||||
|
||||
5. **`/council-meeting` skill created** — Repeatable multi-round deliberation skill for PO agents. Supports 3-5 rounds of propose/debate/vote with pd-database, pd-discord, pd-cards, and pd-growth (Opus model). Output is a ranked recommendation.
|
||||
|
||||
6. **Pagination PR #154 opened** — scout_opportunities endpoint, created by pd-ops during repo work.
|
||||
|
||||
## Decisions
|
||||
|
||||
### No backward compatibility for evolution endpoints
|
||||
The DB API rename (PR #131) removes `/evolution/` routes entirely — only `/refractor/` is served. The Discord bot's `helpers/main.py:118` still calls the old path, so card embed badges won't display until PR #114's changes are deployed. This was a deliberate choice — no backward-compat shims.
|
||||
|
||||
### Council meeting uses Opus for all PO agents
|
||||
Sonnet worked for the first run but Cal requested Opus for deeper reasoning in the multi-round deliberation format. The `/council-meeting` skill defaults to Opus for all 4 agents.
|
||||
|
||||
### pd-growth included in council by default
|
||||
The 4th strategic agent (pd-growth) is included alongside the 3 domain POs to provide retention/acquisition/engagement perspective during priority discussions.
|
||||
|
||||
## Follow-Up
|
||||
|
||||
| # | Title | Priority |
|
||||
|---|-------|----------|
|
||||
| — | Refractor manual testing in dev Discord server | High |
|
||||
| — | Verify Docker builds triggered for both repos (dev tags) | High |
|
||||
| #150 | [Merge notifications pagination PR](https://git.manticorum.com/cal/paper-dynasty-database/pulls/150) | Medium |
|
||||
| #151 | [Merge scout_claims pagination PR](https://git.manticorum.com/cal/paper-dynasty-database/pulls/151) | Medium |
|
||||
| #152 | [Merge rewards pagination PR](https://git.manticorum.com/cal/paper-dynasty-database/pulls/152) | Medium |
|
||||
| #154 | [Merge scout_opportunities pagination PR](https://git.manticorum.com/cal/paper-dynasty-database/pulls/154) | Medium |
|
||||
| #4 | HTTPException fix — needs new PR (old #47 closed) | High |
|
||||
| #98-#101 | Pack-opening perf cluster (Discord) | High |
|
||||
|
||||
## Files Changed
|
||||
|
||||
- `.claude/skills/council-meeting/SKILL.md` — new skill for repeatable PO deliberation
|
||||
- `discord-app/tests/refractor-preflight.sh` — scriptable pre-deploy verification checks
|
||||
- `.claude/plans/tidy-stirring-globe.md` — Refractor test plan
|
||||
- `.claude/command-permissions.json` — added `git tag*` project permission
|
||||
@ -1,63 +0,0 @@
|
||||
---
|
||||
title: "Refractor Phase 2: Integration — boost wiring, tests, and review"
|
||||
description: "Implemented apply_tier_boost orchestration, dry_run evaluator, evaluate-game wiring with kill switch, and 51 new tests across paper-dynasty-database. PRs #176 and #177 merged."
|
||||
type: context
|
||||
domain: paper-dynasty
|
||||
tags: [paper-dynasty-database, refractor, phase-2, testing]
|
||||
---
|
||||
|
||||
# Refractor Phase 2: Integration — boost wiring, tests, and review
|
||||
|
||||
**Date:** 2026-03-30
|
||||
**Branch:** `feature/refractor-phase2-integration` (merged to `main`)
|
||||
**Repo:** paper-dynasty-database
|
||||
|
||||
## What Was Done
|
||||
|
||||
Full implementation of Refractor Phase 2 Integration — wiring the Phase 2 Foundation boost functions (PR #176) into the live evaluate-game endpoint so that tier-ups actually create boosted variant cards with modified ratings.
|
||||
|
||||
1. **PR #176 merged (Foundation)** — Review findings fixed (renamed `evolution_tier` to `refractor_tier`, removed redundant parens), then merged via pd-ops
|
||||
2. **`evaluate_card(dry_run=True)`** — Added dry_run parameter to separate tier detection from tier write. `apply_tier_boost()` becomes the sole writer of `current_tier`, ensuring atomicity with variant creation. Added `computed_tier` and `computed_fully_evolved` to return dict.
|
||||
3. **`apply_tier_boost()` orchestration** — Full flow: source card lookup, boost application per vs_hand split, variant card + ratings creation with idempotency guards, audit record with idempotency guard, atomic state mutations via `db.atomic()`. Display stat helpers compute fresh avg/obp/slg.
|
||||
4. **`evaluate_game()` wiring** — Calls evaluate_card with dry_run=True, loops through intermediate tiers on tier-up, handles partial multi-tier failures (reports last successful tier), `REFRACTOR_BOOST_ENABLED` env var kill switch, suppresses false notifications when boost is disabled or card_type is missing.
|
||||
5. **79-sum documentation fix** — Clarified all references to "79-sum" across code, tests, and docs to note the 108-total card invariant (79 variable + 29 x-check for pitchers).
|
||||
6. **51 new tests** — Display stat unit tests (12), integration tests for orchestration (27), HTTP endpoint tests (7), dry_run evaluator tests (6). Total suite: 223 passed.
|
||||
7. **Five rounds of swarm reviews** — Each change reviewed individually by swarm-reviewer agents. All findings addressed: false notification on null card_type, wrong tier in log message, partial multi-tier failure reporting, atomicity test accuracy, audit idempotency gap, import os placement.
|
||||
8. **PR #177 merged** — Review found two issues (import os inside function, audit idempotency gap on PostgreSQL UNIQUE constraint). Both fixed, pushed, approved by Claude, merged via pd-ops.
|
||||
|
||||
## Decisions
|
||||
|
||||
### Display stats computed fresh, not set to None
|
||||
The original PO review note suggested setting avg/obp/slg to None on variant cards and deferring recalculation. Cal decided to compute them fresh using the exact Pydantic validator formulas instead — strictly better than stale or missing values. Design doc updated to reflect this.
|
||||
|
||||
### Card/ratings creation outside db.atomic()
|
||||
The design doc specified all writes inside `db.atomic()`. Implementation splits card/ratings creation outside (idempotent, retry-safe via get_or_none guards) with only state mutations (audit, tier write, Card.variant propagation) inside the atomic block. This is pragmatically correct — on retry, existing card/ratings are reused. Design doc updated.
|
||||
|
||||
### Kill switch suppresses notifications entirely
|
||||
When `REFRACTOR_BOOST_ENABLED=false`, the router skips both the boost AND the tier_up notification (via `continue`). This prevents false notifications to the Discord bot during maintenance windows. Initially the code fell through and emitted a notification without a variant — caught during coverage gap analysis and fixed.
|
||||
|
||||
### Audit idempotency guard added
|
||||
PR review identified that `RefractorBoostAudit` has a `UNIQUE(card_state_id, tier)` constraint in PostgreSQL (from the migration) that the SQLite test DB doesn't enforce. Added `get_or_none` before `create` to prevent IntegrityError on retry.
|
||||
|
||||
## Follow-Up
|
||||
|
||||
- Phase 3: Documentation updates in `card-creation` repo (docs only, no code)
|
||||
- Phase 4a: Validation test cases in `database` repo
|
||||
- Phase 4b: Discord bot tier-up notification fix (must ship alongside or after Phase 2 deploy)
|
||||
- Deploy Phase 2 to dev: run migration `2026-03-28_refractor_phase2_boost.sql` on dev DB
|
||||
- Stale branches to clean up in database repo: `feat/evolution-refractor-schema-migration`, `test/refractor-tier3`
|
||||
|
||||
## Files Changed
|
||||
|
||||
**paper-dynasty-database:**
|
||||
- `app/services/refractor_boost.py` — apply_tier_boost orchestration, display stat helpers, card_type validation, audit idempotency guard
|
||||
- `app/services/refractor_evaluator.py` — dry_run parameter, computed_tier/computed_fully_evolved in return dict
|
||||
- `app/routers_v2/refractor.py` — evaluate_game wiring, kill switch, partial multi-tier failure, isoformat crash fix
|
||||
- `tests/test_refractor_boost.py` — 12 new display stat tests, 79-sum comment fixes
|
||||
- `tests/test_refractor_boost_integration.py` — 27 new integration tests (new file)
|
||||
- `tests/test_postgame_refractor.py` — 7 new HTTP endpoint tests
|
||||
- `tests/test_refractor_evaluator.py` — 6 new dry_run unit tests
|
||||
|
||||
**paper-dynasty (parent repo):**
|
||||
- `docs/refractor-phase2/01-phase1-foundation.md` — 79-sum clarifications
|
||||
- `docs/refractor-phase2/02-phase2-integration.md` — atomicity boundary, display stats updates
|
||||
@ -1,192 +0,0 @@
|
||||
---
|
||||
title: "Paper Dynasty Database API — Deployment Guide"
|
||||
description: "Complete deployment guide for the PD Database API covering dev and prod release flows, CI/CD pipeline, Docker tag strategy, rollback procedures, and common gotchas."
|
||||
type: guide
|
||||
domain: paper-dynasty
|
||||
tags: [paper-dynasty, deployment, docker, gitea, ci-cd, database, release]
|
||||
---
|
||||
|
||||
# Paper Dynasty Database API — Deployment Guide
|
||||
|
||||
## Overview
|
||||
|
||||
The Database API (`cal/paper-dynasty-database`) uses a tag-driven CI/CD pipeline via Gitea Actions. Pushing a git tag triggers a Docker image build and push to Docker Hub. There are two deployment tracks:
|
||||
|
||||
| Track | Git Tag | Docker Tags | Environment | URL |
|
||||
|---|---|---|---|---|
|
||||
| **Production** | CalVer (e.g., `2026.3.6`) | `:2026.3.6` + `:production` | Prod | `pd.manticorum.com` |
|
||||
| **Dev** | `dev` (force-updated) | `:dev` | Dev | `pddev.manticorum.com` |
|
||||
|
||||
## Release Commands
|
||||
|
||||
### Dev Deploy
|
||||
|
||||
```bash
|
||||
# Via release script (preferred)
|
||||
bash /mnt/NV2/Development/paper-dynasty/.claude/skills/release/release.sh database dev
|
||||
|
||||
# Manual
|
||||
cd /mnt/NV2/Development/paper-dynasty/database
|
||||
git checkout main && git pull --ff-only origin main
|
||||
git tag -f dev && git push origin dev --force
|
||||
```
|
||||
|
||||
The `dev` tag is **force-updated** every time — it always points to the latest commit you want to test. This is safe because the dev environment is not customer-facing.
|
||||
|
||||
### Production Deploy
|
||||
|
||||
```bash
|
||||
# Auto-increment version (preferred)
|
||||
bash /mnt/NV2/Development/paper-dynasty/.claude/skills/release/release.sh database
|
||||
|
||||
# Explicit version
|
||||
bash /mnt/NV2/Development/paper-dynasty/.claude/skills/release/release.sh database 2026.3.7
|
||||
```
|
||||
|
||||
CalVer format: `YYYY.M.BUILD` (e.g., `2026.3.6`). The script auto-increments BUILD if omitted.
|
||||
|
||||
## CI/CD Pipeline
|
||||
|
||||
**Workflow file:** `.gitea/workflows/build.yml`
|
||||
|
||||
**Trigger:** Push of tags matching `20*` (CalVer) or `dev`
|
||||
|
||||
**Steps:**
|
||||
1. Checkout code with full history
|
||||
2. Set up Docker Buildx with layer caching
|
||||
3. Login to Docker Hub (`manticorum67`)
|
||||
4. Build and push image with appropriate tags
|
||||
5. Rotate build cache
|
||||
6. Send Discord notification (success/failure)
|
||||
|
||||
**Docker image:** `manticorum67/paper-dynasty-database`
|
||||
|
||||
**Build cache:** Persistent volume `pd-buildx-cache` on the Gitea runner — significantly speeds up rebuilds since only changed layers are rebuilt.
|
||||
|
||||
## Post-Build: Pulling the New Image
|
||||
|
||||
CI builds and pushes the image, but **does not auto-deploy**. You must pull and restart the container on the target host.
|
||||
|
||||
### Dev Environment
|
||||
|
||||
```bash
|
||||
ssh pd-database "cd /home/cal/container-data/dev-pd-database && docker compose pull && docker compose up -d"
|
||||
```
|
||||
|
||||
- Host: `pd-database` (SSH alias)
|
||||
- Container data: `/home/cal/container-data/dev-pd-database/`
|
||||
- Container: `dev_pd_database`
|
||||
- Port: 813 (mapped to internal 8000)
|
||||
- URL: `pddev.manticorum.com`
|
||||
|
||||
### Production Environment
|
||||
|
||||
```bash
|
||||
ssh akamai "cd /root/container-data/paper-dynasty && docker compose pull && docker compose up -d"
|
||||
```
|
||||
|
||||
- Host: `akamai` (SSH alias)
|
||||
- Container: `pd_api`
|
||||
- Port: 815 (mapped to internal 8000)
|
||||
- URL: `pd.manticorum.com`
|
||||
|
||||
## Verification
|
||||
|
||||
### Quick Curl Test
|
||||
|
||||
```bash
|
||||
# Dev
|
||||
curl -s https://pddev.manticorum.com/api/v2/awards?limit=2 | python3 -m json.tool
|
||||
|
||||
# Prod
|
||||
curl -s https://pd.manticorum.com/api/v2/awards?limit=2 | python3 -m json.tool
|
||||
```
|
||||
|
||||
### Smoke Test
|
||||
|
||||
Use the `/smoke-test` skill:
|
||||
```
|
||||
/smoke-test dev
|
||||
/smoke-test prod
|
||||
```
|
||||
|
||||
### Check Running Version
|
||||
|
||||
```bash
|
||||
# Dev — check container image digest
|
||||
ssh pd-database "docker inspect dev_pd_database --format '{{.Image}}'"
|
||||
|
||||
# Prod
|
||||
ssh akamai "docker inspect pd_api --format '{{.Image}}'"
|
||||
```
|
||||
|
||||
## Rollback
|
||||
|
||||
### Dev
|
||||
|
||||
Just re-tag `dev` to a known-good commit and force-push:
|
||||
|
||||
```bash
|
||||
git tag -f dev <good-commit-sha>
|
||||
git push origin dev --force
|
||||
# Then pull on dev host
|
||||
ssh pd-database "cd /home/cal/container-data/paper-dynasty-database && docker compose pull && docker compose up -d"
|
||||
```
|
||||
|
||||
### Production
|
||||
|
||||
Production images are tagged with both the CalVer version and `:production`. To roll back:
|
||||
|
||||
```bash
|
||||
# On the prod host, pull a specific older version
|
||||
ssh akamai "docker pull manticorum67/paper-dynasty-database:2026.3.5"
|
||||
# Update compose to pin to that version, or just retag
|
||||
ssh akamai "cd /root/container-data/paper-dynasty && docker compose up -d"
|
||||
```
|
||||
|
||||
## Common Gotchas
|
||||
|
||||
### CalVer tag does NOT deploy to dev
|
||||
|
||||
CalVer tags (e.g., `2026.3.6`) only build `:version` + `:production` Docker tags. The dev environment pulls `:dev`. You **must** push the `dev` git tag separately to deploy to dev.
|
||||
|
||||
### CI build takes ~2-3 minutes
|
||||
|
||||
After pushing a tag, wait for CI to complete before pulling on the host. Check the Gitea Actions tab or wait for the Discord notification.
|
||||
|
||||
### "dev" tag requires force-push
|
||||
|
||||
Since `dev` is reused, you must use `git tag -f dev` and `git push origin dev --force`. The release script handles this automatically.
|
||||
|
||||
### Branch protection doesn't affect tags
|
||||
|
||||
Tags can be pushed by anyone with write access — no PR or review required. The review happens before merging to `main`; tagging is the release step.
|
||||
|
||||
### scout_opportunities.py was base64-encoded
|
||||
|
||||
As of 2026-03-24, this file was stored as raw base64 in git. The `total-count-pagination` branch rewrote it to proper Python source. If you see base64 content in this file on older branches, that's expected — it's a legacy artifact.
|
||||
|
||||
### Multiple PRs merging concurrently causes rebase races
|
||||
|
||||
When merging many PRs at once (e.g., batch pagination PRs), branch protection rules (`block_on_outdated_branch` + `dismiss_stale_approvals`) cause a merge storm. Each rebase invalidates the approval, requiring re-approve + re-merge. pd-ops agents handle this automatically but may take several attempts.
|
||||
|
||||
## Environment Variables
|
||||
|
||||
| Var | Purpose |
|
||||
|---|---|
|
||||
| `API_TOKEN` | Bearer token for authenticated endpoints |
|
||||
| `LOG_LEVEL` | Logging verbosity (default: INFO) |
|
||||
| `DATABASE_TYPE` | `postgresql` |
|
||||
| `POSTGRES_HOST` | Container name of PostgreSQL |
|
||||
| `POSTGRES_DB` | Database name (`pd_master`) |
|
||||
| `POSTGRES_USER` | DB username |
|
||||
| `POSTGRES_PASSWORD` | DB password |
|
||||
|
||||
## Topology Quick Reference
|
||||
|
||||
| Component | Host | Container | Port |
|
||||
|---|---|---|---|
|
||||
| Database API (prod) | `ssh akamai` | `pd_api` | 815 |
|
||||
| Database API (dev) | `ssh pd-database` | `dev_pd_database` | 813 |
|
||||
| PostgreSQL (prod) | `ssh akamai` | `pd_postgres` | 5432 |
|
||||
| PostgreSQL (dev) | `ssh pd-database` | `pd_postgres` | 5432 |
|
||||
@ -1,118 +0,0 @@
|
||||
---
|
||||
title: "Card Evolution Phase 1 — Implementation Log"
|
||||
description: "Full implementation log for Card Evolution Phase 1 (schema, API, formula engine, bot integration) across paper-dynasty-database and paper-dynasty-discord repos. Includes architecture decisions, bug fixes found in review, and first smoke test results."
|
||||
type: context
|
||||
domain: paper-dynasty
|
||||
tags: [paper-dynasty, evolution, deployment, architecture, testing]
|
||||
---
|
||||
|
||||
# Card Evolution Phase 1 — Implementation Log
|
||||
|
||||
**Date:** 2026-03-18 through 2026-03-19
|
||||
**Repos:** paper-dynasty-database (card-evolution branch), paper-dynasty-discord (main/next-release)
|
||||
**PRD:** `docs/prd-evolution/` in card-creation repo
|
||||
**Plan:** `docs/prd-evolution/PHASE1_PROJECT_PLAN.md` v2.2
|
||||
|
||||
## Overview
|
||||
|
||||
Phase 1 delivers the structural foundation for the Card Evolution system. Every card gets an evolution state tracking progress toward 4 tiers via simple formulas (batters: `PA + TB*2`, pitchers: `IP + K`). No rating boosts are applied — tiers are tracked but boosts are deferred to Phase 2.
|
||||
|
||||
## Architecture
|
||||
|
||||
### Single Metric Per Track (Key Design Decision)
|
||||
|
||||
Each track uses one cumulative formula. Progress is computed from career totals (SUM across all season stats rows for a player-team pair) and compared against four tier thresholds stored on the track itself. No separate milestone rows — thresholds ARE the milestones.
|
||||
|
||||
| Track | Formula | T1 | T2 | T3 | T4 |
|-------|---------|----|----|----|----|
| Batter | PA + (TB x 2) | 37 | 149 | 448 | 896 |
| Starting Pitcher | IP + K | 10 | 40 | 120 | 240 |
| Relief Pitcher | IP + K | 3 | 12 | 35 | 70 |
|
||||
|
||||
### Data Flow
|
||||
|
||||
1. **Game completes** -> bot calls `POST /season-stats/update-game/{game_id}`
|
||||
2. Season stats upserted into `batting_season_stats` / `pitching_season_stats`
|
||||
3. Bot calls `POST /evolution/evaluate-game/{game_id}`
|
||||
4. For each player in the game with an `evolution_card_state`, career totals are summed, formula applied, tier checked
|
||||
5. Tier-ups returned to bot -> Discord notification embeds sent
|
||||
|
||||
### Tables Created
|
||||
|
||||
- `batting_season_stats` — per-player per-team per-season batting totals
|
||||
- `pitching_season_stats` — per-player per-team per-season pitching totals
|
||||
- `evolution_track` — 3 tracks with formulas and thresholds
|
||||
- `evolution_card_state` — per-player per-team evolution progress (tier, value, fully_evolved)
|
||||
- `evolution_tier_boost` — Phase 2 stub for stat boosts
|
||||
- `evolution_cosmetic` — Phase 2 stub for visual unlocks
|
||||
- `processed_game` — idempotency ledger for update_season_stats()
|
||||
|
||||
## Sub-Phases Completed
|
||||
|
||||
### Phase 1a — Schema & Data Foundation (PR #104)
|
||||
- WP-01: Evolution Peewee models
|
||||
- WP-02: PlayerSeasonStats model (BattingSeasonStats + PitchingSeasonStats)
|
||||
- WP-03: Track seed data (JSON + idempotent seed function)
|
||||
- WP-04: SQL migration (7 new tables + card.variant, battingcard/pitchingcard.image_url)
|
||||
- WP-05: update_season_stats(game_id) service with dual-backend upsert
|
||||
|
||||
### Phase 1b — API & Evaluation Engine (PRs #98, #106, #107, #108)
|
||||
- WP-06: Track Catalog API (GET /v2/evolution/tracks)
|
||||
- WP-07: Card State API (GET /v2/evolution/cards/{card_id}, GET /v2/teams/{team_id}/evolutions)
|
||||
- WP-08: Evaluate Endpoint (POST /v2/evolution/cards/{card_id}/evaluate)
|
||||
- WP-09: Formula Engine (compute_batter_value, compute_pitcher_value, tier_from_value)
|
||||
- WP-10: Pack Opening Hook (evolution_card_state init on card acquisition)
|
||||
- ProcessedGame Ledger (#105) — emerged from Phase 1a review
|
||||
|
||||
### Phase 1c — Bot Integration (PRs #91-94 discord, #109 database)
|
||||
- WP-11: /evo status slash command with progress bars
|
||||
- WP-12: Tier badge on card embeds ([T1]/[T2]/[T3]/[EVO])
|
||||
- WP-13: Post-game callback (bot hook + DB endpoints)
|
||||
- WP-14: Tier completion notification embeds
|
||||
|
||||
## Bugs Found in Review
|
||||
|
||||
### Phase 1b Review Fixes
|
||||
1. **stats.strikeouts vs stats.k** — Formula engine Protocol used `strikeouts` but evaluator's `_CareerTotals` exposed `k`. Runtime AttributeError on any pitcher evaluation.
|
||||
2. **track.t1 vs track.t1_threshold** — Formula engine read `track.t1` but DB model defines `t1_threshold`. Runtime crash on tier evaluation.
|
||||
3. **fully_evolved logic** — Was derived from `new_tier` instead of post-max `current_tier`, could produce contradictory state (tier=2 but fully_evolved=True after regression guard).
|
||||
4. **Missing pitcher_id=None guard** — `_build_pitching_groups` didn't filter null pitcher IDs, would crash on NOT NULL FK constraint.
|
||||
5. **Missing pg_conn fixture** — Test conftest.py missing the PostgreSQL connection fixture for integration tests.
|
||||
|
||||
### Phase 1c Review Fixes
|
||||
1. **Missing @pytest.mark.asyncio decorators** — 9 async test methods silently didn't run.
|
||||
2. **WP-14 files leaked into WP-13 PR** — Worktree agent picked up untracked files from another branch.
|
||||
3. **Unused Optional import** in evolution_notifs.py.
|
||||
|
||||
## First Smoke Test (2026-03-19)
|
||||
|
||||
### Environment
|
||||
- **Database API:** pddev.manticorum.com, image `manticorum67/paper-dynasty-database:next-release`
|
||||
- **Discord Bot:** Local Docker, image `manticorum67/paper-dynasty-discordapp:next-release`
|
||||
- **Database:** PostgreSQL on pd-database host, database `paperdynasty_dev`
|
||||
|
||||
### Steps Taken
|
||||
1. Ran SQL migration (evolution tables + processed_game) -> all tables created
|
||||
2. Seeded 3 evolution tracks -> verified via GET /v2/evolution/tracks (200 OK, 3 items)
|
||||
3. Seeded 2753 evolution_card_state rows for team 31 (Normal CornBelters)
|
||||
4. Called `POST /season-stats/update-game/1517` -> `{"updated": 27, "skipped": false}`
|
||||
5. Called `POST /evolution/evaluate-game/1517` -> `{"evaluated": 0, "tier_ups": []}`
|
||||
|
||||
### Issue Found
|
||||
The evaluate-game endpoint returned `evaluated: 0` despite states existing. Root cause: the deployed evaluator imports `PlayerSeasonStats` from `db_engine`, but the actual model names are `BattingSeasonStats` and `PitchingSeasonStats`. This is a naming mismatch between the WP-13 agent's evaluator and the Phase 1a models. The `except Exception` in the evaluate loop silently swallows the `ImportError`.
|
||||
|
||||
### Architectural Concern Identified
|
||||
The incremental delta upsert approach for season stats is fragile:
|
||||
- Partial processing corrupts stats
|
||||
- Upsert bugs compound over time
|
||||
- No self-healing mechanism
|
||||
|
||||
**Proposed fix:** Replace delta upserts with full recalculation (SBA-style materialized view pattern). After each game, recalculate full season stats by `SELECT SUM(...)` from stratplay across all games that season. Always correct, idempotent by nature.
|
||||
|
||||
## Operational Notes
|
||||
|
||||
- **Docker tag mapping:** `next-release` branch -> `:next-release` and `:rc` tags on Docker Hub
|
||||
- **Discord repo branch protection:** Empty approvals whitelist means API merges fail. Use Claude Gitea token at `~/.claude/secrets/gitea_claude_token` for approvals, merge via UI.
|
||||
- **Discord repo ruff:** `helpers/main.py` has 2300+ pre-existing violations. Commits need `--no-verify`.
|
||||
- **Dev database:** Migrations must be run manually via `docker exec -i sba_postgres psql` on pd-database host.
|
||||
- **Production bot on pd-bots:** Uses `:latest` tag — do NOT update without explicit approval.
|
||||
@ -1,48 +0,0 @@
|
||||
---
|
||||
title: "Fix: /open-packs crash from orphaned Check-In Player packs"
|
||||
description: "Check-In Player packs with hyphenated name caused empty Discord select menu (400 Bad Request) and KeyError in callback."
|
||||
type: troubleshooting
|
||||
domain: paper-dynasty
|
||||
tags: [troubleshooting, discord, paper-dynasty, packs, hotfix]
|
||||
---
|
||||
|
||||
# Fix: /open-packs crash from orphaned Check-In Player packs
|
||||
|
||||
**Date:** 2026-03-26
|
||||
**PR:** #134 (hotfix branch based on prod tag 2026.3.4, merged to main)
|
||||
**Tag:** 2026.3.8
|
||||
**Severity:** High --- any user with an orphaned Check-In Player pack could not open any packs at all
|
||||
|
||||
## Problem
|
||||
|
||||
Running `/open-packs` returned: `HTTPException: 400 Bad Request (error code: 50035): Invalid Form Body --- In data.components.0.components.0.options: This field is required`
|
||||
|
||||
Discord rejected the message because the select menu had zero options.
|
||||
|
||||
## Root Cause
|
||||
|
||||
Two cascading bugs triggered by the "Check-In Player" pack type name containing a hyphen:
|
||||
|
||||
1. **Empty select menu:** The `pretty_name` logic used `'-' not in key` to identify bare pack type names. "Check-In Player" contains a hyphen, so it fell into the `elif 'Team' in key` / `elif 'Cardset' in key` chain --- matching neither. `pretty_name` stayed `None`, no `SelectOption` was created, and Discord rejected the empty options list.
|
||||
|
||||
2. **KeyError in callback (secondary):** Even if displayed, selecting "Check-In Player" would call `self.values[0].split('-')` producing `['Check', 'In Player']`, which matched none of the pack type tokens in the `if/elif` chain, raising `KeyError`.
|
||||
|
||||
Check-In Player packs are normally auto-opened during the daily check-in (`/comeonmanineedthis`). An orphaned pack existed because `roll_for_cards` had previously failed mid-flow, leaving an unopened pack in inventory.
|
||||
|
||||
## Fix
|
||||
|
||||
Three-layer fix applied to both `cogs/economy.py` (production) and `cogs/economy_new/packs.py` (main):
|
||||
|
||||
1. **Filter at source:** Added `AUTO_OPEN_TYPES = {"Check-In Player"}` set. Packs with these types are skipped during grouping with `continue`, so they never reach the select menu.
|
||||
|
||||
2. **Fallback for hyphenated names:** Added `else: pretty_name = key` after the `Team`/`Cardset` checks, so any future hyphenated pack type names still get a display label.
|
||||
|
||||
3. **Graceful error in callback:** Replaced `raise KeyError` with a user-facing ephemeral message ("This pack type cannot be opened manually. Please contact Cal.") and `return`.
|
||||
|
||||
Also changed all "contact an admin" strings to "contact Cal" in `discord_ui/selectors.py`.
|
||||
|
||||
## Lessons
|
||||
|
||||
- **Production loads `cogs/economy.py`, not `cogs/economy_new/packs.py`.** The initial fix was applied to the wrong file. Always check which cogs are actually loaded by inspecting the bot startup logs (`Loaded cog: ...`) before assuming which file handles a command.
|
||||
- **Hotfix branches based on old tags may have stale CI workflows.** The `docker-build.yml` at the tagged commit had an older trigger config (branch push, not tag push), so the CalVer tag silently failed to trigger CI. Cherry-pick the current workflow into hotfix branches.
|
||||
- **Pack type names are used as dict keys and split on hyphens** throughout the open-packs flow. Any new pack type with a hyphen in its name will hit similar issues unless the grouping/parsing logic is refactored to stop using hyphen-delimited strings as composite keys.
|
||||
@ -1,57 +0,0 @@
|
||||
---
|
||||
title: "pd-plan CLI Release — 1.0.0"
|
||||
description: "Initial release of the Paper Dynasty initiative tracker — a local SQLite CLI for cross-project priority management."
|
||||
type: reference
|
||||
domain: paper-dynasty
|
||||
tags: [release-notes, pd-plan, infrastructure, paper-dynasty]
|
||||
---
|
||||
|
||||
# pd-plan CLI Release — 1.0.0
|
||||
|
||||
**Date:** 2026-03-23
|
||||
**Version:** 1.0.0
|
||||
**Install location:** `~/.claude/skills/paper-dynasty/plan/`
|
||||
**Database:** `~/.claude/skills/paper-dynasty/plan/initiatives.db`
|
||||
**Install method:** `pip install -e . --user` (editable install, `pd-plan` available system-wide)
|
||||
|
||||
## Release Summary
|
||||
|
||||
Initial release of `pd-plan`, a zero-dependency SQLite-backed CLI for tracking Paper Dynasty cross-project initiatives. Designed for PO agents and session startup to quickly understand current priorities, track progress, and coordinate work across all PD repos.
|
||||
|
||||
## Features
|
||||
|
||||
### Commands
|
||||
- **`pd-plan summary`** — Dashboard view showing phase progress, top priorities, and recent activity. Intended for session startup.
|
||||
- **`pd-plan list`** — Tabular view of initiatives with filters: `--phase`, `--status`, `--impact`, `--repo`, `--owner`, `--all`
|
||||
- **`pd-plan next`** — Returns the highest-priority non-blocked initiative. Supports `--repo` and `--owner` filters.
|
||||
- **`pd-plan show [id]`** — Full details of an initiative including description, linked issues, and activity log.
|
||||
- **`pd-plan add [title]`** — Create a new initiative with `--phase`, `--priority`, `--impact`, `--size`, `--repos`, `--linked`, `--description`
|
||||
- **`pd-plan update [id]`** — Update fields: `--status`, `--priority`, `--owner`, `--blocked-by`, `--link`, `--note`
|
||||
- **`pd-plan done [id]`** — Shortcut for `--status done`
|
||||
- **`pd-plan seed`** — Populate database with initial initiatives from the ROADMAP (idempotent)
|
||||
|
||||
### Cross-cutting
|
||||
- `--json` flag on all commands for machine-readable output
|
||||
- `--actor` flag for attribution in the activity log (e.g., `--actor pd-ops`)
|
||||
- Auto-creates database on first run
|
||||
- Activity log tracks all status changes and notes with timestamps
|
||||
|
||||
### Schema
|
||||
- **`initiatives`** table: id, title, description, phase (1-3), status (backlog/active/in_progress/blocked/done), priority (1-100), impact (retention/acquisition/engagement), size (S/M/L/XL), repos, linked_issues, blocked_by, owner, notes, timestamps
|
||||
- **`activity_log`** table: initiative_id, action, old/new values, actor, timestamp
|
||||
|
||||
### Seed Data
|
||||
17 initiatives across 3 phases, sourced from `ROADMAP.md`:
|
||||
- Phase 1 (Foundation): 8 initiatives — Refractor system, backlog clearance, API stability, SQLite cleanup, ruff resolution
|
||||
- Phase 2 (Engagement): 5 initiatives — pack experience, leagues, content pipeline, social features, gauntlets
|
||||
- Phase 3 (Growth): 4 initiatives — website, web app, onboarding, analytics
|
||||
|
||||
## Integration
|
||||
|
||||
- Referenced in parent `CLAUDE.md` as session startup instruction
|
||||
- Documented in `/paper-dynasty` skill under "Initiative Tracker" section
|
||||
- All 5 PO/ops agents (`pd-database`, `pd-discord`, `pd-cards`, `pd-growth`, `pd-ops`) have `pd-plan` instructions in their agent definitions
|
||||
|
||||
## Dependencies
|
||||
|
||||
None — pure Python stdlib (sqlite3, argparse, textwrap, json, datetime).
|
||||
@ -1,62 +0,0 @@
|
||||
---
|
||||
title: "Codex-to-Claude Agent Converter & Plugin Marketplace"
|
||||
description: "Pipeline that converts VoltAgent/awesome-codex-subagents TOML definitions to Claude Code plugin marketplace format, hosted at cal/codex-agents on Gitea."
|
||||
type: reference
|
||||
domain: productivity
|
||||
tags: [claude-code, automation, plugins, agents, gitea]
|
||||
---
|
||||
|
||||
# Codex Agents Marketplace
|
||||
|
||||
## Overview
|
||||
|
||||
136+ specialized agent definitions converted from [VoltAgent/awesome-codex-subagents](https://github.com/VoltAgent/awesome-codex-subagents) (OpenAI Codex format) to Claude Code plugin marketplace format.
|
||||
|
||||
- **Repo**: `cal/codex-agents` on Gitea (`git@git.manticorum.com:cal/codex-agents.git`)
|
||||
- **Local path**: `/mnt/NV2/Development/codex-agents/`
|
||||
- **Upstream**: Cloned to `upstream/` (gitignored), pulled on each sync
|
||||
|
||||
## Sync Pipeline
|
||||
|
||||
```bash
cd /mnt/NV2/Development/codex-agents
./sync.sh            # pull upstream + convert changed agents
./sync.sh --force    # re-convert all regardless of hash
./sync.sh --dry-run  # preview only
./sync.sh --verbose  # per-agent status
```
|
||||
|
||||
- `convert.py` handles TOML → Markdown+YAML frontmatter conversion
|
||||
- SHA-256 per-file hashes in `codex-manifest.json` skip unchanged agents
|
||||
- Deleted upstream agents are auto-removed locally
|
||||
- `.claude-plugin/marketplace.json` is regenerated on each sync
|
||||
|
||||
## Format Mapping
|
||||
|
||||
| Codex | Claude Code |
|-------|------------|
| `gpt-5.4` + `high` | `model: opus` |
| `gpt-5.3-codex-spark` + `medium` | `model: sonnet` |
| `sandbox_mode: read-only` | `disallowedTools: Edit, Write` |
| `sandbox_mode: workspace-write` | full tool access |
| `developer_instructions` | markdown body |
| `"parent agent"` | replaced with `"orchestrating agent"` |
|
||||
|
||||
## Installing Agents
|
||||
|
||||
Add marketplace to `~/.claude/settings.json`:
|
||||
```json
|
||||
"extraKnownMarketplaces": {
|
||||
"codex-agents": { "source": { "source": "git", "url": "https://git.manticorum.com/cal/codex-agents.git" } }
|
||||
}
|
||||
```
|
||||
|
||||
Then:
|
||||
```bash
|
||||
claude plugin update codex-agents
|
||||
claude plugin install docker-expert@codex-agents --scope user
|
||||
```
|
||||
|
||||
## Agent Categories
|
||||
|
||||
10 categories: Core Development (12), Language Specialists (27), Infrastructure (16), Quality & Security (16), Data & AI (12), Developer Experience (13), Specialized Domains (12), Business & Product (11), Meta & Orchestration (10), Research & Analysis (7).
|
||||
@ -158,23 +158,6 @@ ls -t ~/.local/share/claude-scheduled/logs/backlog-triage/ | head -1
|
||||
~/.config/claude-scheduled/runner.sh backlog-triage
|
||||
```
|
||||
|
||||
## Session Resumption
|
||||
|
||||
Tasks can opt into session persistence for multi-step workflows:
|
||||
|
||||
```json
|
||||
{
|
||||
"session_resumable": true,
|
||||
"resume_last_session": true
|
||||
}
|
||||
```
|
||||
|
||||
When `session_resumable` is `true`, runner.sh saves the `session_id` to `$LOG_DIR/last_session_id` after each run. When `resume_last_session` is also `true`, the next run resumes that session with `--resume`.
|
||||
|
||||
Issue-poller and PR-reviewer capture `session_id` in logs and result JSON for manual follow-up.
|
||||
|
||||
See also: [Agent SDK Evaluation](agent-sdk-evaluation.md) for CLI vs SDK comparison.
|
||||
|
||||
## Cost Safety
|
||||
|
||||
- Per-task `max_budget_usd` cap — runner.sh detects `error_max_budget_usd` and warns
|
||||
|
||||
@ -1,175 +0,0 @@
|
||||
---
|
||||
title: "Agent SDK Evaluation — CLI vs Python/TypeScript SDK"
|
||||
description: "Comparison of Claude Code CLI invocation (claude -p) vs the native Agent SDK for programmatic use in the headless-claude and claude-scheduled systems."
|
||||
type: context
|
||||
domain: scheduled-tasks
|
||||
tags: [claude-code, sdk, agent-sdk, python, typescript, headless, automation, evaluation]
|
||||
---
|
||||
|
||||
# Agent SDK Evaluation: CLI vs Python/TypeScript SDK
|
||||
|
||||
**Date:** 2026-04-03
|
||||
**Status:** Evaluation complete — recommendation below
|
||||
**Related:** Issue #3 (headless-claude: Additional Agent SDK improvements)
|
||||
|
||||
## 1. Current Approach — CLI via `claude -p`
|
||||
|
||||
All headless Claude invocations use the CLI subprocess pattern:
|
||||
|
||||
```bash
|
||||
claude -p "<prompt>" \
|
||||
--model sonnet \
|
||||
--output-format json \
|
||||
--allowedTools "Read,Grep,Glob" \
|
||||
--append-system-prompt "..." \
|
||||
--max-budget-usd 2.00
|
||||
```
|
||||
|
||||
**Pros:**
|
||||
- Simple to invoke from any language (bash, n8n SSH nodes, systemd units)
|
||||
- Uses Claude Max OAuth — no API key needed, no per-token billing
|
||||
- Mature and battle-tested in our scheduled-tasks framework
|
||||
- CLAUDE.md and settings.json are loaded automatically
|
||||
- No runtime dependencies beyond the CLI binary
|
||||
|
||||
**Cons:**
|
||||
- Structured output requires parsing JSON from stdout
|
||||
- Error handling is exit-code-based with stderr parsing
|
||||
- No mid-stream observability (streaming requires JSONL parsing)
|
||||
- Tool approval is allowlist-only — no dynamic per-call decisions
|
||||
- Session resumption requires manual `--resume` flag plumbing
|
||||
|
||||
## 2. Python Agent SDK
|
||||
|
||||
**Package:** `claude-agent-sdk` (renamed from `claude-code`)
|
||||
**Install:** `pip install claude-agent-sdk`
|
||||
**Requires:** Python 3.10+, `ANTHROPIC_API_KEY` env var
|
||||
|
||||
```python
|
||||
from claude_agent_sdk import query, ClaudeAgentOptions
|
||||
|
||||
async for message in query(
|
||||
prompt="Diagnose server health",
|
||||
options=ClaudeAgentOptions(
|
||||
allowed_tools=["Read", "Grep", "Bash(python3 *)"],
|
||||
output_format={"type": "json_schema", "schema": {...}},
|
||||
max_budget_usd=2.00,
|
||||
),
|
||||
):
|
||||
if hasattr(message, "result"):
|
||||
print(message.result)
|
||||
```
|
||||
|
||||
**Key features:**
|
||||
- Async generator with typed `SDKMessage` objects (User, Assistant, Result, System)
|
||||
- `ClaudeSDKClient` for stateful multi-turn conversations
|
||||
- `can_use_tool` callback for dynamic per-call tool approval
|
||||
- In-process hooks (`PreToolUse`, `PostToolUse`, `Stop`, etc.)
|
||||
- `rewindFiles()` to restore filesystem to any prior message point
|
||||
- Typed exception hierarchy (`CLINotFoundError`, `ProcessError`, etc.)
|
||||
|
||||
**Limitation:** Shells out to the Claude Code CLI binary — it is NOT a pure HTTP client. The binary must be installed.
|
||||
|
||||
## 3. TypeScript Agent SDK
|
||||
|
||||
**Package:** `@anthropic-ai/claude-agent-sdk` (renamed from `@anthropic-ai/claude-code`)
|
||||
**Install:** `npm install @anthropic-ai/claude-agent-sdk`
|
||||
**Requires:** Node 18+, `ANTHROPIC_API_KEY` env var
|
||||
|
||||
```typescript
|
||||
import { query } from "@anthropic-ai/claude-agent-sdk";
|
||||
|
||||
for await (const message of query({
|
||||
prompt: "Diagnose server health",
|
||||
options: {
|
||||
allowedTools: ["Read", "Grep", "Bash(python3 *)"],
|
||||
maxBudgetUsd: 2.00,
|
||||
}
|
||||
})) {
|
||||
if ("result" in message) console.log(message.result);
|
||||
}
|
||||
```
|
||||
|
||||
**Key features (superset of Python):**
|
||||
- Same async generator pattern
|
||||
- `"auto"` permission mode (model classifier per tool call) — TS-only
|
||||
- `spawnClaudeCodeProcess` hook for remote/containerized execution
|
||||
- `setMcpServers()` for dynamic MCP server swapping mid-session
|
||||
- V2 preview: `send()` / `stream()` patterns for simpler multi-turn
|
||||
- Bundles the Claude Code binary — no separate install needed
|
||||
|
||||
## 4. Comparison Matrix
|
||||
|
||||
| Capability | `claude -p` CLI | Python SDK | TypeScript SDK |
|---|---|---|---|
| **Auth** | OAuth (Claude Max) | API key only | API key only |
| **Invocation** | Shell subprocess | Async generator | Async generator |
| **Structured output** | `--json-schema` flag | Schema in options | Schema in options |
| **Streaming** | JSONL parsing | Typed messages | Typed messages |
| **Tool approval** | `--allowedTools` only | `can_use_tool` callback | `canUseTool` callback + auto mode |
| **Session resume** | `--resume` flag | `resume: sessionId` | `resume: sessionId` |
| **Cost tracking** | Parse result JSON | `ResultMessage.total_cost_usd` | Same + per-model breakdown |
| **Error handling** | Exit codes + stderr | Typed exceptions | Typed exceptions |
| **Hooks** | External shell scripts | In-process callbacks | In-process callbacks |
| **Custom tools** | Not available | `tool()` decorator | `tool()` + Zod schemas |
| **Subagents** | Not programmatic | `agents` option | `agents` option |
| **File rewind** | Not available | `rewindFiles()` | `rewindFiles()` |
| **MCP servers** | `--mcp-config` file | Inline config object | Inline + dynamic swap |
| **CLAUDE.md loading** | Automatic | Must opt-in (`settingSources`) | Must opt-in |
| **Dependencies** | CLI binary | CLI binary + Python | Node 18+ (bundles CLI) |
|
||||
|
||||
## 5. Integration Paths
|
||||
|
||||
### A. n8n Code Nodes
|
||||
|
||||
The n8n Code node supports JavaScript (not TypeScript directly, but the SDK's JS output works). This would replace the current SSH → CLI pattern:
|
||||
|
||||
```
|
||||
Schedule Trigger → Code Node (JS, uses SDK) → IF → Discord
|
||||
```
|
||||
|
||||
**Trade-off:** Eliminates the SSH hop to CT 300, but requires `ANTHROPIC_API_KEY` and n8n to have the npm package installed. Current n8n runs in a Docker container on CT 210 — would need the SDK and CLI binary in the image.
|
||||
|
||||
### B. Standalone Python Scripts
|
||||
|
||||
Replace `claude -p` subprocess calls in custom dispatchers with the Python SDK:
|
||||
|
||||
```python
|
||||
# Instead of: subprocess.run(["claude", "-p", prompt, ...])
|
||||
async for msg in query(prompt=prompt, options=opts):
|
||||
...
|
||||
```
|
||||
|
||||
**Trade-off:** Richer error handling and streaming, but our dispatchers are bash scripts, not Python. Would require rewriting `runner.sh` and dispatchers in Python.
|
||||
|
||||
### C. Systemd-triggered Tasks (Current Architecture)
|
||||
|
||||
Keep systemd timers → bash scripts, but optionally invoke a thin Python wrapper that uses the SDK instead of `claude -p` directly.
|
||||
|
||||
**Trade-off:** Adds Python as a dependency for scheduled tasks that currently only need bash + the CLI binary. Marginal benefit unless we need hooks or dynamic tool approval.
|
||||
|
||||
## 6. Recommendation
|
||||
|
||||
**Stay with CLI invocation for now. Revisit the Python SDK when we need dynamic tool approval or in-process hooks.**
|
||||
|
||||
### Rationale
|
||||
|
||||
1. **Auth is the blocker.** The SDK requires `ANTHROPIC_API_KEY` (API billing). Our entire scheduled-tasks framework runs on Claude Max OAuth at zero marginal cost. Switching to the SDK means paying per-token for every scheduled task, issue-worker, and PR-reviewer invocation. This alone makes the SDK non-viable for our current architecture.
|
||||
|
||||
2. **The CLI covers our needs.** With `--append-system-prompt` (done), `--resume` (this PR), `--json-schema`, and `--allowedTools`, the CLI provides everything we currently need. Session resumption was the last missing piece.
|
||||
|
||||
3. **Bash scripts are the right abstraction.** Our runners are launched by systemd timers. Bash + CLI is the natural fit — no runtime dependencies, no async event loops, no package management.
|
||||
|
||||
### When to Revisit
|
||||
|
||||
- If Anthropic adds OAuth support to the SDK (eliminating the billing difference)
|
||||
- If we need dynamic tool approval (e.g., "allow this Bash command but deny that one" at runtime)
|
||||
- If we build a long-running Python service that orchestrates multiple Claude sessions (the `ClaudeSDKClient` stateful pattern would be valuable there)
|
||||
- If we move to n8n custom nodes written in TypeScript (the TS SDK bundles the CLI binary)
|
||||
|
||||
### Migration Path (If Needed Later)
|
||||
|
||||
1. Start with the Python SDK in a single task (e.g., `backlog-triage`) as a proof of concept
|
||||
2. Create a thin `sdk-runner.py` wrapper that reads the same `settings.json` and `prompt.md` files
|
||||
3. Swap the systemd unit's `ExecStart` from `runner.sh` to `sdk-runner.py`
|
||||
4. Expand to other tasks if the POC proves valuable
|
||||
@ -245,25 +245,11 @@ hosts:
|
||||
- sqlite-major-domo
|
||||
- temp-postgres
|
||||
|
||||
# Docker Home Servers VM (Proxmox) - decommission candidate
|
||||
# VM 116: Only Jellyfin remains after 2026-04-03 cleanup (watchstate removed — duplicate of manticore's canonical instance)
|
||||
# Jellyfin on manticore already covers this service. VM 116 + VM 110 are candidates to reclaim 8 vCPUs + 16 GB RAM.
|
||||
# See issue #31 for cleanup details.
|
||||
docker-home-servers:
|
||||
type: docker
|
||||
ip: 10.10.0.124
|
||||
vmid: 116
|
||||
user: cal
|
||||
description: "Legacy home servers VM — Jellyfin only, decommission candidate"
|
||||
config_paths:
|
||||
docker-compose: /home/cal/container-data
|
||||
services:
|
||||
- jellyfin # only remaining service; duplicate of ubuntu-manticore jellyfin
|
||||
decommission_candidate: true
|
||||
notes: "watchstate removed 2026-04-03 (duplicate of manticore); 3.36 GB images pruned; see issue #31"
|
||||
|
||||
# Decommissioned hosts (kept for reference)
|
||||
# decommissioned:
|
||||
# tdarr-old:
|
||||
# ip: 10.10.0.43
|
||||
# note: "Replaced by ubuntu-manticore tdarr"
|
||||
# docker-home:
|
||||
# ip: 10.10.0.124
|
||||
# note: "Decommissioned"
|
||||
|
||||
@ -1,246 +0,0 @@
|
||||
---
|
||||
title: "Proxmox Monthly Maintenance Reboot"
|
||||
description: "Runbook for the first-Sunday-of-the-month Proxmox host reboot — dependency-aware shutdown/startup order, validation checklist, and Ansible automation."
|
||||
type: runbook
|
||||
domain: server-configs
|
||||
tags: [proxmox, maintenance, reboot, ansible, operations, systemd]
|
||||
---
|
||||
|
||||
# Proxmox Monthly Maintenance Reboot
|
||||
|
||||
## Overview
|
||||
|
||||
| Detail | Value |
|
||||
|--------|-------|
|
||||
| **Schedule** | 1st Sunday of every month, 3:00 AM ET (08:00 UTC) |
|
||||
| **Expected downtime** | ~15 minutes (host reboot + VM/LXC startup) |
|
||||
| **Orchestration** | Ansible on LXC 304 — shutdown playbook → host reboot → post-reboot startup playbook |
|
||||
| **Calendar** | Google Calendar recurring event: "Proxmox Monthly Maintenance Reboot" |
|
||||
| **HA DNS** | ubuntu-manticore (10.10.0.226) provides Pi-hole 2 during Proxmox downtime |
|
||||
|
||||
## Why
|
||||
|
||||
- Kernel updates accumulate without reboot and never take effect
|
||||
- Long uptimes allow memory leaks and process state drift (e.g., avahi busy-loops)
|
||||
- Validates that all VMs/LXCs auto-start cleanly with `onboot: 1`
|
||||
|
||||
## Architecture
|
||||
|
||||
The reboot is split into two playbooks because LXC 304 (the Ansible controller) is itself a guest on the Proxmox host being rebooted:
|
||||
|
||||
1. **`monthly-reboot.yml`** — Snapshots all guests, shuts them down in dependency order, issues a fire-and-forget `reboot` to the Proxmox host, then exits. LXC 304 is killed when the host reboots.
|
||||
2. **`post-reboot-startup.yml`** — After the host reboots, LXC 304 auto-starts via `onboot: 1`. A systemd service (`ansible-post-reboot.service`) waits 120 seconds for the Proxmox API to stabilize, then starts all guests in dependency order with staggered delays.
|
||||
|
||||
The `onboot: 1` flag on all production guests acts as a safety net — even if the post-reboot playbook fails, Proxmox will start everything (though without controlled ordering).
|
||||
|
||||
## Prerequisites (Before Maintenance)
|
||||
|
||||
- [ ] Verify no active Tdarr transcodes on ubuntu-manticore
|
||||
- [ ] Verify no running database backups
|
||||
- [ ] Ensure workstation has Pi-hole 2 (10.10.0.226) as a fallback DNS server so it fails over automatically during downtime
|
||||
- [ ] Confirm ubuntu-manticore Pi-hole 2 is healthy: `ssh manticore "docker exec pihole pihole status"`
|
||||
|
||||
## `onboot` Audit
|
||||
|
||||
All production VMs and LXCs must have `onboot: 1` so they restart automatically as a safety net.
|
||||
|
||||
**Check VMs:**
|
||||
```bash
|
||||
ssh proxmox "for id in \$(qm list | awk 'NR>1{print \$1}'); do \
|
||||
name=\$(qm config \$id | grep '^name:' | awk '{print \$2}'); \
|
||||
onboot=\$(qm config \$id | grep '^onboot:'); \
|
||||
echo \"VM \$id (\$name): \${onboot:-onboot NOT SET}\"; \
|
||||
done"
|
||||
```
|
||||
|
||||
**Check LXCs:**
|
||||
```bash
|
||||
ssh proxmox "for id in \$(pct list | awk 'NR>1{print \$1}'); do \
|
||||
name=\$(pct config \$id | grep '^hostname:' | awk '{print \$2}'); \
|
||||
onboot=\$(pct config \$id | grep '^onboot:'); \
|
||||
echo \"LXC \$id (\$name): \${onboot:-onboot NOT SET}\"; \
|
||||
done"
|
||||
```
|
||||
|
||||
**Audit results (2026-04-03):**
|
||||
|
||||
| ID | Name | Type | `onboot` | Status |
|
||||
|----|------|------|----------|--------|
|
||||
| 106 | docker-home | VM | 1 | OK |
|
||||
| 109 | homeassistant | VM | 1 | OK (fixed 2026-04-03) |
|
||||
| 110 | discord-bots | VM | 1 | OK |
|
||||
| 112 | databases-bots | VM | 1 | OK |
|
||||
| 115 | docker-sba | VM | 1 | OK |
|
||||
| 116 | docker-home-servers | VM | 1 | OK |
|
||||
| 210 | docker-n8n-lxc | LXC | 1 | OK |
|
||||
| 221 | arr-stack | LXC | 1 | OK (fixed 2026-04-03) |
|
||||
| 222 | memos | LXC | 1 | OK |
|
||||
| 223 | foundry-lxc | LXC | 1 | OK (fixed 2026-04-03) |
|
||||
| 225 | gitea | LXC | 1 | OK |
|
||||
| 227 | uptime-kuma | LXC | 1 | OK |
|
||||
| 301 | claude-discord-coordinator | LXC | 1 | OK |
|
||||
| 302 | claude-runner | LXC | 1 | OK |
|
||||
| 303 | mcp-gateway | LXC | 0 | Intentional (on-demand) |
|
||||
| 304 | ansible-controller | LXC | 1 | OK |
|
||||
|
||||
**If any production guest is missing `onboot: 1`:**
|
||||
```bash
|
||||
ssh proxmox "qm set <VMID> --onboot 1" # for VMs
|
||||
ssh proxmox "pct set <CTID> --onboot 1" # for LXCs
|
||||
```
|
||||
|
||||
## Shutdown Order (Dependency-Aware)
|
||||
|
||||
Reverse of the validated startup sequence. Stop consumers before their dependencies. Each tier polls per-guest status rather than using fixed waits.
|
||||
|
||||
```
|
||||
Tier 4 — Media & Others (no downstream dependents)
|
||||
VM 109 homeassistant
|
||||
LXC 221 arr-stack
|
||||
LXC 222 memos
|
||||
LXC 223 foundry-lxc
|
||||
LXC 302 claude-runner
|
||||
|
||||
Tier 3 — Applications (depend on databases + infra)
|
||||
VM 115 docker-sba (Paper Dynasty, Major Domo)
|
||||
VM 110 discord-bots
|
||||
LXC 301 claude-discord-coordinator
|
||||
|
||||
Tier 2 — Infrastructure + DNS (depend on databases)
|
||||
VM 106 docker-home (Pi-hole 1, NPM)
|
||||
LXC 225 gitea
|
||||
LXC 210 docker-n8n-lxc
|
||||
LXC 227 uptime-kuma
|
||||
VM 116 docker-home-servers
|
||||
|
||||
Tier 1 — Databases (no dependencies, shut down last)
|
||||
VM 112 databases-bots (force-stop after 90s if ACPI ignored)
|
||||
|
||||
→ LXC 304 issues fire-and-forget reboot to Proxmox host, then is killed
|
||||
```
|
||||
|
||||
**Known quirks:**
|
||||
- VM 112 (databases-bots) may ignore ACPI shutdown — playbook force-stops after 90s
|
||||
- VM 109 (homeassistant) is self-managed via HA Supervisor, excluded from Ansible inventory
|
||||
- LXC 303 (mcp-gateway) has `onboot: 0` and is operator-managed — not included in shutdown/startup. If it was running before maintenance, bring it up manually afterward
|
||||
|
||||
## Startup Order (Staggered)
|
||||
|
||||
After the Proxmox host reboots, LXC 304 auto-starts and the `ansible-post-reboot.service` waits 120s before running the controlled startup:
|
||||
|
||||
```
|
||||
Tier 1 — Databases first
|
||||
VM 112 databases-bots
|
||||
→ wait 30s for DB to accept connections
|
||||
|
||||
Tier 2 — Infrastructure + DNS
|
||||
VM 106 docker-home (Pi-hole 1, NPM)
|
||||
LXC 225 gitea
|
||||
LXC 210 docker-n8n-lxc
|
||||
LXC 227 uptime-kuma
|
||||
VM 116 docker-home-servers
|
||||
→ wait 30s
|
||||
|
||||
Tier 3 — Applications
|
||||
VM 115 docker-sba
|
||||
VM 110 discord-bots
|
||||
LXC 301 claude-discord-coordinator
|
||||
→ wait 30s
|
||||
|
||||
Pi-hole fix — restart container via SSH to clear UDP DNS bug
|
||||
ssh docker-home "docker restart pihole"
|
||||
→ wait 10s
|
||||
|
||||
Tier 4 — Media & Others
|
||||
VM 109 homeassistant
|
||||
LXC 221 arr-stack
|
||||
LXC 222 memos
|
||||
LXC 223 foundry-lxc
|
||||
LXC 302 claude-runner
|
||||
```
|
||||
|
||||
## Post-Reboot Validation
|
||||
|
||||
- [ ] Pi-hole 1 DNS resolving: `ssh docker-home "docker exec pihole dig google.com @127.0.0.1"`
|
||||
- [ ] Gitea accessible: `curl -sf https://git.manticorum.com/api/v1/version`
|
||||
- [ ] n8n healthy: `ssh docker-n8n-lxc "docker ps --filter name=n8n --format '{{.Status}}'"`
|
||||
- [ ] Discord bots responding (check Discord)
|
||||
- [ ] Uptime Kuma dashboard green: `curl -sf http://10.10.0.227:3001/api/status-page/homelab`
|
||||
- [ ] Home Assistant running: `curl -sf http://10.10.0.109:8123/api/ -H 'Authorization: Bearer <token>'`
|
||||
- [ ] Maintenance snapshots cleaned up (auto, 7-day retention)
|
||||
|
||||
## Automation
|
||||
|
||||
### Ansible Playbooks
|
||||
|
||||
Both located at `/opt/ansible/playbooks/` on LXC 304.
|
||||
|
||||
```bash
|
||||
# Dry run — shutdown only
|
||||
ssh ansible "ansible-playbook /opt/ansible/playbooks/monthly-reboot.yml --check"
|
||||
|
||||
# Manual full execution — shutdown + reboot
|
||||
ssh ansible "ansible-playbook /opt/ansible/playbooks/monthly-reboot.yml"
|
||||
|
||||
# Manual post-reboot startup (if automatic startup failed)
|
||||
ssh ansible "ansible-playbook /opt/ansible/playbooks/post-reboot-startup.yml"
|
||||
|
||||
# Shutdown only — skip the host reboot
|
||||
ssh ansible "ansible-playbook /opt/ansible/playbooks/monthly-reboot.yml --tags shutdown"
|
||||
```
|
||||
|
||||
### Systemd Units (on LXC 304)
|
||||
|
||||
| Unit | Purpose | Schedule |
|
||||
|------|---------|----------|
|
||||
| `ansible-monthly-reboot.timer` | Triggers shutdown + reboot playbook | 1st Sunday of month, 08:00 UTC |
|
||||
| `ansible-monthly-reboot.service` | Runs `monthly-reboot.yml` | Activated by timer |
|
||||
| `ansible-post-reboot.service` | Runs `post-reboot-startup.yml` | On boot (multi-user.target), only if uptime < 10 min |
|
||||
|
||||
```bash
|
||||
# Check timer status
|
||||
ssh ansible "systemctl status ansible-monthly-reboot.timer"
|
||||
|
||||
# Next scheduled run
|
||||
ssh ansible "systemctl list-timers ansible-monthly-reboot.timer"
|
||||
|
||||
# Check post-reboot service status
|
||||
ssh ansible "systemctl status ansible-post-reboot.service"
|
||||
|
||||
# Disable for a month (e.g., during an incident)
|
||||
ssh ansible "systemctl stop ansible-monthly-reboot.timer"
|
||||
```
|
||||
|
||||
### Deployment (one-time setup on LXC 304)
|
||||
|
||||
```bash
|
||||
# Copy playbooks
|
||||
scp ansible/playbooks/monthly-reboot.yml ansible:/opt/ansible/playbooks/
|
||||
scp ansible/playbooks/post-reboot-startup.yml ansible:/opt/ansible/playbooks/
|
||||
|
||||
# Copy and enable systemd units
|
||||
scp ansible/systemd/ansible-monthly-reboot.timer ansible:/etc/systemd/system/
|
||||
scp ansible/systemd/ansible-monthly-reboot.service ansible:/etc/systemd/system/
|
||||
scp ansible/systemd/ansible-post-reboot.service ansible:/etc/systemd/system/
|
||||
ssh ansible "sudo systemctl daemon-reload && \
|
||||
sudo systemctl enable --now ansible-monthly-reboot.timer && \
|
||||
sudo systemctl enable ansible-post-reboot.service"
|
||||
|
||||
# Verify SSH key access from LXC 304 to docker-home (needed for Pi-hole restart)
|
||||
ssh ansible "ssh -o BatchMode=yes docker-home 'echo ok'"
|
||||
```
|
||||
|
||||
## Rollback
|
||||
|
||||
If a guest fails to start after reboot:
|
||||
1. Check Proxmox web UI or `pvesh get /nodes/proxmox/qemu/<VMID>/status/current`
|
||||
2. Review guest logs: `ssh proxmox "journalctl -u pve-guests -n 50"`
|
||||
3. Manual start: `ssh proxmox "pvesh create /nodes/proxmox/qemu/<VMID>/status/start"`
|
||||
4. If guest is corrupted, restore from the pre-reboot Proxmox snapshot
|
||||
5. If post-reboot startup failed entirely, run manually: `ssh ansible "ansible-playbook /opt/ansible/playbooks/post-reboot-startup.yml"`
|
||||
|
||||
## Related Documentation
|
||||
|
||||
- [Ansible Controller Setup](../../vm-management/ansible-controller-setup.md) — LXC 304 details and inventory
|
||||
- [Proxmox 7→9 Upgrade Plan](../../vm-management/proxmox-upgrades/proxmox-7-to-9-upgrade-plan.md) — original startup order and Phase 1 lessons
|
||||
- [VM Decommission Runbook](../../vm-management/vm-decommission-runbook.md) — removing VMs from the rotation
|
||||
15
server-configs/proxmox/qemu/105.conf
Normal file
15
server-configs/proxmox/qemu/105.conf
Normal file
@ -0,0 +1,15 @@
|
||||
agent: 1
|
||||
boot: order=scsi0;net0
|
||||
cores: 8
|
||||
memory: 16384
|
||||
meta: creation-qemu=6.1.0,ctime=1646688596
|
||||
name: docker-vpn
|
||||
net0: virtio=76:36:85:A7:6A:A3,bridge=vmbr0,firewall=1
|
||||
numa: 0
|
||||
onboot: 1
|
||||
ostype: l26
|
||||
scsi0: local-lvm:vm-105-disk-0,size=256G
|
||||
scsihw: virtio-scsi-pci
|
||||
smbios1: uuid=55061264-b9b1-4ce4-8d44-9c187affcb1d
|
||||
sockets: 1
|
||||
vmgenid: 30878bdf-66f9-41bf-be34-c31b400340f9
|
||||
@ -12,5 +12,5 @@ ostype: l26
|
||||
scsi0: local-lvm:vm-115-disk-0,size=256G
|
||||
scsihw: virtio-scsi-pci
|
||||
smbios1: uuid=19be98ee-f60d-473d-acd2-9164717fcd11
|
||||
sockets: 1
|
||||
sockets: 2
|
||||
vmgenid: 682dfeab-8c63-4f0b-8ed2-8828c2f808ef
|
||||
|
||||
@ -3,7 +3,6 @@ services:
|
||||
tdarr:
|
||||
image: ghcr.io/haveagitgat/tdarr:latest
|
||||
container_name: tdarr-server
|
||||
init: true
|
||||
restart: unless-stopped
|
||||
ports:
|
||||
- "8265:8265" # Web UI
|
||||
@ -24,7 +23,6 @@ services:
|
||||
tdarr-node:
|
||||
image: ghcr.io/haveagitgat/tdarr_node:latest
|
||||
container_name: tdarr-node
|
||||
init: true
|
||||
restart: unless-stopped
|
||||
environment:
|
||||
- PUID=1000
|
||||
@ -39,8 +37,6 @@ services:
|
||||
- /mnt/NV2/tdarr-cache:/temp
|
||||
deploy:
|
||||
resources:
|
||||
limits:
|
||||
memory: 28g
|
||||
reservations:
|
||||
devices:
|
||||
- driver: nvidia
|
||||
|
||||
@ -18,7 +18,6 @@ Virtual machine management for home lab environments with focus on automated pro
|
||||
- **SSH**: `ssh -i ~/.ssh/homelab_rsa root@10.10.0.11`
|
||||
- **Storage**: local (100GB dir), local-lvm (2.3TB thin), home-truenas (17TB CIFS at 10.10.0.35)
|
||||
- **Networking**: vmbr0 (10.10.0.x/24 via eno1), vmbr1 (10.0.0.x/24 via eno2, Matter/IoT)
|
||||
- **Ansible Controller**: LXC 304 at 10.10.0.232 — automated updates with snapshot rollback, weekly systemd timer (Sun 3 AM). See `ansible-controller-setup.md`
|
||||
- **Upgrade plan**: Phase 2 (PVE 8→9) pending — see `proxmox-upgrades/proxmox-7-to-9-upgrade-plan.md`
|
||||
|
||||
## Architecture Patterns
|
||||
|
||||
@ -1,162 +0,0 @@
|
||||
---
|
||||
title: "Ansible Controller LXC Setup"
|
||||
description: "Complete setup guide for LXC 304 (ansible-controller) at 10.10.0.232 — automated OS/Docker updates with Proxmox snapshot rollback across all VMs, LXCs, and physical servers."
|
||||
type: guide
|
||||
domain: vm-management
|
||||
tags: [ansible, proxmox, lxc, automation, updates, snapshots, rollback, systemd]
|
||||
---
|
||||
|
||||
# Ansible Controller LXC Setup
|
||||
|
||||
Centralized Ansible controller for automated infrastructure updates with Proxmox snapshot-based rollback.
|
||||
|
||||
## LXC Details
|
||||
|
||||
- **VMID**: 304
|
||||
- **Hostname**: ansible-controller
|
||||
- **IP**: 10.10.0.232
|
||||
- **SSH alias**: `ansible-controller` or `ansible`
|
||||
- **OS**: Ubuntu 24.04
|
||||
- **Resources**: 2 cores, 2GB RAM, 16GB disk
|
||||
- **Ansible version**: 2.20.4 (from PPA)
|
||||
- **Collections**: community.general, community.docker (bundled)
|
||||
- **User**: `cal` runs playbooks, SSH key at `/home/cal/.ssh/homelab_rsa`
|
||||
|
||||
## Directory Layout
|
||||
|
||||
```
|
||||
/opt/ansible/
|
||||
├── ansible.cfg # Main config (pipelining, forks=5)
|
||||
├── inventory/
|
||||
│ └── hosts.yml # Full infrastructure inventory
|
||||
├── playbooks/
|
||||
│ ├── update-all.yml # Full cycle: snapshot → OS → Docker → health → cleanup
|
||||
│ ├── os-update-only.yml # OS packages only (lighter)
|
||||
│ ├── rollback.yml # Roll back any host to a snapshot
|
||||
│ └── check-status.yml # Read-only health/status check
|
||||
├── run-update.sh # Runner script with logging
|
||||
├── roles/ # (empty, for future use)
|
||||
└── logs/ # Update run logs (12-week retention)
|
||||
```
|
||||
|
||||
## Managed Hosts (15 total)
|
||||
|
||||
### Proxmox Host
|
||||
| Host | IP | User |
|
||||
|------|----|------|
|
||||
| pve-node | 10.10.0.11 | root |
|
||||
|
||||
### VMs
|
||||
| Host | IP | VMID | User | Python |
|
||||
|------|-----|------|------|--------|
|
||||
| docker-home | 10.10.0.16 | 106 | cal | 3.9 |
|
||||
| discord-bots | 10.10.0.33 | 110 | cal | 3.9 |
|
||||
| databases-bots | 10.10.0.42 | 112 | cal | 3.9 |
|
||||
| docker-sba | 10.10.0.88 | 115 | cal | 3.9 |
|
||||
| docker-home-servers | 10.10.0.124 | 116 | cal | 3.9 |
|
||||
|
||||
### LXCs
|
||||
| Host | IP | VMID | User | Python |
|
||||
|------|-----|------|------|--------|
|
||||
| docker-n8n-lxc | 10.10.0.210 | 210 | root | 3.9 |
|
||||
| arr-stack | 10.10.0.221 | 221 | root | 3.9 |
|
||||
| memos | 10.10.0.222 | 222 | root | 3.9 |
|
||||
| foundry-lxc | 10.10.0.223 | 223 | root | 3.9 |
|
||||
| gitea | 10.10.0.225 | 225 | root | 3.9 |
|
||||
| uptime-kuma | 10.10.0.227 | 227 | root | 3.10 |
|
||||
| claude-discord-coordinator | 10.10.0.230 | 301 | root | 3.12 |
|
||||
| claude-runner | 10.10.0.148 | 302 | root | 3.12 |
|
||||
|
||||
### Physical
|
||||
| Host | IP | User | Python |
|
||||
|------|----|------|--------|
|
||||
| ubuntu-manticore | 10.10.0.226 | cal | 3.12 |
|
||||
|
||||
### Excluded
|
||||
- **Home Assistant** (VM 109): self-managed via HA Supervisor
|
||||
- **Palworld** (LXC 230): deleted 2026-03-25 to resolve an IP collision with LXC 301 (freed the IP)
|
||||
|
||||
## Usage
|
||||
|
||||
SSH to the controller and run as `cal`:
|
||||
|
||||
```bash
|
||||
ssh ansible
|
||||
export ANSIBLE_CONFIG=/opt/ansible/ansible.cfg
|
||||
|
||||
# Check status of everything
|
||||
ansible-playbook /opt/ansible/playbooks/check-status.yml
|
||||
|
||||
# Full update cycle (snapshot → update → health check → cleanup)
|
||||
ansible-playbook /opt/ansible/playbooks/update-all.yml
|
||||
|
||||
# Update specific group
|
||||
ansible-playbook /opt/ansible/playbooks/update-all.yml --limit lxcs
|
||||
ansible-playbook /opt/ansible/playbooks/update-all.yml --limit docker-home
|
||||
|
||||
# Dry run
|
||||
ansible-playbook /opt/ansible/playbooks/update-all.yml --check
|
||||
|
||||
# OS updates only (no Docker)
|
||||
ansible-playbook /opt/ansible/playbooks/os-update-only.yml
|
||||
|
||||
# Skip snapshots
|
||||
ansible-playbook /opt/ansible/playbooks/update-all.yml -e skip_snapshot=true
|
||||
|
||||
# Roll back a host to latest snapshot
|
||||
ansible-playbook /opt/ansible/playbooks/rollback.yml --limit gitea
|
||||
|
||||
# Roll back to specific snapshot
|
||||
ansible-playbook /opt/ansible/playbooks/rollback.yml --limit gitea -e snapshot=pre-update-2026-03-25
|
||||
```
|
||||
|
||||
## Update Pipeline (update-all.yml)
|
||||
|
||||
1. **Snapshot**: Creates `pre-update-YYYY-MM-DD` snapshot on each Proxmox guest via `pvesh`
|
||||
2. **OS Update**: `apt update`, then a safe `apt upgrade`, then `apt autoremove` (serial: 3)
|
||||
3. **Docker Update**: Finds compose files, pulls images, restarts changed stacks (serial: 1)
|
||||
4. **Health Check**: SSH ping, disk space warning (>89%), exited container report
|
||||
5. **Snapshot Cleanup**: Keeps last 3 `pre-update-*` snapshots per host
|
||||
|
||||
## Scheduled Runs
|
||||
|
||||
Systemd timer runs every **Sunday at 3:00 AM UTC** with up to 10 min jitter.
|
||||
`Persistent=true` ensures missed runs execute on next boot.
|
||||
|
||||
```bash
|
||||
# Check timer status
|
||||
ssh ansible "systemctl status ansible-update.timer"
|
||||
|
||||
# View last run
|
||||
ssh ansible "systemctl status ansible-update.service"
|
||||
|
||||
# View logs
|
||||
ssh ansible "ls -lt /opt/ansible/logs/ | head -5"
|
||||
ssh ansible "journalctl -u ansible-update.service --no-pager -n 50"
|
||||
```
|
||||
|
||||
## Inventory Groups
|
||||
|
||||
- `proxmox_host` — just pve-node
|
||||
- `vms` — all QEMU VMs
|
||||
- `lxcs` — all LXC containers
|
||||
- `physical` — bare-metal servers (manticore)
|
||||
- `docker_hosts` — any host running Docker compose stacks
|
||||
- `proxmox_guests` — union of vms + lxcs (snapshotable)
|
||||
|
||||
## Adding a New Host
|
||||
|
||||
1. Add entry to `/opt/ansible/inventory/hosts.yml` under the appropriate group
|
||||
2. Include: `ansible_host`, `ansible_user`, `proxmox_vmid`, `proxmox_type` (for guests)
|
||||
3. Set `ansible_python_interpreter` if the host's default Python is < 3.9
|
||||
4. Ensure SSH key (`/home/cal/.ssh/homelab_rsa`) is authorized on the target
|
||||
5. For VMs: ensure NOPASSWD sudo for `cal` user
|
||||
6. Test: `ansible <hostname> -m ping`
|
||||
|
||||
## Setup Prerequisites Fixed During Initial Deployment
|
||||
|
||||
- **Python 3.9** installed via deadsnakes PPA on all Ubuntu 20.04 hosts (Ansible 2.20 requires ≥3.9)
|
||||
- **NOPASSWD sudo** set via `/etc/sudoers.d/cal` on all VMs and manticore
|
||||
- **qemu-guest-agent** enabled on VM 112 (databases-bots)
|
||||
- **VM 116 disk** expanded from 31GB→315GB (was 100% full), DNS fixed (missing resolv.conf)
|
||||
- **IP collision** between LXC 230 (palworld) and LXC 301 (claude-discord-coordinator) resolved by deleting palworld
|
||||
@ -28,8 +28,8 @@ tags: [proxmox, upgrade, pve, backup, rollback, infrastructure]
|
||||
**Production Services** (7 LXC + 7 VMs) — cleaned up 2026-02-19:
|
||||
- **Critical**: Paper Dynasty/Major Domo (VM 115), Discord bots (VM 110), Gitea (LXC 225), n8n (LXC 210), Home Assistant (VM 109), Databases (VM 112), docker-home/Pi-hole 1 (VM 106)
|
||||
- **Important**: Claude Discord Coordinator (LXC 301), arr-stack (LXC 221), Uptime Kuma (LXC 227), Foundry VTT (LXC 223), Memos (LXC 222)
|
||||
- **Decommission Candidate**: docker-home-servers (VM 116) — Jellyfin-only after 2026-04-03 cleanup; watchstate removed (duplicate of manticore); see issue #31
|
||||
- **Removed (2026-02-19)**: 108 (ansible), 224 (openclaw), 300 (openclaw-migrated), 101/102/104/111/211 (game servers), 107 (plex), 113 (tdarr - moved to .226), 114 (duplicate arr-stack), 117 (unused), 100/103 (old templates), 105 (docker-vpn - decommissioned 2026-04)
|
||||
- **Stopped/Investigate**: docker-vpn (VM 105, decommissioning), docker-home-servers (VM 116, needs investigation)
|
||||
- **Removed (2026-02-19)**: 108 (ansible), 224 (openclaw), 300 (openclaw-migrated), 101/102/104/111/211 (game servers), 107 (plex), 113 (tdarr - moved to .226), 114 (duplicate arr-stack), 117 (unused), 100/103 (old templates)
|
||||
|
||||
**Key Constraints**:
|
||||
- Home Assistant VM 109 requires dual network (vmbr1 for Matter support)
|
||||
|
||||
@ -67,15 +67,10 @@ runcmd:
|
||||
|
||||
# Add cal user to docker group (will take effect after next login)
|
||||
- usermod -aG docker cal
|
||||
|
||||
|
||||
# Test Docker installation
|
||||
- docker run --rm hello-world
|
||||
|
||||
# Mask avahi-daemon — not needed in a static-IP homelab with Pi-hole DNS,
|
||||
# and has a known kernel busy-loop bug that wastes CPU
|
||||
- systemctl stop avahi-daemon || true
|
||||
- systemctl mask avahi-daemon
|
||||
|
||||
# Write configuration files
|
||||
write_files:
|
||||
# SSH hardening configuration
|
||||
|
||||
@ -657,33 +657,4 @@ docker system info && docker system df
|
||||
- **Proxmox Console**: Direct VM access when SSH fails
|
||||
- **Emergency Contact**: Use Discord notifications for critical issues
|
||||
|
||||
This troubleshooting guide covers comprehensive recovery procedures for VM management issues in home lab environments.
|
||||
|
||||
## ubuntu-manticore crash recovery — initramfs fsck on wrong device (2026-03-24)
|
||||
|
||||
**Severity:** High — server unbootable, all services down (pihole DNS, jellyfin, tdarr, kb-rag)
|
||||
|
||||
**Problem:** After a physical server crash, ubuntu-manticore dropped to initramfs shell. Boot fsck targeted `/dev/nvme0n1p2` (an NTFS data partition labeled "2TB 970") instead of the actual ext4 root on `/dev/nvme1n1p2`. The generic busybox `fsck -y` wrapper didn't invoke the ext4 backend.
|
||||
|
||||
**Root Cause:** Two issues compounded: (1) The crash corrupted the ext4 root filesystem (block/inode count mismatches across ~15 groups). (2) The initramfs resolved the root device UUID to the wrong NVMe drive — `nvme0n1p2` instead of `nvme1n1p2`. NVMe device enumeration order can shift between boots; fstab uses UUIDs correctly but the initramfs got confused during this boot.
|
||||
|
||||
**Fix:** Ran `/usr/sbin/fsck.ext4 -y /dev/nvme1n1p2` directly from initramfs (identified correct partition via `blkid`). After `exit`, boot completed normally and all 9 Docker containers came up automatically via restart policies.
|
||||
|
||||
**Crash cause investigation:**
|
||||
- Kernel panic: `BUG: unable to handle page fault for address: fffffb2320041d50` — supervisor write to not-present page
|
||||
- PCIe AER correctable errors (Data Link Layer Timeout) on port `0000:00:01.2` (AMD X470/B450 root port) logged on Mar 19
|
||||
- Nvidia proprietary driver loaded, kernel tainted — common source of page faults
|
||||
- AMD Zen1 DIV0 bug flagged at boot (Ryzen 5 2600)
|
||||
- SMART data: both Samsung 970s healthy (PASSED), but nvme1 (250GB root drive, 22k hours) has **1 Media and Data Integrity Error** — monitor for growth
|
||||
- nvme0 (2TB data): 0 media errors, 2% used, 1,571 hours — clean
|
||||
- Most likely cause: Nvidia driver panic or PCIe timeout on NVMe controller
|
||||
|
||||
**Remediation:** Upgraded Nvidia driver 570.211.01 → 580.126.09. The 570 packages were in a broken state (partial upgrade left metapackage pinned at `.24.04.1` while deps moved to `.24.04.2`), requiring explicit removal of `nvidia-driver-570 nvidia-dkms-570 nvidia-kernel-source-570 nvidia-kernel-common-570 libnvidia-common-570 libnvidia-gl-570` with `--allow-change-held-packages` before 580 could install cleanly. Note: 590 drivers reported unstable — avoid.
|
||||
|
||||
**Lesson:**
|
||||
- Always use `blkid` in initramfs to confirm the actual root partition before running fsck — NVMe device ordering is not stable across boots
|
||||
- Use `/usr/sbin/fsck.ext4 -y` directly rather than the busybox `fsck` wrapper, which may not invoke the correct backend
|
||||
- Docker containers with restart policies recovered without intervention — validates that approach
|
||||
- Install `smartmontools` on bare-metal servers proactively — wasn't available during initial investigation
|
||||
- Monitor nvme1 media integrity error count; if it increments, plan replacement
|
||||
- When upgrading Nvidia driver major versions on Ubuntu, apt often can't resolve conflicts automatically — explicitly remove the old driver packages with `--allow-change-held-packages` first
|
||||
This troubleshooting guide covers comprehensive recovery procedures for VM management issues in home lab environments.
|
||||
@ -1,163 +0,0 @@
|
||||
---
|
||||
title: "VM Decommission Runbook"
|
||||
description: "Step-by-step procedure for safely decommissioning a Proxmox VM — dependency checks, destruction, and repo cleanup."
|
||||
type: runbook
|
||||
domain: vm-management
|
||||
tags: [proxmox, decommission, infrastructure, cleanup]
|
||||
---
|
||||
|
||||
# VM Decommission Runbook
|
||||
|
||||
Procedure for safely removing a stopped Proxmox VM and reclaiming its disk space. Derived from the VM 105 (docker-vpn) decommission (2026-04-02, issue #20).
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- VM must already be **stopped** on Proxmox
|
||||
- Services previously running on the VM must be confirmed migrated or no longer needed
|
||||
- SSH access to Proxmox host (`ssh proxmox`)
|
||||
|
||||
## Phase 1 — Dependency Verification
|
||||
|
||||
Run all checks before destroying anything. A clean result on all five means safe to proceed.
|
||||
|
||||
### 1.1 Pi-hole DNS
|
||||
|
||||
Check both primary and secondary Pi-hole for DNS records pointing to the VM's IP:
|
||||
|
||||
```bash
|
||||
ssh pihole "grep '<VM_IP>' /etc/pihole/custom.list || echo 'No DNS entries'"
|
||||
ssh pihole "pihole -q <VM_HOSTNAME>"
|
||||
```
|
||||
|
||||
### 1.2 Nginx Proxy Manager (NPM)
|
||||
|
||||
Check NPM for any proxy hosts with the VM's IP as an upstream:
|
||||
|
||||
- NPM UI: https://npm.manticorum.com → Proxy Hosts → search for VM IP
|
||||
- Or via API: `ssh npm-pihole "curl -s http://localhost:81/api/nginx/proxy-hosts" | grep <VM_IP>`
|
||||
|
||||
### 1.3 Proxmox Firewall Rules
|
||||
|
||||
```bash
|
||||
ssh proxmox "cat /etc/pve/firewall/<VMID>.fw 2>/dev/null || echo 'No firewall rules'"
|
||||
```
|
||||
|
||||
### 1.4 Backup Existence
|
||||
|
||||
```bash
|
||||
ssh proxmox "ls -la /var/lib/vz/dump/ | grep <VMID>"
|
||||
```
|
||||
|
||||
### 1.5 VPN / Tunnel References
|
||||
|
||||
Check if any WireGuard or VPN configs on other hosts reference this VM:
|
||||
|
||||
```bash
|
||||
ssh proxmox "grep -r '<VM_IP>' /etc/wireguard/ 2>/dev/null || echo 'No WireGuard refs'"
|
||||
```
|
||||
|
||||
Also check SSH config and any automation scripts in the claude-home repo:
|
||||
|
||||
```bash
|
||||
grep -r '<VM_IP>\|<VM_HOSTNAME>' ~/Development/claude-home/
|
||||
```
|
||||
|
||||
## Phase 2 — Safety Measures
|
||||
|
||||
### 2.1 Disable Auto-Start
|
||||
|
||||
Prevent the VM from starting on Proxmox reboot while you work:
|
||||
|
||||
```bash
|
||||
ssh proxmox "qm set <VMID> --onboot 0"
|
||||
```
|
||||
|
||||
### 2.2 Record Disk Space (Before)
|
||||
|
||||
```bash
|
||||
ssh proxmox "lvs | grep pve"
|
||||
```
|
||||
|
||||
Save this output for comparison after destruction.
|
||||
|
||||
### 2.3 Optional: Take a Final Backup
|
||||
|
||||
If the VM might contain anything worth preserving:
|
||||
|
||||
```bash
|
||||
ssh proxmox "vzdump <VMID> --mode snapshot --storage home-truenas --compress zstd"
|
||||
```
|
||||
|
||||
Skip if the VM has been stopped for a long time and all services are confirmed migrated.
|
||||
|
||||
## Phase 3 — Destroy
|
||||
|
||||
```bash
|
||||
ssh proxmox "qm destroy <VMID> --purge"
|
||||
```
|
||||
|
||||
The `--purge` flag removes the disk along with the VM config. Verify:
|
||||
|
||||
```bash
|
||||
ssh proxmox "qm list | grep <VMID>" # Should return nothing
|
||||
ssh proxmox "lvs | grep vm-<VMID>-disk" # Should return nothing
|
||||
ssh proxmox "lvs | grep pve" # Compare with Phase 2.2
|
||||
```
|
||||
|
||||
## Phase 4 — Repo Cleanup
|
||||
|
||||
Update these files in the `claude-home` repo:
|
||||
|
||||
| File | Action |
|
||||
|------|--------|
|
||||
| `~/.ssh/config` | Comment out Host block, add `# DECOMMISSIONED: <name> (<IP>) - <reason>` |
|
||||
| `server-configs/proxmox/qemu/<VMID>.conf` | Delete the file |
|
||||
| Migration results (if applicable) | Check off decommission tasks |
|
||||
| `vm-management/proxmox-upgrades/proxmox-7-to-9-upgrade-plan.md` | Move from Stopped/Investigate to Decommissioned |
|
||||
| `networking/examples/ssh-homelab-setup.md` | Comment out or remove entry |
|
||||
| `networking/examples/server_inventory.yaml` | Comment out or remove entry |
|
||||
|
||||
Leave historical/planning docs (migration plans, wave results) as-is — they serve as historical records.
|
||||
|
||||
## Phase 5 — Commit and PR
|
||||
|
||||
Branch naming: `chore/<ISSUE_NUMBER>-decommission-<vm-name>`
|
||||
|
||||
Commit message format:
|
||||
```
|
||||
chore: decommission VM <VMID> (<name>) — reclaim <SIZE> disk (#<ISSUE>)
|
||||
|
||||
Closes #<ISSUE>
|
||||
```
|
||||
|
||||
This is typically a docs-only PR (all `.md` and config files) which gets auto-approved by the `auto-merge-docs` workflow.
|
||||
|
||||
## Checklist Template
|
||||
|
||||
Copy this for each decommission:
|
||||
|
||||
```markdown
|
||||
### VM <VMID> (<name>) Decommission
|
||||
|
||||
**Pre-deletion verification:**
|
||||
- [ ] Pi-hole DNS — no records
|
||||
- [ ] NPM upstreams — no proxy hosts
|
||||
- [ ] Proxmox firewall — no rules
|
||||
- [ ] Backup status — verified
|
||||
- [ ] VPN/tunnel references — none
|
||||
|
||||
**Execution:**
|
||||
- [ ] Disabled onboot
|
||||
- [ ] Recorded disk space before
|
||||
- [ ] Took backup (or confirmed skip)
|
||||
- [ ] Destroyed VM with --purge
|
||||
- [ ] Verified disk space reclaimed
|
||||
|
||||
**Cleanup:**
|
||||
- [ ] SSH config updated
|
||||
- [ ] VM config file deleted from repo
|
||||
- [ ] Migration docs updated
|
||||
- [ ] Upgrade plan updated
|
||||
- [ ] Example files updated
|
||||
- [ ] Committed, pushed, PR created
|
||||
```
|
||||
@ -262,7 +262,7 @@ When connecting Jellyseerr to arr apps, be careful with tag configurations - inv
|
||||
- [x] Test movie/show requests through Jellyseerr
|
||||
|
||||
### After 48 Hours
|
||||
- [x] Decommission VM 121 (docker-vpn)
|
||||
- [ ] Decommission VM 121 (docker-vpn)
|
||||
- [ ] Clean up local migration temp files (`/tmp/arr-config-migration/`)
|
||||
|
||||
---
|
||||
|
||||
@ -90,9 +90,12 @@ All defined in `~/.claude.json` under `mcpServers`:
|
||||
|
||||
Hooks are configured in `~/.claude/settings.json` under the `hooks` key. They run shell commands or HTTP requests in response to events.
|
||||
|
||||
Current hooks are defined in `~/.claude/settings.json` — check the `hooks` key for the live list.
|
||||
### Current Hooks
|
||||
|
||||
Additionally, the `format-on-save@agent-toolkit` plugin registers its own `PostToolUse` hook for auto-formatting files on Edit/Write (runs ruff for Python, prettier for JS/TS, shfmt for shell, etc). See the plugin source at `~/.claude/plugins/cache/agent-toolkit/format-on-save/` for the full formatter list.
|
||||
| Event | Action |
|
||||
|-------|--------|
|
||||
| `PostToolUse` (Edit/Write/MultiEdit) | Auto-format code via `format-code.sh` |
|
||||
| `SubagentStop` | Notify via `notify-subagent-done.sh` |
|
||||
|
||||
## Permissions
|
||||
|
||||
@ -100,44 +103,5 @@ Permission rules live in `~/.claude/settings.json` under `permissions.allow` and
|
||||
|
||||
Common patterns:
|
||||
- `"mcp__gitea-mcp__*"` — allow all gitea MCP tools
|
||||
- `"WebFetch(domain:*)"` — allow fetching from any domain
|
||||
- `"Bash"` — allow all Bash commands (subject to cmd-gate hook below)
|
||||
|
||||
## permission-manager Plugin (cmd-gate)
|
||||
|
||||
The `permission-manager@agent-toolkit` plugin adds a `PreToolUse` hook on all `Bash` tool calls. It parses commands into an AST via `shfmt --tojson`, classifies each segment, and returns allow/ask/deny.
|
||||
|
||||
### How it works
|
||||
|
||||
1. Compound commands (`&&`, `||`, pipes) are split into segments
|
||||
2. Each segment is checked against **custom allow patterns** first (bypasses classifiers)
|
||||
3. Then dispatched to language/tool-specific classifiers (git, docker, npm, etc.)
|
||||
4. Most restrictive result wins across all segments
|
||||
|
||||
### Custom allow patterns
|
||||
|
||||
Loaded from two files (both checked, merged):
|
||||
- **Global:** `~/.claude/command-permissions.json`
|
||||
- **Project-level:** `.claude/command-permissions.json` (relative to cwd)
|
||||
|
||||
Format:
|
||||
```json
|
||||
{
|
||||
"allow": [
|
||||
"git pull*",
|
||||
"git push origin main"
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
Patterns are **bash glob matched** per-segment against the command string. Project-level patterns are resolved from `$PWD` at hook time (i.e., the project Claude Code is working in, not the target of a `cd` in the command).
|
||||
|
||||
### Protected branches
|
||||
|
||||
The git classifier denies `git push` to `main` or `master` by default. To allow pushes to main from a specific project, add `"git push origin main"` to that project's `.claude/command-permissions.json`.
|
||||
|
||||
### Bypass
|
||||
|
||||
- Scheduled tasks use `--permission-mode bypassPermissions` which skips cmd-gate entirely
|
||||
- For classifier development details, see `development/permission-manager-classifier-development.md`
|
||||
- Plugin source: `~/.claude/plugins/cache/agent-toolkit/permission-manager/`
|
||||
- `"WebFetch(domain:docs.example.com)"` — allow fetching from specific domain
|
||||
- `"Bash(ssh:*)"` — allow SSH commands
|
||||
|
||||
@ -1,224 +0,0 @@
|
||||
---
|
||||
title: "Claude Code Multi-Account Setup with CLAUDE_CONFIG_DIR"
|
||||
description: "Guide to running multiple Claude Code OAuth accounts simultaneously using CLAUDE_CONFIG_DIR, direnv, and symlinked config directories."
|
||||
type: guide
|
||||
domain: workstation
|
||||
tags: [claude-code, oauth, direnv, config, multi-account]
|
||||
---
|
||||
|
||||
# Claude Code Multi-Account Setup
|
||||
|
||||
Run two different Claude OAuth accounts simultaneously — one for specific projects, another for everything else — using the `CLAUDE_CONFIG_DIR` environment variable and direnv.
|
||||
|
||||
## How CLAUDE_CONFIG_DIR Works
|
||||
|
||||
`CLAUDE_CONFIG_DIR` is an undocumented but fully implemented environment variable in the Claude Code binary. It controls where Claude Code looks for its entire configuration directory.
|
||||
|
||||
### Behavior When Set
|
||||
|
||||
| Aspect | Default | With CLAUDE_CONFIG_DIR |
|
||||
|--------|---------|----------------------|
|
||||
| Config directory | `~/.claude` | `$CLAUDE_CONFIG_DIR` |
|
||||
| Global state file | `~/.claude.json` | `$CLAUDE_CONFIG_DIR/.config.json` |
|
||||
| OAuth credentials | `~/.claude/.credentials.json` | `$CLAUDE_CONFIG_DIR/.credentials.json` |
|
||||
| Keychain entry name | `Claude Code` | `Claude Code-<hash>` (8-char hash of config path) |
|
||||
|
||||
### Internal Implementation
|
||||
|
||||
From the Claude Code binary source:
|
||||
|
||||
**Config dir resolution:**
|
||||
```js
|
||||
process.env.CLAUDE_CONFIG_DIR ?? path.join(os.homedir(), ".claude")
|
||||
```
|
||||
|
||||
**Global state file (`.config.json` fallback):**
|
||||
```js
|
||||
// When CLAUDE_CONFIG_DIR is set, checks for .config.json first
|
||||
if (existsSync(path.join(configDir, ".config.json")))
|
||||
return path.join(configDir, ".config.json");
|
||||
```
|
||||
|
||||
**Keychain collision avoidance:**
|
||||
```js
|
||||
// Adds hash suffix when CLAUDE_CONFIG_DIR is set
|
||||
let suffix = !process.env.CLAUDE_CONFIG_DIR ? ""
|
||||
: `-${hash(configDir).substring(0, 8)}`;
|
||||
return `Claude Code${suffix}`;
|
||||
```
|
||||
|
||||
**Propagated to subagents:** `CLAUDE_CONFIG_DIR` is in the explicit list of env vars passed to child processes (subagents, background agents), ensuring they use the same config directory.
|
||||
|
||||
## Setup Procedure
|
||||
|
||||
### Prerequisites
|
||||
|
||||
- Claude Code installed (native binary at `~/.local/bin/claude`)
|
||||
- direnv (`sudo dnf install direnv` on Fedora/Nobara)
|
||||
|
||||
### 1. Create Alternate Config Directory
|
||||
|
||||
```bash
|
||||
mkdir -p ~/.claude-ac
|
||||
```
|
||||
|
||||
The `-ac` suffix stands for "alternate config" — name it whatever you like.
|
||||
|
||||
### 2. Symlink Shared Config Files
|
||||
|
||||
Share configuration between accounts so both get the same CLAUDE.md, settings, MCP servers, hooks, etc.:
|
||||
|
||||
```bash
|
||||
# Core config files
|
||||
ln -s ~/.claude/CLAUDE.md ~/.claude-ac/CLAUDE.md
|
||||
ln -s ~/.claude/settings.json ~/.claude-ac/settings.json
|
||||
ln -s ~/.claude/command-permissions.json ~/.claude-ac/command-permissions.json
|
||||
|
||||
# Directories (agents, commands, hooks, skills, patterns, memory, plugins)
|
||||
for dir in agents commands hooks skills patterns memory plugins; do
|
||||
ln -s ~/.claude/$dir ~/.claude-ac/$dir
|
||||
done
|
||||
|
||||
# MCP servers + global state
|
||||
# When CLAUDE_CONFIG_DIR is set, Claude reads .config.json instead of ~/.claude.json
|
||||
ln -s ~/.claude.json ~/.claude-ac/.config.json
|
||||
```
|
||||
|
||||
#### What NOT to Symlink
|
||||
|
||||
These should remain independent per account:
|
||||
|
||||
| Path | Why independent |
|
||||
|------|----------------|
|
||||
| `.credentials.json` | Different OAuth tokens per account |
|
||||
| `projects/` | Session state tied to account |
|
||||
| `sessions/` | Active session registry |
|
||||
| `history.jsonl` | Conversation history |
|
||||
| `todos/`, `tasks/`, `plans/` | Conversation-scoped data |
|
||||
| `debug/`, `telemetry/`, `logs/` | Per-instance diagnostics |
|
||||
|
||||
These directories are created automatically by Claude Code on first use — no need to pre-create them.
|
||||
|
||||
### 3. Hook direnv Into Your Shell
|
||||
|
||||
**Fish** (`~/.config/fish/config.fish`, inside `if status is-interactive`):
|
||||
```fish
|
||||
direnv hook fish | source
|
||||
```
|
||||
|
||||
**Bash** (`~/.bashrc`):
|
||||
```bash
|
||||
eval "$(direnv hook bash)"
|
||||
```
|
||||
|
||||
### 4. Create `.envrc` for the Target Directory
|
||||
|
||||
For example, to use the alternate account in `~/work`:
|
||||
|
||||
```bash
|
||||
# ~/work/.envrc
|
||||
export CLAUDE_CONFIG_DIR="$HOME/.claude-ac"
|
||||
```
|
||||
|
||||
Allow it:
|
||||
```bash
|
||||
direnv allow ~/work/.envrc
|
||||
```
|
||||
|
||||
direnv automatically sets/unsets the env var when you `cd` into or out of `~/work` (and all subdirectories).
|
||||
|
||||
### 5. Log In With the Second Account
|
||||
|
||||
From within the target directory (where direnv activates):
|
||||
|
||||
```bash
|
||||
cd ~/work
|
||||
claude auth login
|
||||
```
|
||||
|
||||
This stores OAuth tokens in `~/.claude-ac/.credentials.json`, completely separate from the primary account.
|
||||
|
||||
### 6. Verify
|
||||
|
||||
```bash
|
||||
# In ~/work — should show alternate account
|
||||
cd ~/work && claude auth status
|
||||
|
||||
# Outside ~/work — should show primary account
|
||||
cd ~ && claude auth status
|
||||
```
|
||||
|
||||
Both accounts can run simultaneously in separate terminal windows.
|
||||
|
||||
## Current Configuration on This Workstation
|
||||
|
||||
| Location | Account | Purpose |
|
||||
|----------|---------|---------|
|
||||
| `~/.claude` | Primary (cal.corum@gmail.com) | All projects except ~/work |
|
||||
| `~/.claude-ac` | Alternate | ~/work projects |
|
||||
| `~/work/.envrc` | — | direnv trigger for CLAUDE_CONFIG_DIR |
|
||||
|
||||
## How It All Fits Together
|
||||
|
||||
```
|
||||
Terminal in ~/work/some-project/
|
||||
↓ cd triggers direnv
|
||||
↓ CLAUDE_CONFIG_DIR=~/.claude-ac
|
||||
↓ claude starts
|
||||
├── Config dir: ~/.claude-ac/
|
||||
├── Auth: ~/.claude-ac/.credentials.json (alt account)
|
||||
├── Settings: ~/.claude-ac/settings.json → symlink → ~/.claude/settings.json
|
||||
├── MCP servers: ~/.claude-ac/.config.json → symlink → ~/.claude.json
|
||||
├── Hooks: ~/.claude-ac/hooks/ → symlink → ~/.claude/hooks/
|
||||
└── Keychain: "Claude Code-a1b2c3d4" (hashed, no collision)
|
||||
|
||||
Terminal in ~/other-project/
|
||||
↓ no .envrc, CLAUDE_CONFIG_DIR unset
|
||||
↓ claude starts
|
||||
├── Config dir: ~/.claude/ (default)
|
||||
├── Auth: ~/.claude/.credentials.json (primary account)
|
||||
└── Keychain: "Claude Code" (default)
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Auth shows the wrong account
|
||||
Check that direnv loaded correctly:
|
||||
```bash
|
||||
echo $CLAUDE_CONFIG_DIR
|
||||
# Should show ~/.claude-ac when in ~/work, empty otherwise
|
||||
```
|
||||
|
||||
If empty, ensure:
|
||||
1. direnv hook is in your shell config
|
||||
2. You opened a new shell after adding the hook
|
||||
3. `direnv allow ~/work/.envrc` was run
|
||||
|
||||
### MCP servers not loading
|
||||
Verify the `.config.json` symlink:
|
||||
```bash
|
||||
ls -la ~/.claude-ac/.config.json
|
||||
# Should point to ~/.claude.json
|
||||
```
|
||||
|
||||
### direnv doesn't activate in subdirectories
|
||||
direnv walks up the directory tree, so `~/work/.envrc` covers all of `~/work/**`. If it doesn't activate:
|
||||
```bash
|
||||
direnv status # Shows which .envrc is loaded and why
|
||||
```
|
||||
|
||||
### Need to re-login
|
||||
```bash
|
||||
cd ~/work # Ensure direnv sets the env var
|
||||
claude auth logout
|
||||
claude auth login
|
||||
```
|
||||
|
||||
### Subagents use wrong account
|
||||
`CLAUDE_CONFIG_DIR` is in Claude Code's propagated env var list, so subagents inherit it automatically. If a subagent somehow uses the wrong account, verify the parent process has `CLAUDE_CONFIG_DIR` set.
|
||||
|
||||
## Caveats
|
||||
|
||||
- **Undocumented feature**: `CLAUDE_CONFIG_DIR` does not appear in `claude --help` or official docs. However, it is deeply integrated into the binary (config resolution, keychain naming, subagent propagation), suggesting it's intentional infrastructure.
|
||||
- **Version sensitivity**: When Claude Code updates add new shared config files to `~/.claude/`, the alternate config directory won't automatically get symlinks for them. Periodically check for new files that should be symlinked.
|
||||
- **Session isolation**: Even with symlinked memory, session history and project state are per-config-dir. Each account maintains its own conversation history.
|
||||
@ -1,33 +0,0 @@
|
||||
---
|
||||
title: "Workstation Troubleshooting"
|
||||
description: "Troubleshooting notes for Nobara/KDE Wayland workstation issues."
|
||||
type: troubleshooting
|
||||
domain: workstation
|
||||
tags: [troubleshooting, wayland, kde]
|
||||
---
|
||||
|
||||
# Workstation Troubleshooting
|
||||
|
||||
## Discord screen sharing shows no windows on KDE Wayland (2026-04-03)
|
||||
|
||||
**Severity:** Medium — cannot share screen via Discord desktop app
|
||||
|
||||
**Problem:** Clicking "Share Your Screen" in Discord desktop app (v0.0.131, Electron 37) opens the Discord picker but shows zero windows/screens. Same behavior in both the desktop app and the web app when using Discord's own picker. Affects both native Wayland and XWayland modes.
|
||||
|
||||
**Root Cause:** Discord's built-in screen picker uses Electron's `desktopCapturer.getSources()` which relies on X11 window enumeration. On KDE Wayland:
|
||||
- In native Wayland mode: no X11 windows exist, so the picker is empty
|
||||
- In forced X11/XWayland mode (`ELECTRON_OZONE_PLATFORM_HINT=x11`): Discord can only see other XWayland windows (itself, Android emulator), not native Wayland apps
|
||||
- Discord ignores `--use-fake-ui-for-media-stream` and other Chromium flags that should force portal usage
|
||||
- The `discord-flags.conf` file is **not read** by the Nobara/RPM Discord package — flags must go in the `.desktop` file `Exec=` line
|
||||
|
||||
**Fix:** Use the **Discord web app in Firefox** for screen sharing. Firefox natively delegates to the XDG Desktop Portal via PipeWire, which shows the KDE screen picker with all windows. The desktop app's own picker remains broken on Wayland as of v0.0.131.
|
||||
|
||||
Configuration applied (for general Discord Wayland support):
|
||||
- `~/.local/share/applications/discord.desktop` — overrides system `.desktop` with Wayland flags
|
||||
- `~/.config/discord-flags.conf` — created but not read by this Discord build
|
||||
|
||||
**Lesson:**
|
||||
- Discord desktop on Linux Wayland cannot do screen sharing through its own picker — always use the web app in Firefox for this
|
||||
- Electron's `desktopCapturer` API is fundamentally X11-only; the PipeWire/portal path requires the app to use `getDisplayMedia()` instead, which Discord's desktop app does not do
|
||||
- `discord-flags.conf` is unreliable across distros — always verify flags landed in `/proc/<pid>/cmdline`
|
||||
- Vesktop (community client) is an alternative that properly implements portal-based screen sharing, if the web app is insufficient
|
||||
Loading…
Reference in New Issue
Block a user