docs: expand monitoring coverage, update Proxmox upgrade plan, remove decommissioned tdarr scripts
- Update monitoring CONTEXT.md with 6-server inventory table, per-server SSH user support, and pre-escalation Discord notification docs - Remove tdarr local monitoring scripts (decommissioned per prior decision) - Update Proxmox upgrade plan with Phase 1 completion and Phase 2 prep - Update vm-management CONTEXT.md with current PVE 8 state - CLAUDE.md: auto-run /save-memories at 25% context instead of asking Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
f20e221090
commit
3737c7dda5
@ -7,7 +7,7 @@
|
||||
- Prefer editing existing files over creating new ones
|
||||
- After complex tasks, prompt to save learnings to cognitive memory
|
||||
- At session end, ask: "Should I update our documentation?"
|
||||
- At 25% context remaining, ask: "Should I update docs before we lose context?"
|
||||
- At 25% context remaining, automatically run `/save-memories` before compaction loses detail
|
||||
|
||||
## Context Loading
|
||||
When a topic comes up, load `{tech}/CONTEXT.md` + `{tech}/troubleshooting.md`. For scripts, also load `{tech}/scripts/CONTEXT.md`.
|
||||
|
||||
@ -1,6 +0,0 @@
|
||||
#!/bin/bash
# Cron job wrapper for Tdarr file monitor
# Add this to crontab with: * * * * * /mnt/NV2/Development/claude-home/monitoring/scripts/tdarr-file-monitor-cron.sh

# Run from the scripts directory; the monitor itself uses absolute paths, so
# this cd is defensive (keeps any relative output next to the script).
cd /mnt/NV2/Development/claude-home/monitoring/scripts
# Invoke the interpreter by absolute path — cron's PATH is minimal.
/usr/bin/python3 /mnt/NV2/Development/claude-home/monitoring/scripts/tdarr_file_monitor.py
|
||||
@ -1,286 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Tdarr File Monitor - Monitors Tdarr cache directory for completed .mkv files and copies them to backup location.
|
||||
Detects file completion by monitoring size changes and always keeps the smallest version of duplicate files.
|
||||
"""
|
||||
|
||||
import os
|
||||
import shutil
|
||||
import json
|
||||
import time
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from dataclasses import dataclass, asdict
|
||||
from typing import Dict, Optional
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
|
||||
@dataclass
class FileState:
    """Tracks the state of a monitored file.

    One instance per .mkv file under observation; serialized to/from JSON
    (via dataclasses.asdict / FileState(**data)) so state survives between
    cron-driven check cycles.
    """
    path: str               # absolute path of the monitored file (also the dict key)
    size: int               # size in bytes at the last check
    last_modified: float    # st_mtime from the last stat
    first_seen: float       # time.time() when monitoring of this file began
    last_size_change: float # time.time() of the most recent size change; drives completion detection
    check_count: int = 0    # number of check cycles this file has been observed
|
||||
|
||||
|
||||
class TdarrFileMonitor:
    """Monitors Tdarr cache directory for completed .mkv files.

    Designed to be invoked once per cycle (e.g. from cron): each run_check()
    call scans the source directory, updates per-file state persisted in a
    JSON file, and copies files whose size has been stable for
    ``completion_wait_seconds`` to the backup destination — always keeping
    the smaller of source vs. existing destination file.
    """

    def __init__(
        self,
        source_dir: str = "/mnt/NV2/tdarr-cache/nobara-pc-gpu-unmapped/temp",
        media_dir: str = "/mnt/NV2/tdarr-cache/nobara-pc-gpu-unmapped/media",
        dest_dir: str = "/mnt/NV2/tdarr-cache/manual-backup",
        state_file: str = "/mnt/NV2/Development/claude-home/logs/tdarr_file_monitor_state.json",
        completion_wait_seconds: int = 60,
        log_file: str = "/mnt/NV2/Development/claude-home/logs/tdarr_file_monitor.log"
    ):
        """Initialize the monitor.

        Args:
            source_dir: Tdarr temp/cache tree scanned for in-progress .mkv files.
            media_dir: Tree searched for a same-name, same-size counterpart;
                a temp file is only tracked once such a pair exists.
            dest_dir: Backup destination; created if missing.
            state_file: JSON file persisting FileState entries between runs.
            completion_wait_seconds: How long a file's size must be unchanged
                before it is considered complete.
            log_file: Path for the file handler of the root logging config.
        """
        self.source_dir = Path(source_dir)
        self.media_dir = Path(media_dir)
        self.dest_dir = Path(dest_dir)
        self.state_file = Path(state_file)
        self.completion_wait_seconds = completion_wait_seconds
        # Keyed by str(path); survives across runs via _load_state/_save_state.
        self.monitored_files: Dict[str, FileState] = {}

        # Setup logging
        # NOTE(review): basicConfig mutates the root logger — acceptable for a
        # standalone cron script, but would clash if imported as a library.
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler(log_file),
                logging.StreamHandler()
            ]
        )
        self.logger = logging.getLogger(f'{__name__}.TdarrFileMonitor')

        # Ensure destination directory exists
        self.dest_dir.mkdir(parents=True, exist_ok=True)

        # Load previous state
        self._load_state()

    def _load_state(self) -> None:
        """Load monitored files state from disk.

        On any failure (corrupt/missing JSON) the state is reset to empty so a
        bad state file cannot wedge subsequent runs.
        """
        if self.state_file.exists():
            try:
                with open(self.state_file, 'r') as f:
                    data = json.load(f)
                # Rehydrate dataclasses from the plain-dict JSON representation.
                self.monitored_files = {
                    path: FileState(**file_data)
                    for path, file_data in data.items()
                }
                self.logger.info(f"Loaded state for {len(self.monitored_files)} monitored files")
            except Exception as e:
                self.logger.error(f"Failed to load state file: {e}")
                self.monitored_files = {}

    def _save_state(self) -> None:
        """Save monitored files state to disk (best-effort; errors are logged)."""
        try:
            with open(self.state_file, 'w') as f:
                data = {path: asdict(state) for path, state in self.monitored_files.items()}
                json.dump(data, f, indent=2)
        except Exception as e:
            self.logger.error(f"Failed to save state file: {e}")

    def _scan_for_mkv_files(self) -> Dict[str, Path]:
        """Scan source directory for .mkv files in all subdirectories.

        Returns:
            Mapping of str(path) -> Path for every regular .mkv file found;
            empty on scan error (error is logged, not raised).
        """
        mkv_files = {}
        try:
            for mkv_file in self.source_dir.rglob("*.mkv"):
                if mkv_file.is_file():
                    mkv_files[str(mkv_file)] = mkv_file
        except Exception as e:
            self.logger.error(f"Error scanning source directory: {e}")

        return mkv_files

    def _get_file_info(self, file_path: Path) -> Optional[tuple]:
        """Get file size and modification time, return None if file doesn't exist or can't be accessed.

        Returns:
            (st_size, st_mtime) tuple, or None on OSError/FileNotFoundError.
        """
        try:
            stat = file_path.stat()
            return stat.st_size, stat.st_mtime
        except (OSError, FileNotFoundError) as e:
            self.logger.warning(f"Cannot access file {file_path}: {e}")
            return None

    def _validate_file_pair(self, temp_file_path: Path, temp_file_size: int) -> bool:
        """Validate that a matching file exists in media directory with exact same name and size.

        Guards against tracking stray temp files: only files that Tdarr has
        mirrored (same name AND byte size) into the media tree are monitored.

        Args:
            temp_file_path: Candidate file found in the temp tree.
            temp_file_size: Its current size in bytes.

        Returns:
            True if a same-name, same-size counterpart exists under media_dir.
        """
        try:
            # Search for matching file in media directory tree
            for media_file in self.media_dir.rglob(temp_file_path.name):
                if media_file.is_file():
                    media_file_info = self._get_file_info(media_file)
                    if media_file_info:
                        media_size, _ = media_file_info
                        if media_size == temp_file_size:
                            self.logger.debug(f"Found matching file: {temp_file_path.name} ({temp_file_size:,} bytes) in temp and media directories")
                            return True
                        else:
                            self.logger.debug(f"Size mismatch for {temp_file_path.name}: temp={temp_file_size:,}, media={media_size:,}")

            # No matching file found
            self.logger.info(f"No matching file found in media directory for {temp_file_path.name} ({temp_file_size:,} bytes)")
            return False

        except Exception as e:
            self.logger.error(f"Error validating file pair for {temp_file_path.name}: {e}")
            return False

    def _is_file_complete(self, file_state: FileState, current_time: float) -> bool:
        """Check if file is complete based on size stability.

        Complete == size unchanged for at least completion_wait_seconds.
        """
        stale_time = current_time - file_state.last_size_change
        return stale_time >= self.completion_wait_seconds

    def _should_copy_file(self, source_path: Path, dest_path: Path) -> bool:
        """Determine if we should copy the file (always keep smaller version).

        A strictly smaller source replaces the destination; equal or larger
        sources are skipped (the existing backup is assumed better-compressed).
        """
        if not dest_path.exists():
            return True

        source_size = source_path.stat().st_size
        dest_size = dest_path.stat().st_size

        if source_size < dest_size:
            self.logger.info(f"Source file {source_path.name} ({source_size:,} bytes) is smaller than existing destination ({dest_size:,} bytes), will replace")
            return True
        else:
            self.logger.info(f"Source file {source_path.name} ({source_size:,} bytes) is not smaller than existing destination ({dest_size:,} bytes), skipping")
            return False

    def _copy_file_with_retry(self, source_path: Path, dest_path: Path) -> bool:
        """Copy file with retry logic and cleanup on failure.

        Copies to a ``.tmp`` sibling first, verifies byte size, then renames
        over the final destination so a partial copy is never left in place
        under the real name. Tries at most twice with a 5s pause between.

        Returns:
            True on verified copy, False after both attempts fail.
        """
        temp_dest = dest_path.with_suffix(dest_path.suffix + '.tmp')

        for attempt in range(2):  # Try twice
            try:
                start_time = time.time()
                self.logger.info(f"Attempt {attempt + 1}: Copying {source_path.name} ({source_path.stat().st_size:,} bytes)")

                # Copy to temporary file first
                shutil.copy2(source_path, temp_dest)

                # Verify copy completed successfully
                if temp_dest.stat().st_size != source_path.stat().st_size:
                    raise Exception(f"Copy verification failed: size mismatch")

                # Move temp file to final destination
                if dest_path.exists():
                    dest_path.unlink()  # Remove existing file
                temp_dest.rename(dest_path)

                copy_time = time.time() - start_time
                final_size = dest_path.stat().st_size

                self.logger.info(f"Successfully copied {source_path.name} ({final_size:,} bytes) in {copy_time:.2f}s")
                return True

            except Exception as e:
                self.logger.error(f"Copy attempt {attempt + 1} failed for {source_path.name}: {e}")

                # Cleanup temporary file if it exists
                if temp_dest.exists():
                    try:
                        temp_dest.unlink()
                    except Exception as cleanup_error:
                        self.logger.error(f"Failed to cleanup temp file {temp_dest}: {cleanup_error}")

                if attempt == 1:  # Last attempt failed
                    self.logger.error(f"All copy attempts failed for {source_path.name}, giving up")
                    return False
                else:
                    time.sleep(5)  # Wait before retry

        return False

    def run_check(self) -> None:
        """Run a single monitoring check cycle.

        Cycle: scan -> prune vanished files -> update/track file states ->
        copy size-stable files -> persist state. Intended to be called once
        per cron invocation.
        """
        current_time = time.time()
        self.logger.info("Starting monitoring check cycle")

        # Scan for current .mkv files
        current_files = self._scan_for_mkv_files()
        self.logger.info(f"Found {len(current_files)} .mkv files in source directory")

        # Remove files from monitoring that no longer exist
        missing_files = set(self.monitored_files.keys()) - set(current_files.keys())
        for missing_file in missing_files:
            self.logger.info(f"File no longer exists, removing from monitoring: {Path(missing_file).name}")
            del self.monitored_files[missing_file]

        # Process each current file
        files_to_copy = []
        for file_path_str, file_path in current_files.items():
            file_info = self._get_file_info(file_path)
            if not file_info:
                continue

            current_size, current_mtime = file_info

            # Update or create file state
            if file_path_str in self.monitored_files:
                file_state = self.monitored_files[file_path_str]
                file_state.check_count += 1

                # Check if size changed
                if current_size != file_state.size:
                    file_state.size = current_size
                    # Any growth resets the completion (stability) timer.
                    file_state.last_size_change = current_time
                    self.logger.debug(f"Size changed for {file_path.name}: {current_size:,} bytes")

                file_state.last_modified = current_mtime

            else:
                # New file discovered - validate before tracking
                if not self._validate_file_pair(file_path, current_size):
                    # File doesn't have a matching pair in media directory, skip tracking
                    continue

                file_state = FileState(
                    path=file_path_str,
                    size=current_size,
                    last_modified=current_mtime,
                    first_seen=current_time,
                    last_size_change=current_time,
                    check_count=1
                )
                self.monitored_files[file_path_str] = file_state
                self.logger.info(f"Started monitoring validated file: {file_path.name} ({current_size:,} bytes)")

            # Log current state
            stale_time = current_time - file_state.last_size_change
            self.logger.info(f"Checking {file_path.name}: {current_size:,} bytes, stale for {stale_time:.1f}s (checks: {file_state.check_count})")

            # Check if file is complete
            if self._is_file_complete(file_state, current_time):
                dest_path = self.dest_dir / file_path.name
                if self._should_copy_file(file_path, dest_path):
                    # Defer copies until after the scan loop completes.
                    files_to_copy.append((file_path, dest_path, file_state))

        # Copy completed files
        for source_path, dest_path, file_state in files_to_copy:
            self.logger.info(f"File appears complete: {source_path.name} (stable for {current_time - file_state.last_size_change:.1f}s)")

            if self._copy_file_with_retry(source_path, dest_path):
                # Remove from monitoring after successful copy
                del self.monitored_files[str(source_path)]
                self.logger.info(f"Successfully processed and removed from monitoring: {source_path.name}")
            else:
                # Left in monitored_files so the next cycle retries the copy.
                self.logger.error(f"Failed to copy {source_path.name}, will continue monitoring")

        # Save state
        self._save_state()

        self.logger.info(f"Check cycle completed, monitoring {len(self.monitored_files)} files")
|
||||
|
||||
|
||||
def main():
    """Main entry point for the script.

    Builds a monitor with the default paths and performs exactly one
    scan/copy cycle, matching the once-per-minute cron invocation model.
    """
    TdarrFileMonitor().run_check()
|
||||
|
||||
|
||||
# Allow direct execution (e.g. from the cron wrapper script).
if __name__ == "__main__":
    main()
|
||||
File diff suppressed because it is too large
Load Diff
@ -3,6 +3,15 @@
|
||||
## Overview
|
||||
Virtual machine management for home lab environments with focus on automated provisioning, infrastructure as code, and security-first configuration. This context covers VM lifecycle management, Proxmox integration, and standardized deployment patterns.
|
||||
|
||||
## Proxmox Host
|
||||
- **Version**: PVE 8.4.16 (upgraded from 7.4-20 on 2026-02-19)
|
||||
- **Kernel**: 6.8.12-18-pve
|
||||
- **IP**: 10.10.0.11
|
||||
- **SSH**: `ssh -i ~/.ssh/homelab_rsa root@10.10.0.11`
|
||||
- **Storage**: local (100GB dir), local-lvm (2.3TB thin), home-truenas (17TB CIFS at 10.10.0.35)
|
||||
- **Networking**: vmbr0 (10.10.0.x/24 via eno1), vmbr1 (10.0.0.x/24 via eno2, Matter/IoT)
|
||||
- **Upgrade plan**: Phase 2 (PVE 8→9) pending — see `proxmox-upgrades/proxmox-7-to-9-upgrade-plan.md`
|
||||
|
||||
## Architecture Patterns
|
||||
|
||||
### Infrastructure as Code (IaC) Approach
|
||||
|
||||
@ -2,18 +2,26 @@
|
||||
|
||||
## Executive Summary
|
||||
|
||||
**Current State**: Proxmox VE 7.1-7 (kernel 5.13.19-2-pve)
|
||||
**Current State**: Proxmox VE 8.4.16 (kernel 6.8.12-18-pve) — Phase 1 complete
|
||||
**Target State**: Proxmox VE 9.1 (latest)
|
||||
**Upgrade Path**: Two-phase upgrade (7→8→9) - direct upgrade not supported
|
||||
**Total Timeline**: 3-4 weeks (including stabilization periods)
|
||||
**Total Downtime**: ~4 hours planned (2 hours per phase); Phase 1 actual was ~45 minutes
|
||||
|
||||
### Phase 1 Status: COMPLETED (2026-02-19)
|
||||
- Upgraded from PVE 7.4-20 → PVE 8.4.16
|
||||
- Kernel: 5.13.19-2-pve → 6.8.12-18-pve
|
||||
- Total downtime: ~45 minutes (upgrade + reboot + service startup)
|
||||
- All services validated and running
|
||||
- Stabilization period: monitoring through early March 2026
|
||||
|
||||
## Infrastructure Overview
|
||||
|
||||
**Production Services** (8 LXC + 17 VMs):
|
||||
- **Critical**: Paper Dynasty/Major Domo (VMs 115, 110), Gitea (LXC 225), n8n (LXC 210), Home Assistant (VM 109)
|
||||
- **Important**: Media services (Plex 107, Tdarr 113, arr-stack 221), OpenClaw (224), Databases (112)
|
||||
- **Lower Priority**: Game servers, development containers
|
||||
**Production Services** (7 LXC + 7 VMs) — cleaned up 2026-02-19:
|
||||
- **Critical**: Paper Dynasty/Major Domo (VM 115), Discord bots (VM 110), Gitea (LXC 225), n8n (LXC 210), Home Assistant (VM 109), Databases (VM 112), docker-home/Pi-hole 1 (VM 106)
|
||||
- **Important**: Claude Discord Coordinator (LXC 301), arr-stack (LXC 221), Uptime Kuma (LXC 227), Foundry VTT (LXC 223), Memos (LXC 222)
|
||||
- **Stopped/Investigate**: docker-vpn (VM 105, decommissioning), docker-home-servers (VM 116, needs investigation)
|
||||
- **Removed (2026-02-19)**: 108 (ansible), 224 (openclaw), 300 (openclaw-migrated), 101/102/104/111/211 (game servers), 107 (plex), 113 (tdarr - moved to .226), 114 (duplicate arr-stack), 117 (unused), 100/103 (old templates)
|
||||
|
||||
**Key Constraints**:
|
||||
- Home Assistant VM 109 requires dual network (vmbr1 for Matter support)
|
||||
@ -29,26 +37,33 @@
|
||||
|
||||
#### 1. Comprehensive Backups
|
||||
|
||||
**Priority 1 - Production Services**:
|
||||
**All production guests** (14 total after cleanup — 12 backed up below, plus 2 optional stopped guests):
|
||||
```bash
|
||||
# Backup critical services to TrueNAS
|
||||
vzdump 210 --mode snapshot --dumpdir /mnt/truenas/proxmox --compress zstd # n8n
|
||||
vzdump 115 --mode snapshot --dumpdir /mnt/truenas/proxmox --compress zstd # docker-sba
|
||||
vzdump 112 --mode snapshot --dumpdir /mnt/truenas/proxmox --compress zstd # databases
|
||||
vzdump 110 --mode snapshot --dumpdir /mnt/truenas/proxmox --compress zstd # discord-bots
|
||||
vzdump 225 --mode snapshot --dumpdir /mnt/truenas/proxmox --compress zstd # gitea
|
||||
vzdump 109 --mode snapshot --dumpdir /mnt/truenas/proxmox --compress zstd # homeassistant
|
||||
```
|
||||
|
||||
**Priority 2 - All Remaining VMs/LXCs**:
|
||||
```bash
|
||||
vzdump --all --mode snapshot --dumpdir /mnt/truenas/proxmox --compress zstd
|
||||
# Backup all to TrueNAS (PVE storage: home-truenas, mount: /mnt/pve/home-truenas)
|
||||
# VMs
|
||||
vzdump 106 --mode snapshot --storage home-truenas --compress zstd # docker-home (pihole1, NPM)
|
||||
vzdump 109 --mode snapshot --storage home-truenas --compress zstd # homeassistant
|
||||
vzdump 110 --mode snapshot --storage home-truenas --compress zstd # discord-bots
|
||||
vzdump 112 --mode snapshot --storage home-truenas --compress zstd # databases
|
||||
vzdump 115 --mode snapshot --storage home-truenas --compress zstd # docker-sba (Paper Dynasty)
|
||||
# LXCs
|
||||
vzdump 210 --mode snapshot --storage home-truenas --compress zstd # n8n
|
||||
vzdump 221 --mode snapshot --storage home-truenas --compress zstd # arr-stack
|
||||
vzdump 222 --mode snapshot --storage home-truenas --compress zstd # memos
|
||||
vzdump 223 --mode snapshot --storage home-truenas --compress zstd # foundry
|
||||
vzdump 225 --mode snapshot --storage home-truenas --compress zstd # gitea
|
||||
vzdump 227 --mode snapshot --storage home-truenas --compress zstd # uptime-kuma
|
||||
vzdump 301 --mode snapshot --storage home-truenas --compress zstd # claude-discord-coordinator
|
||||
# Optional (stopped/investigate)
|
||||
# vzdump 105 --mode snapshot --storage home-truenas --compress zstd # docker-vpn (decommissioning)
|
||||
# vzdump 116 --mode snapshot --storage home-truenas --compress zstd # docker-home-servers (investigate)
|
||||
```
|
||||
|
||||
**Backup Proxmox Configuration**:
|
||||
```bash
|
||||
tar -czf /mnt/truenas/proxmox/pve-config-$(date +%Y%m%d).tar.gz /etc/pve/
|
||||
cp /etc/network/interfaces /mnt/truenas/proxmox/interfaces.backup
|
||||
# Already completed 2026-02-19 — refresh before upgrade
|
||||
tar -czf /mnt/pve/home-truenas/dump/pve-config/pve-config-$(date +%Y%m%d).tar.gz /etc/pve/
|
||||
cp /etc/network/interfaces /mnt/pve/home-truenas/dump/pve-config/interfaces.backup.$(date +%Y%m%d)
|
||||
```
|
||||
|
||||
**Expected**: 2-4 hours, ~500GB-1TB storage required
|
||||
@ -120,24 +135,34 @@ pvesm status
|
||||
|
||||
### Post-Upgrade Validation
|
||||
|
||||
**Start Services in Dependency Order**:
|
||||
**Start Services in Dependency Order** (stagger with 30s delays per Phase 1 lessons):
|
||||
```bash
|
||||
# Databases first
|
||||
pvesh create /nodes/proxmox/qemu/112/status/start
|
||||
pvesh create /nodes/proxmox/qemu/112/status/start # databases-bots
|
||||
sleep 30
|
||||
|
||||
# Infrastructure
|
||||
# Infrastructure + DNS
|
||||
pvesh create /nodes/proxmox/qemu/106/status/start # docker-home (pihole1, NPM)
|
||||
pvesh create /nodes/proxmox/lxc/225/status/start # gitea
|
||||
pvesh create /nodes/proxmox/lxc/210/status/start # n8n
|
||||
pvesh create /nodes/proxmox/lxc/227/status/start # uptime-kuma
|
||||
sleep 30
|
||||
|
||||
# Applications
|
||||
pvesh create /nodes/proxmox/qemu/115/status/start # docker-sba (Paper Dynasty)
|
||||
pvesh create /nodes/proxmox/qemu/110/status/start # discord-bots
|
||||
pvesh create /nodes/proxmox/lxc/224/status/start # openclaw
|
||||
pvesh create /nodes/proxmox/lxc/301/status/start # claude-discord-coordinator
|
||||
sleep 30
|
||||
|
||||
# Restart Pi-hole container proactively (UDP DNS fix from Phase 1)
|
||||
qm guest exec 106 -- docker restart pihole
|
||||
sleep 10
|
||||
|
||||
# Media & Others
|
||||
pvesh create /nodes/proxmox/qemu/109/status/start # homeassistant
|
||||
pvesh create /nodes/proxmox/qemu/107/status/start # plex
|
||||
pvesh create /nodes/proxmox/lxc/221/status/start # arr-stack
|
||||
pvesh create /nodes/proxmox/lxc/222/status/start # memos
|
||||
pvesh create /nodes/proxmox/lxc/223/status/start # foundry-lxc
|
||||
```
|
||||
|
||||
**Service Validation Checklist**:
|
||||
@ -159,6 +184,30 @@ Monitor for:
|
||||
- Service uptime
|
||||
- Error logs
|
||||
|
||||
### Phase 1 Lessons Learned (2026-02-19)
|
||||
|
||||
**Issues encountered:**
|
||||
1. **I/O storm on boot**: All 15 guests starting simultaneously caused massive I/O delay (~50% for several minutes). Consider staggering guest startup with delays between groups.
|
||||
2. **Pi-hole 1 UDP DNS failed after boot**: Docker iptables NAT rules weren't fully set up. Required container restart. TCP DNS worked immediately — only UDP was affected.
|
||||
3. **Home Assistant IP changed**: HA on VM 109 got a new DHCP address (10.10.0.215 instead of previous). Need DHCP reservation to prevent this.
|
||||
4. **Local machine DNS failover**: Desktop was configured with only one Pi-hole DNS server (10.10.0.226). When Proxmox guests were shut down, Pi-hole on physical server at .226 should have kept working but didn't resolve initially. Added both Pi-holes as DNS servers.
|
||||
5. **Some VMs ignored ACPI shutdown**: VMs 105 and 112 required `--forceStop` flag.
|
||||
6. **Several guests had onboot=1**: Many guests auto-started before we could bring them up in dependency order. Not harmful but unexpected.
|
||||
|
||||
**What went well:**
|
||||
- `pve7to8 --full` checker caught everything — zero surprises during upgrade
|
||||
- `DEBIAN_FRONTEND=noninteractive apt dist-upgrade -y -o Dpkg::Options::='--force-confnew'` worked cleanly
|
||||
- Reboot took ~4 minutes (longer than expected but completed without issues)
|
||||
- All backups on TrueNAS were intact and accessible post-upgrade
|
||||
- Local disk space dropped from 57% to 14% after upgrade (old kernel/packages cleaned up)
|
||||
|
||||
**Recommendations for Phase 2:**
|
||||
- Stagger guest startup: add `sleep 30` between dependency groups
|
||||
- Restart Pi-hole Docker container proactively after boot
|
||||
- Set DHCP reservation for HA VM before Phase 2
|
||||
- Switch local DNS to public resolvers (1.1.1.1) before shutting down guests
|
||||
- Disable onboot for all guests before upgrade, re-enable after validation
|
||||
|
||||
---
|
||||
|
||||
## Phase 2: Proxmox 8.4 → 9.1 Upgrade
|
||||
@ -169,25 +218,29 @@ Monitor for:
|
||||
|
||||
```bash
|
||||
# Verify systemd version in each LXC (must be > 230)
|
||||
for ct in 108 210 211 221 222 223 224 225; do
|
||||
for ct in 210 221 222 223 225 227 301; do
|
||||
echo "=== LXC $ct ==="
|
||||
pct exec $ct -- systemd --version | head -1
|
||||
done
|
||||
```
|
||||
|
||||
**Action Required**: If any LXC shows systemd < 230:
|
||||
```bash
|
||||
pct enter <CTID>
|
||||
apt update && apt dist-upgrade -y
|
||||
do-release-upgrade # Upgrade Ubuntu to compatible version
|
||||
```
|
||||
**Pre-verified 2026-02-19** (all pass, updated after cleanup):
|
||||
| LXC | Name | systemd | Status |
|
||||
|-----|------|---------|--------|
|
||||
| 210 | n8n | 245 | Pass |
|
||||
| 221 | arr-stack | 245 | Pass |
|
||||
| 222 | memos | 245 | Pass |
|
||||
| 223 | foundry | 245 | Pass |
|
||||
| 225 | gitea | 245 | Pass |
|
||||
| 227 | uptime-kuma | 249 | Pass |
|
||||
| 301 | claude-discord-coord | 249 | Pass |
|
||||
|
||||
**Expected**: All Ubuntu 20.04+ LXCs should be compatible (systemd 245+)
|
||||
**Expected**: All compatible. Re-verify before Phase 2 in case any LXC OS was changed.
|
||||
|
||||
#### 2. Fresh Backup Set
|
||||
```bash
|
||||
vzdump --all --mode snapshot --dumpdir /mnt/truenas/proxmox/pve9-upgrade --compress zstd
|
||||
tar -czf /mnt/truenas/proxmox/pve8-config-$(date +%Y%m%d).tar.gz /etc/pve/
|
||||
vzdump --all --mode snapshot --storage home-truenas --compress zstd
|
||||
tar -czf /mnt/pve/home-truenas/dump/pve-config/pve8-config-$(date +%Y%m%d).tar.gz /etc/pve/
|
||||
```
|
||||
|
||||
#### 3. Run PVE 8-to-9 Checker
|
||||
@ -350,7 +403,27 @@ pvesh get /cluster/resources
|
||||
|
||||
## Verification Checklist
|
||||
|
||||
After each upgrade phase:
|
||||
### Phase 1 (PVE 7→8) — Completed 2026-02-19
|
||||
|
||||
- [x] Proxmox version correct: pve-manager/8.4.16
|
||||
- [x] Kernel version updated: 6.8.12-18-pve
|
||||
- [x] All PVE services running (pve-cluster, pvedaemon, pveproxy, pvestatd)
|
||||
- [x] Storage accessible: local, local-lvm, home-truenas all active
|
||||
- [x] Network functional
|
||||
- [x] All VMs/LXCs visible in UI
|
||||
- [x] Critical VMs/LXCs started successfully
|
||||
- [x] Discord bots responding (confirmed on .88)
|
||||
- [x] Databases accessible (VM 112 running)
|
||||
- [x] n8n workflows — HTTP 200
|
||||
- [x] Gitea accessible — HTTP 200
|
||||
- [x] Home Assistant functional — HTTP 200 (new IP: 10.10.0.215)
|
||||
- [x] Jellyfin streaming — HTTP 302
|
||||
- [x] Uptime Kuma — HTTP 302
|
||||
- [x] Pi-hole 1 DNS resolving (after container restart)
|
||||
- [x] Pi-hole 2 DNS resolving
|
||||
- [x] Web UI functional — HTTP 200
|
||||
|
||||
### Phase 2 (PVE 8→9) — Pending
|
||||
|
||||
- [ ] Proxmox version correct (`pveversion`)
|
||||
- [ ] Kernel version updated (`uname -r`)
|
||||
|
||||
Loading…
Reference in New Issue
Block a user