diff --git a/CLAUDE.md b/CLAUDE.md index 587f098..1285f44 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -7,7 +7,7 @@ - Prefer editing existing files over creating new ones - After complex tasks, prompt to save learnings to cognitive memory - At session end, ask: "Should I update our documentation?" -- At 25% context remaining, ask: "Should I update docs before we lose context?" +- At 25% context remaining, automatically run `/save-memories` before compaction loses detail ## Context Loading When a topic comes up, load `{tech}/CONTEXT.md` + `{tech}/troubleshooting.md`. For scripts, also load `{tech}/scripts/CONTEXT.md`. diff --git a/monitoring/scripts/tdarr-file-monitor-cron.sh b/monitoring/scripts/tdarr-file-monitor-cron.sh deleted file mode 100755 index 2faacbe..0000000 --- a/monitoring/scripts/tdarr-file-monitor-cron.sh +++ /dev/null @@ -1,6 +0,0 @@ -#!/bin/bash -# Cron job wrapper for Tdarr file monitor -# Add this to crontab with: * * * * * /mnt/NV2/Development/claude-home/monitoring/scripts/tdarr-file-monitor-cron.sh - -cd /mnt/NV2/Development/claude-home/monitoring/scripts -/usr/bin/python3 /mnt/NV2/Development/claude-home/monitoring/scripts/tdarr_file_monitor.py \ No newline at end of file diff --git a/monitoring/scripts/tdarr_file_monitor.py b/monitoring/scripts/tdarr_file_monitor.py deleted file mode 100755 index e789408..0000000 --- a/monitoring/scripts/tdarr_file_monitor.py +++ /dev/null @@ -1,286 +0,0 @@ -#!/usr/bin/env python3 -""" -Tdarr File Monitor - Monitors Tdarr cache directory for completed .mkv files and copies them to backup location. -Detects file completion by monitoring size changes and always keeps the smallest version of duplicate files. -""" - -import os -import shutil -import json -import time -import logging -from pathlib import Path -from dataclasses import dataclass, asdict -from typing import Dict, Optional -from datetime import datetime, timedelta - - -@dataclass -class FileState: - """Tracks the state of a monitored file.""" - path: str - size: int - last_modified: float - first_seen: float - last_size_change: float - check_count: int = 0 - - -class TdarrFileMonitor: - """Monitors Tdarr cache directory for completed .mkv files.""" - - def __init__( - self, - source_dir: str = "/mnt/NV2/tdarr-cache/nobara-pc-gpu-unmapped/temp", - media_dir: str = "/mnt/NV2/tdarr-cache/nobara-pc-gpu-unmapped/media", - dest_dir: str = "/mnt/NV2/tdarr-cache/manual-backup", - state_file: str = "/mnt/NV2/Development/claude-home/logs/tdarr_file_monitor_state.json", - completion_wait_seconds: int = 60, - log_file: str = "/mnt/NV2/Development/claude-home/logs/tdarr_file_monitor.log" - ): - self.source_dir = Path(source_dir) - self.media_dir = Path(media_dir) - self.dest_dir = Path(dest_dir) - self.state_file = Path(state_file) - self.completion_wait_seconds = completion_wait_seconds - self.monitored_files: Dict[str, FileState] = {} - - # Setup logging - logging.basicConfig( - level=logging.INFO, - format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', - handlers=[ - logging.FileHandler(log_file), - logging.StreamHandler() - ] - ) - self.logger = logging.getLogger(f'{__name__}.TdarrFileMonitor') - - # Ensure destination directory exists - self.dest_dir.mkdir(parents=True, exist_ok=True) - - # Load previous state - self._load_state() - - def _load_state(self) -> None: - """Load monitored files state from disk.""" - if self.state_file.exists(): - try: - with open(self.state_file, 'r') as f: - data = json.load(f) - self.monitored_files = { - path: FileState(**file_data) - for path, file_data in data.items() - } - self.logger.info(f"Loaded state for {len(self.monitored_files)} monitored files") - except Exception as e: - self.logger.error(f"Failed to load state file: {e}") - self.monitored_files = {} - - def _save_state(self) -> None: - """Save monitored files state to disk.""" - try: - with open(self.state_file, 'w') as f: - data = {path: asdict(state) for path, state in self.monitored_files.items()} - json.dump(data, f, indent=2) - except Exception as e: - self.logger.error(f"Failed to save state file: {e}") - - def _scan_for_mkv_files(self) -> Dict[str, Path]: - """Scan source directory for .mkv files in all subdirectories.""" - mkv_files = {} - try: - for mkv_file in self.source_dir.rglob("*.mkv"): - if mkv_file.is_file(): - mkv_files[str(mkv_file)] = mkv_file - except Exception as e: - self.logger.error(f"Error scanning source directory: {e}") - - return mkv_files - - def _get_file_info(self, file_path: Path) -> Optional[tuple]: - """Get file size and modification time, return None if file doesn't exist or can't be accessed.""" - try: - stat = file_path.stat() - return stat.st_size, stat.st_mtime - except (OSError, FileNotFoundError) as e: - self.logger.warning(f"Cannot access file {file_path}: {e}") - return None - - def _validate_file_pair(self, temp_file_path: Path, temp_file_size: int) -> bool: - """Validate that a matching file exists in media directory with exact same name and size.""" - try: - # Search for matching file in media directory tree - for media_file in self.media_dir.rglob(temp_file_path.name): - if media_file.is_file(): - media_file_info = self._get_file_info(media_file) - if media_file_info: - media_size, _ = media_file_info - if media_size == temp_file_size: - self.logger.debug(f"Found matching file: {temp_file_path.name} ({temp_file_size:,} bytes) in temp and media directories") - return True - else: - self.logger.debug(f"Size mismatch for {temp_file_path.name}: temp={temp_file_size:,}, media={media_size:,}") - - # No matching file found - self.logger.info(f"No matching file found in media directory for {temp_file_path.name} ({temp_file_size:,} bytes)") - return False - - except Exception as e: - self.logger.error(f"Error validating file pair for {temp_file_path.name}: {e}") - return False - - def _is_file_complete(self, file_state: FileState, current_time: float) -> bool: - """Check if file is complete based on size stability.""" - stale_time = current_time - file_state.last_size_change - return stale_time >= self.completion_wait_seconds - - def _should_copy_file(self, source_path: Path, dest_path: Path) -> bool: - """Determine if we should copy the file (always keep smaller version).""" - if not dest_path.exists(): - return True - - source_size = source_path.stat().st_size - dest_size = dest_path.stat().st_size - - if source_size < dest_size: - self.logger.info(f"Source file {source_path.name} ({source_size:,} bytes) is smaller than existing destination ({dest_size:,} bytes), will replace") - return True - else: - self.logger.info(f"Source file {source_path.name} ({source_size:,} bytes) is not smaller than existing destination ({dest_size:,} bytes), skipping") - return False - - def _copy_file_with_retry(self, source_path: Path, dest_path: Path) -> bool: - """Copy file with retry logic and cleanup on failure.""" - temp_dest = dest_path.with_suffix(dest_path.suffix + '.tmp') - - for attempt in range(2): # Try twice - try: - start_time = time.time() - self.logger.info(f"Attempt {attempt + 1}: Copying {source_path.name} ({source_path.stat().st_size:,} bytes)") - - # Copy to temporary file first - shutil.copy2(source_path, temp_dest) - - # Verify copy completed successfully - if temp_dest.stat().st_size != source_path.stat().st_size: - raise Exception(f"Copy verification failed: size mismatch") - - # Move temp file to final destination - if dest_path.exists(): - dest_path.unlink() # Remove existing file - temp_dest.rename(dest_path) - - copy_time = time.time() - start_time - final_size = dest_path.stat().st_size - - self.logger.info(f"Successfully copied {source_path.name} ({final_size:,} bytes) in {copy_time:.2f}s") - return True - - except Exception as e: - self.logger.error(f"Copy attempt {attempt + 1} failed for {source_path.name}: {e}") - - # Cleanup temporary file if it exists - if temp_dest.exists(): - try: - temp_dest.unlink() - except Exception as cleanup_error: - self.logger.error(f"Failed to cleanup temp file {temp_dest}: {cleanup_error}") - - if attempt == 1: # Last attempt failed - self.logger.error(f"All copy attempts failed for {source_path.name}, giving up") - return False - else: - time.sleep(5) # Wait before retry - - return False - - def run_check(self) -> None: - """Run a single monitoring check cycle.""" - current_time = time.time() - self.logger.info("Starting monitoring check cycle") - - # Scan for current .mkv files - current_files = self._scan_for_mkv_files() - self.logger.info(f"Found {len(current_files)} .mkv files in source directory") - - # Remove files from monitoring that no longer exist - missing_files = set(self.monitored_files.keys()) - set(current_files.keys()) - for missing_file in missing_files: - self.logger.info(f"File no longer exists, removing from monitoring: {Path(missing_file).name}") - del self.monitored_files[missing_file] - - # Process each current file - files_to_copy = [] - for file_path_str, file_path in current_files.items(): - file_info = self._get_file_info(file_path) - if not file_info: - continue - - current_size, current_mtime = file_info - - # Update or create file state - if file_path_str in self.monitored_files: - file_state = self.monitored_files[file_path_str] - file_state.check_count += 1 - - # Check if size changed - if current_size != file_state.size: - file_state.size = current_size - file_state.last_size_change = current_time - self.logger.debug(f"Size changed for {file_path.name}: {current_size:,} bytes") - - file_state.last_modified = current_mtime - - else: - # New file discovered - validate before tracking - if not self._validate_file_pair(file_path, current_size): - # File doesn't have a matching pair in media directory, skip tracking - continue - - file_state = FileState( - path=file_path_str, - size=current_size, - last_modified=current_mtime, - first_seen=current_time, - last_size_change=current_time, - check_count=1 - ) - self.monitored_files[file_path_str] = file_state - self.logger.info(f"Started monitoring validated file: {file_path.name} ({current_size:,} bytes)") - - # Log current state - stale_time = current_time - file_state.last_size_change - self.logger.info(f"Checking {file_path.name}: {current_size:,} bytes, stale for {stale_time:.1f}s (checks: {file_state.check_count})") - - # Check if file is complete - if self._is_file_complete(file_state, current_time): - dest_path = self.dest_dir / file_path.name - if self._should_copy_file(file_path, dest_path): - files_to_copy.append((file_path, dest_path, file_state)) - - # Copy completed files - for source_path, dest_path, file_state in files_to_copy: - self.logger.info(f"File appears complete: {source_path.name} (stable for {current_time - file_state.last_size_change:.1f}s)") - - if self._copy_file_with_retry(source_path, dest_path): - # Remove from monitoring after successful copy - del self.monitored_files[str(source_path)] - self.logger.info(f"Successfully processed and removed from monitoring: {source_path.name}") - else: - self.logger.error(f"Failed to copy {source_path.name}, will continue monitoring") - - # Save state - self._save_state() - - self.logger.info(f"Check cycle completed, monitoring {len(self.monitored_files)} files") - - -def main(): - """Main entry point for the script.""" - monitor = TdarrFileMonitor() - monitor.run_check() - - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/monitoring/scripts/tdarr_monitor.py b/monitoring/scripts/tdarr_monitor.py deleted file mode 100755 index ad546d1..0000000 --- a/monitoring/scripts/tdarr_monitor.py +++ /dev/null @@ -1,1227 +0,0 @@ -#!/usr/bin/env python3 -""" -Tdarr API Monitoring Script with Stuck Job Detection and Discord Alerts - -Monitors Tdarr server via its web API endpoints: -- Server status and health -- Queue status and statistics -- Node status and performance -- Library scan progress -- Worker activity -- Stuck job detection with configurable timeouts -- Discord notifications for alerts and status updates - -Usage: - python3 tdarr_monitor.py --server http://10.10.0.43:8265 --check all - python3 tdarr_monitor.py --server http://10.10.0.43:8265 --check queue - python3 tdarr_monitor.py --server http://10.10.0.43:8265 --check nodes - - # Enable stuck job detection (30 minute threshold) - python3 tdarr_monitor.py --server http://10.10.0.43:8265 --check nodes --detect-stuck - - # Custom stuck threshold (15 minutes) - python3 tdarr_monitor.py --server http://10.10.0.43:8265 --check all --detect-stuck --stuck-threshold 15 - - # Enable Discord alerts for stuck jobs (uses default webhook) - python3 tdarr_monitor.py --server http://10.10.0.43:8265 --check nodes --detect-stuck --discord-alerts - - # Automatically clear hung workers when detected - python3 tdarr_monitor.py --server http://10.10.0.43:8265 --check nodes --detect-stuck --clear-hung-workers - - # Full monitoring with automatic clearing and Discord alerts - python3 tdarr_monitor.py --server http://10.10.0.43:8265 --check nodes --detect-stuck --clear-hung-workers --discord-alerts - - # Test Discord integration (uses default webhook) - python3 tdarr_monitor.py --server http://10.10.0.43:8265 --discord-test - - # Enable file logging with custom path and debug level - python3 tdarr_monitor.py --server http://10.10.0.43:8265 --check nodes --log-file /tmp/tdarr_debug.log --log-level DEBUG - - # Disable file logging (console only) - python3 tdarr_monitor.py --server http://10.10.0.43:8265 --check nodes --no-log-file - - # Clear memory state - python3 tdarr_monitor.py --server http://10.10.0.43:8265 --clear-memory -""" - -import argparse -import json -import logging -import logging.handlers -import sys -import os -import pickle -from dataclasses import dataclass, asdict -from datetime import datetime, timedelta -from typing import Dict, List, Optional, Any, Union -import requests -from urllib.parse import urljoin - - - -@dataclass -class WorkerSnapshot: - worker_id: str - node_id: str - worker_type: str - file: str - percentage: float - status: str - fps: int - eta: str - timestamp: datetime - - -@dataclass -class StuckJob: - worker_snapshot: WorkerSnapshot - first_seen: datetime - stuck_duration_minutes: float - is_stuck: bool = True - - -@dataclass -class MemoryState: - worker_snapshots: Dict[str, WorkerSnapshot] - stuck_jobs: Dict[str, StuckJob] - last_updated: datetime - - -@dataclass -class ServerStatus: - timestamp: str - server_url: str - status: str - error: Optional[str] = None - version: Optional[str] = None - server_id: Optional[str] = None - uptime: Optional[str] = None - system_info: Optional[Dict[str, Any]] = None - - -@dataclass -class QueueStats: - total_files: int - queued: int - processing: int - completed: int - queue_items: List[Dict[str, Any]] - - -@dataclass -class QueueStatus: - timestamp: str - queue_stats: Optional[QueueStats] = None - error: Optional[str] = None - - -@dataclass -class NodeInfo: - id: Optional[str] - nodeName: Optional[str] - status: str - lastSeen: Optional[int] - version: Optional[str] - platform: Optional[str] - workers: Dict[str, int] - processing: List[Dict[str, Any]] - - -@dataclass -class NodeSummary: - total_nodes: int - online_nodes: int - offline_nodes: int - online_details: List[NodeInfo] - offline_details: List[NodeInfo] - - -@dataclass -class NodeStatus: - timestamp: str - nodes: List[Dict[str, Any]] - node_summary: Optional[NodeSummary] = None - stuck_jobs: List[StuckJob] = None - error: Optional[str] = None - - -@dataclass -class LibraryInfo: - name: Optional[str] - path: Optional[str] - file_count: int - scan_progress: int - last_scan: Optional[str] - is_scanning: bool - - -@dataclass -class ScanStatus: - total_libraries: int - total_files: int - scanning_libraries: int - - -@dataclass -class LibraryStatus: - timestamp: str - libraries: List[LibraryInfo] - scan_status: Optional[ScanStatus] = None - error: Optional[str] = None - - -@dataclass -class Statistics: - total_transcodes: int - space_saved: int - total_files_processed: int - failed_transcodes: int - processing_speed: int - eta: Optional[str] - - -@dataclass -class StatisticsStatus: - timestamp: str - statistics: Optional[Statistics] = None - error: Optional[str] = None - - -@dataclass -class HealthCheck: - status: str - healthy: bool - online_count: Optional[int] = None - total_count: Optional[int] = None - accessible: Optional[bool] = None - total_items: Optional[int] = None - - -@dataclass -class HealthStatus: - timestamp: str - overall_status: str - checks: Dict[str, HealthCheck] - - -@dataclass -class DiscordEmbedField: - name: str - value: str - inline: bool = False - - -@dataclass -class DiscordEmbed: - title: str - description: str - color: int - fields: List[DiscordEmbedField] = None - timestamp: str = None - - def __post_init__(self): - if self.fields is None: - self.fields = [] - if self.timestamp is None: - self.timestamp = datetime.now().isoformat() - - -class DiscordNotifier: - def __init__(self, webhook_url: str, timeout: int = 10): - """Initialize Discord notifier with webhook URL.""" - self.webhook_url = webhook_url - self.timeout = timeout - self.session = requests.Session() - self.logger = logging.getLogger(f"{__name__}.DiscordNotifier") - - def send_content_message(self, content: str, username: str = "Tdarr Monitor") -> bool: - """Send a simple content message to Discord. - - Args: - content: The message content to send - username: Bot username to display - - Returns: - True if successful, False otherwise - """ - payload = { - "content": content, - "username": username - } - - return self._send_webhook(payload) - - def send_embed_message(self, - title: str, - description: str, - color: int = 0xff6b6b, # Red by default - fields: List[DiscordEmbedField] = None, - username: str = "Tdarr Monitor") -> bool: - """Send an embed message to Discord. - - Args: - title: Embed title - description: Embed description - color: Embed color (hex integer, default red) - fields: List of embed fields - username: Bot username to display - - Returns: - True if successful, False otherwise - """ - embed = DiscordEmbed( - title=title, - description=description, - color=color, - fields=fields or [] - ) - - payload = { - "username": username, - "embeds": [asdict(embed)] - } - - return self._send_webhook(payload) - - def send_stuck_job_alert(self, stuck_jobs: List[StuckJob]) -> bool: - """Send alert for stuck jobs using embed format. - - Args: - stuck_jobs: List of stuck jobs to report - - Returns: - True if successful, False otherwise - """ - if not stuck_jobs: - return True - - # Create embed fields for each stuck job - fields = [] - for i, stuck_job in enumerate(stuck_jobs[:10]): # Limit to 10 jobs (Discord embed field limit is 25) - ws = stuck_job.worker_snapshot - field_value = ( - f"**File:** {os.path.basename(ws.file)}\n" - f"**Status:** {ws.status}\n" - f"**Progress:** {ws.percentage}%\n" - f"**Duration:** {stuck_job.stuck_duration_minutes:.1f} minutes\n" - f"**Node:** {ws.node_id}" - ) - - fields.append(DiscordEmbedField( - name=f"Stuck Job {i+1}: {ws.worker_id}", - value=field_value, - inline=True - )) - - # Add summary field if there are more jobs - if len(stuck_jobs) > 10: - fields.append(DiscordEmbedField( - name="Additional Jobs", - value=f"... and {len(stuck_jobs) - 10} more stuck jobs", - inline=False - )) - - title = f"๐Ÿšจ Tdarr Stuck Jobs Detected ({len(stuck_jobs)})" - description = ( - f"Detected {len(stuck_jobs)} stuck job{'s' if len(stuck_jobs) != 1 else ''} " - f"in your Tdarr system. These jobs will follow the requested clear behavior." - ) - - return self.send_embed_message( - title=title, - description=description, - color=0xff6b6b, # Red color for alerts - fields=fields - ) - - def send_system_status(self, - server_status: ServerStatus, - node_status: NodeStatus, - stuck_jobs: List[StuckJob] = None) -> bool: - """Send system status summary using embed format. - - Args: - server_status: Server status information - node_status: Node status information - stuck_jobs: Optional stuck jobs list - - Returns: - True if successful, False otherwise - """ - # Determine overall health color - is_healthy = ( - server_status.status == "good" and - not server_status.error and - not node_status.error and - (not stuck_jobs or len(stuck_jobs) == 0) - ) - - color = 0x28a745 if is_healthy else 0xff6b6b # Green if healthy, red if not - - # Build description - description_parts = [ - f"**Server Status:** {server_status.status.title()}", - f"**Version:** {getattr(server_status, 'version', 'Unknown')}" - ] - - if node_status.node_summary: - description_parts.extend([ - f"**Total Nodes:** {node_status.node_summary.total_nodes}", - f"**Online Nodes:** {node_status.node_summary.online_nodes}", - f"**Offline Nodes:** {node_status.node_summary.offline_nodes}" - ]) - - if stuck_jobs: - description_parts.append(f"**Stuck Jobs:** {len(stuck_jobs)}") - - # Add node details as fields - fields = [] - if node_status.node_summary and node_status.node_summary.online_details: - for node in node_status.node_summary.online_details: - active_workers = len(node.processing) if node.processing else 0 - field_value = ( - f"**Status:** Online\n" - f"**Platform:** {node.platform or 'Unknown'}\n" - f"**Active Workers:** {active_workers}\n" - f"**CPU Workers:** {node.workers.get('cpu', 0)}\n" - f"**GPU Workers:** {node.workers.get('gpu', 0)}" - ) - - fields.append(DiscordEmbedField( - name=f"๐Ÿ“ก {node.nodeName or node.id}", - value=field_value, - inline=True - )) - - title = "๐Ÿ“Š Tdarr System Status" - if not is_healthy: - title = "โš ๏ธ Tdarr System Alert" - - return self.send_embed_message( - title=title, - description="\n".join(description_parts), - color=color, - fields=fields - ) - - def _send_webhook(self, payload: Dict[str, Any]) -> bool: - """Send payload to Discord webhook. - - Args: - payload: JSON payload to send - - Returns: - True if successful, False otherwise - """ - try: - response = self.session.post( - self.webhook_url, - json=payload, - timeout=self.timeout - ) - response.raise_for_status() - self.logger.info("Discord notification sent successfully") - return True - - except requests.exceptions.RequestException as e: - self.logger.error(f"Failed to send Discord notification: {e}") - return False - except Exception as e: - self.logger.error(f"Unexpected error sending Discord notification: {e}") - return False - - -class StuckJobDetector: - def __init__(self, memory_file: str = "/mnt/NV2/Development/claude-home/logs/tdarr_memory.pkl", stuck_threshold_minutes: int = 30): - """Initialize stuck job detector with memory persistence.""" - self.memory_file = os.path.abspath(memory_file) # Use absolute path - self.stuck_threshold_minutes = stuck_threshold_minutes - self.logger = logging.getLogger(f"{__name__}.StuckJobDetector") - self.logger.debug(f"Using memory file: {self.memory_file}") - self.memory_state = self._load_memory_state() - - # Ensure memory directory exists - os.makedirs(os.path.dirname(memory_file), exist_ok=True) - - def _load_memory_state(self) -> MemoryState: - """Load memory state from disk or create new one.""" - if os.path.exists(self.memory_file): - try: - with open(self.memory_file, 'rb') as f: - memory_state = pickle.load(f) - self.logger.debug(f"Loaded memory state: {len(memory_state.worker_snapshots)} workers, {len(memory_state.stuck_jobs)} stuck jobs") - return memory_state - except Exception as e: - self.logger.warning(f"Failed to load memory state: {e}, creating new state") - else: - self.logger.debug(f"Memory file {self.memory_file} does not exist, creating new state") - - return MemoryState( - worker_snapshots={}, - stuck_jobs={}, - last_updated=datetime.now() - ) - - def _save_memory_state(self): - """Save memory state to disk.""" - try: - with open(self.memory_file, 'wb') as f: - pickle.dump(self.memory_state, f) - except Exception as e: - self.logger.error(f"Failed to save memory state: {e}") - - def _create_worker_key(self, node_id: str, worker_id: str) -> str: - """Create unique key for worker identification.""" - return f"{node_id}:{worker_id}" - - def _is_worker_stuck(self, current: WorkerSnapshot, previous: WorkerSnapshot) -> bool: - """Check if worker is stuck based on comparison with previous snapshot.""" - worker_key = f"{current.node_id}:{current.worker_id}" - - # Check each condition individually for detailed logging - file_same = current.file == previous.file - percentage_same = current.percentage == previous.percentage - status_same = current.status == previous.status - fps_same = current.fps == previous.fps - eta_same = current.eta == previous.eta - - is_stuck = file_same and percentage_same and status_same and fps_same and eta_same - - # Log detailed comparison info - self.logger.debug(f"Worker {worker_key} stuck check:") - self.logger.debug(f" File: '{current.file}' == '{previous.file}' = {file_same}") - self.logger.debug(f" Percentage: {current.percentage}% == {previous.percentage}% = {percentage_same}") - self.logger.debug(f" Status: '{current.status}' == '{previous.status}' = {status_same}") - self.logger.debug(f" FPS: {current.fps} == {previous.fps} = {fps_same}") - self.logger.debug(f" ETA: '{current.eta}' == '{previous.eta}' = {eta_same}") - self.logger.debug(f" โ†’ Result: {'STUCK' if is_stuck else 'NOT STUCK'}") - - # Log INFO level when we detect changes (worker making progress) - if not is_stuck: - if not percentage_same: - self.logger.info(f"Worker {worker_key} making progress: {previous.percentage}% โ†’ {current.percentage}%") - elif not status_same: - self.logger.info(f"Worker {worker_key} status changed: '{previous.status}' โ†’ '{current.status}'") - elif not file_same: - self.logger.info(f"Worker {worker_key} file changed: '{previous.file}' โ†’ '{current.file}'") - - return is_stuck - - def update_workers(self, nodes_data: Dict[str, Any]) -> List[StuckJob]: - """Update worker snapshots and detect stuck jobs.""" - current_time = datetime.now() - current_workers = {} - detected_stuck_jobs = [] - - # Extract current worker states from nodes data - for node_id, node_data in nodes_data.items(): - workers = node_data.get('workers', {}) - for worker_id, worker_data in workers.items(): - worker_key = self._create_worker_key(node_id, worker_id) - - # Create current snapshot - current_snapshot = WorkerSnapshot( - worker_id=worker_id, - node_id=node_id, - worker_type=worker_data.get('workerType', 'unknown'), - file=worker_data.get('file', ''), - percentage=worker_data.get('percentage', -1), - status=worker_data.get('status', ''), - fps=worker_data.get('fps', 0), - eta=worker_data.get('ETA', ''), - timestamp=current_time - ) - - current_workers[worker_key] = current_snapshot - - # Log all workers being tracked - self.logger.debug(f"Tracking worker {worker_key}: {current_snapshot.status} at {current_snapshot.percentage}% on '{current_snapshot.file}'") - - # Check if worker was previously tracked - if worker_key in self.memory_state.worker_snapshots: - previous_snapshot = self.memory_state.worker_snapshots[worker_key] - - # Check if worker is stuck - if self._is_worker_stuck(current_snapshot, previous_snapshot): - # Calculate how long it's been stuck - time_since_previous = (current_time - previous_snapshot.timestamp).total_seconds() / 60 - self.logger.debug(f"Worker {worker_key} has been stuck for {time_since_previous:.1f} minutes since last check") - self.logger.debug(f"Worker {worker_key} checking stuck_jobs dict: {list(self.memory_state.stuck_jobs.keys())}") - - if worker_key in self.memory_state.stuck_jobs: - # Already known stuck job, update duration - stuck_job = self.memory_state.stuck_jobs[worker_key] - stuck_duration = current_time - stuck_job.first_seen - stuck_job.stuck_duration_minutes = stuck_duration.total_seconds() / 60 - stuck_job.worker_snapshot = current_snapshot - - self.logger.debug(f"Worker {worker_key} known stuck job - duration: {stuck_job.stuck_duration_minutes:.1f} min, threshold: {self.stuck_threshold_minutes} min") - if stuck_job.stuck_duration_minutes >= self.stuck_threshold_minutes: - self.logger.debug(f"Worker {worker_key} EXCEEDS threshold - adding to detected stuck jobs") - detected_stuck_jobs.append(stuck_job) - else: - self.logger.debug(f"Worker {worker_key} below threshold - not flagging yet") - else: - # New stuck job detected - add to memory immediately to start tracking - first_seen = previous_snapshot.timestamp - stuck_duration = current_time - first_seen - stuck_duration_minutes = stuck_duration.total_seconds() / 60 - - self.logger.debug(f"Worker {worker_key} NEW stuck job - first_seen: {first_seen}, current: {current_time}") - self.logger.debug(f"Worker {worker_key} NEW stuck job - duration: {stuck_duration_minutes:.1f} min, threshold: {self.stuck_threshold_minutes} min") - - # Create stuck job entry immediately to track duration across runs - stuck_job = StuckJob( - worker_snapshot=current_snapshot, - first_seen=first_seen, - stuck_duration_minutes=stuck_duration_minutes, - is_stuck=True - ) - self.memory_state.stuck_jobs[worker_key] = stuck_job - - if stuck_duration_minutes >= self.stuck_threshold_minutes: - self.logger.debug(f"Worker {worker_key} NEW stuck job EXCEEDS threshold - flagging for clearing") - detected_stuck_jobs.append(stuck_job) - else: - self.logger.debug(f"Worker {worker_key} NEW stuck job below threshold - tracking in memory") - else: - # Worker is not stuck, remove from stuck jobs if present - if worker_key in self.memory_state.stuck_jobs: - del self.memory_state.stuck_jobs[worker_key] - self.logger.info(f"Worker {worker_key} is no longer stuck") - else: - # New worker, start tracking it - self.logger.info(f"New worker detected: {worker_key} - {current_snapshot.status} at {current_snapshot.percentage}% on '{current_snapshot.file}'") - - # Clean up stuck jobs for workers that no longer exist - stuck_jobs_to_remove = [] - for worker_key in self.memory_state.stuck_jobs: - if worker_key not in current_workers: - stuck_jobs_to_remove.append(worker_key) - - for worker_key in stuck_jobs_to_remove: - del self.memory_state.stuck_jobs[worker_key] - self.logger.info(f"Removed stuck job tracking for missing worker: {worker_key}") - - # Update memory state - self.memory_state.worker_snapshots = current_workers - self.memory_state.last_updated = current_time - - # Save to disk - self._save_memory_state() - - return detected_stuck_jobs - - def get_stuck_jobs(self) -> List[StuckJob]: - """Get current list of stuck jobs.""" - return list(self.memory_state.stuck_jobs.values()) - - def clear_memory(self): - """Clear all memory state.""" - self.memory_state = MemoryState( - worker_snapshots={}, - stuck_jobs={}, - last_updated=datetime.now() - ) - self._save_memory_state() - self.logger.info("Memory state cleared") - - -class TdarrMonitor: - def __init__(self, server_url: str, timeout: int = 30, enable_stuck_detection: bool = False, - stuck_threshold_minutes: int = 30, memory_file: str = ".claude/tmp/tdarr_memory.pkl", - discord_webhook_url: str = None, enable_discord_alerts: bool = False, - log_file: Optional[str] = None, log_level: str = "INFO", clear_hung_workers: bool = False): - """Initialize Tdarr monitor with server URL.""" - self.server_url = server_url.rstrip('/') - self.timeout = timeout - self.session = requests.Session() - self.enable_stuck_detection = enable_stuck_detection - self.enable_discord_alerts = enable_discord_alerts - self.clear_hung_workers_enabled = clear_hung_workers - - # Configure logging first - self._setup_logging(log_file, log_level) - self.logger = logging.getLogger(__name__) - - # Initialize stuck job detector if enabled - self.stuck_detector = None - if enable_stuck_detection: - self.stuck_detector = StuckJobDetector(memory_file, stuck_threshold_minutes) - - # Initialize Discord notifier if enabled - self.discord_notifier = None - if enable_discord_alerts: - if discord_webhook_url: - self.discord_notifier = DiscordNotifier(discord_webhook_url) - else: - self.logger.warning("Discord alerts enabled but no webhook URL provided") - - def _setup_logging(self, log_file: Optional[str] = None, log_level: str = "INFO"): - """Configure logging with optional file rotation.""" - # Clear any existing handlers - root_logger = logging.getLogger() - root_logger.handlers.clear() - - # Set log level - level = getattr(logging, log_level.upper(), logging.INFO) - root_logger.setLevel(level) - - # Create formatters - console_formatter = logging.Formatter( - '%(asctime)s - %(name)s - %(levelname)s - %(message)s', - datefmt='%Y-%m-%d %H:%M:%S' - ) - - # Enhanced file formatter for better readability - file_formatter = logging.Formatter( - '%(asctime)s [%(levelname)-7s] %(module)-12s | %(message)s', - datefmt='%H:%M:%S' - ) - - # Console handler (for interactive use) - console_handler = logging.StreamHandler(sys.stdout) - console_handler.setFormatter(console_formatter) - root_logger.addHandler(console_handler) - - # File handler with rotation (if log_file specified) - if log_file: - # Ensure log directory exists - log_dir = os.path.dirname(log_file) - if log_dir: - os.makedirs(log_dir, exist_ok=True) - - # Rotating file handler: 10MB max, keep 5 backup files - file_handler = logging.handlers.RotatingFileHandler( - log_file, - maxBytes=10 * 1024 * 1024, # 10MB - backupCount=5, - encoding='utf-8' - ) - file_handler.setFormatter(file_formatter) - root_logger.addHandler(file_handler) - - def _make_request(self, endpoint: str) -> Optional[Dict[str, Any]]: - """Make HTTP request to Tdarr API endpoint.""" - url = urljoin(self.server_url, endpoint) - - try: - response = self.session.get(url, timeout=self.timeout) - response.raise_for_status() - return response.json() - - except requests.exceptions.RequestException as e: - self.logger.error(f"Request failed for {url}: {e}") - return None - except json.JSONDecodeError as e: - self.logger.error(f"JSON decode failed for {url}: {e}") - return None - - def clear_hung_workers(self, stuck_jobs: Optional[List[StuckJob]] = None) -> bool: - """Clear hung workers via Tdarr API using kill-worker endpoint. - - Args: - stuck_jobs: List of StuckJob objects to clear. Each contains worker and node information. - - Returns: - True if all workers cleared successfully, False otherwise - """ - if not stuck_jobs: - self.logger.info("No stuck jobs provided for clearing hung workers") - return True - - success_count = 0 - total_count = len(stuck_jobs) - - for stuck_job in stuck_jobs: - worker_snapshot = stuck_job.worker_snapshot - try: - # Use the kill-worker endpoint with correct payload format - endpoint = '/api/v2/kill-worker' - payload = { - "data": { - "nodeID": worker_snapshot.node_id, - "workerID": worker_snapshot.worker_id - } - } - - url = urljoin(self.server_url, endpoint) - response = self.session.post(url, json=payload, timeout=self.timeout) - response.raise_for_status() - - self.logger.info(f"Successfully killed hung worker: {worker_snapshot.node_id}:{worker_snapshot.worker_id}") - success_count += 1 - - except requests.exceptions.RequestException as e: - self.logger.error(f"Failed to kill worker {worker_snapshot.node_id}:{worker_snapshot.worker_id}: {e}") - except Exception as e: - self.logger.error(f"Unexpected error killing worker {worker_snapshot.node_id}:{worker_snapshot.worker_id}: {e}") - - self.logger.info(f"Cleared {success_count}/{total_count} hung workers") - return success_count == total_count - - def get_server_status(self) -> ServerStatus: - """Get overall server status and configuration.""" - timestamp = datetime.now().isoformat() - - # Try to get server info from API - data = self._make_request('/api/v2/status') - if data: - server_status = data.get('status', 'unknown') - self.logger.info(f"Server check completed: status={server_status}, version={data.get('version', 'unknown')}") - return ServerStatus( - timestamp=timestamp, - server_url=self.server_url, - status=server_status, - version=data.get('version'), - uptime=data.get('uptime') - ) - else: - self.logger.error("Server check failed: Unable to connect to Tdarr server") - return ServerStatus( - timestamp=timestamp, - server_url=self.server_url, - status='offline', - error='Unable to connect to Tdarr server' - ) - - def get_queue_status(self) -> QueueStatus: - """Get transcoding queue status and statistics.""" - timestamp = datetime.now().isoformat() - - # Get queue information - data = self._make_request('/api/v2/get-queue') - if data: - queue_data = data.get('queue', []) - - # Calculate queue statistics - total_files = len(queue_data) - queued_files = len([f for f in queue_data if f.get('status') == 'Queued']) - processing_files = len([f for f in queue_data if f.get('status') == 'Processing']) - completed_files = len([f for f in queue_data if f.get('status') == 'Completed']) - - queue_stats = QueueStats( - total_files=total_files, - queued=queued_files, - processing=processing_files, - completed=completed_files, - queue_items=queue_data[:10] # First 10 items for details - ) - - return QueueStatus( - timestamp=timestamp, - queue_stats=queue_stats - ) - else: - return QueueStatus( - timestamp=timestamp, - error='Unable to fetch queue data' - ) - - def get_node_status(self) -> NodeStatus: - """Get status of all connected nodes.""" - timestamp = datetime.now().isoformat() - - # Get nodes information (using correct endpoint) - data = self._make_request('/api/v2/get-nodes') - if data: - # Handle the actual data structure returned by Tdarr API - nodes_dict = data if isinstance(data, dict) else {} - nodes = [] - - # Process node information - online_nodes = [] - offline_nodes = [] - - for node_id, node_data in nodes_dict.items(): - node_info = NodeInfo( - id=node_id, - nodeName=node_data.get('nodeName'), - status='online', # Assume online if in response - lastSeen=None, - version=node_data.get('config', {}).get('version'), - platform=node_data.get('config', {}).get('platform_arch_isdocker'), - workers={ - 'cpu': len([w for w in node_data.get('workers', {}).values() if 'cpu' in w.get('workerType', '').lower()]), - 'gpu': len([w for w in node_data.get('workers', {}).values() if 'gpu' in w.get('workerType', '').lower()]) - }, - processing=list(node_data.get('workers', {}).values()) - ) - - online_nodes.append(node_info) - nodes.append(node_data) - - # Check for stuck jobs if detection is enabled - stuck_jobs = [] - if self.stuck_detector: - try: - stuck_jobs = self.stuck_detector.update_workers(nodes_dict) - if stuck_jobs: - self.logger.warning(f"Detected {len(stuck_jobs)} stuck jobs") - for stuck_job in stuck_jobs: - self.logger.warning( - f"Stuck job: {stuck_job.worker_snapshot.node_id}:{stuck_job.worker_snapshot.worker_id} " - f"on file '{stuck_job.worker_snapshot.file}' " - f"at {stuck_job.worker_snapshot.percentage}% for {stuck_job.stuck_duration_minutes:.1f} minutes" - ) - - # Clear hung workers if enabled - if self.clear_hung_workers_enabled: - try: - clear_success = self.clear_hung_workers(stuck_jobs) - if clear_success: - self.logger.info(f"Successfully cleared {len(stuck_jobs)} hung workers") - else: - self.logger.warning("Some hung workers could not be cleared") - except Exception as e: - self.logger.error(f"Error clearing hung workers: {e}") - - # Send Discord notification for stuck jobs - if self.discord_notifier: - try: - self.discord_notifier.send_stuck_job_alert(stuck_jobs) - except Exception as e: - self.logger.error(f"Failed to send Discord stuck job alert: {e}") - - except Exception as e: - self.logger.error(f"Error in stuck job detection: {e}") - - node_summary = NodeSummary( - total_nodes=len(nodes), - online_nodes=len(online_nodes), - offline_nodes=len(offline_nodes), - online_details=online_nodes, - offline_details=offline_nodes - ) - - # Log successful node check with summary - if stuck_jobs: - self.logger.info(f"Node check completed: {len(nodes)} nodes online, {len(stuck_jobs)} stuck jobs detected") - else: - self.logger.info(f"Node check completed: {len(nodes)} nodes online, no stuck jobs detected") - - return NodeStatus( - timestamp=timestamp, - nodes=nodes, - node_summary=node_summary, - stuck_jobs=stuck_jobs - ) - else: - self.logger.error("Node check failed: Unable to fetch node data") - return NodeStatus( - timestamp=timestamp, - nodes=[], - error='Unable to fetch node data' - ) - - def get_library_status(self) -> LibraryStatus: - """Get library scan status and file statistics.""" - timestamp = datetime.now().isoformat() - - # Get library information - data = self._make_request('/api/v2/get-libraries') - if data: - libraries = data.get('libraries', []) - - library_stats = [] - total_files = 0 - - for lib in libraries: - lib_info = LibraryInfo( - name=lib.get('name'), - path=lib.get('path'), - file_count=lib.get('totalFiles', 0), - scan_progress=lib.get('scanProgress', 0), - last_scan=lib.get('lastScan'), - is_scanning=lib.get('isScanning', False) - ) - library_stats.append(lib_info) - total_files += lib_info.file_count - - scan_status = ScanStatus( - total_libraries=len(libraries), - total_files=total_files, - scanning_libraries=len([l for l in library_stats if l.is_scanning]) - ) - - return LibraryStatus( - timestamp=timestamp, - libraries=library_stats, - scan_status=scan_status - ) - else: - return LibraryStatus( - timestamp=timestamp, - libraries=[], - error='Unable to fetch library data' - ) - - def get_statistics(self) -> StatisticsStatus: - """Get overall Tdarr statistics and health metrics.""" - timestamp = datetime.now().isoformat() - - # Get statistics - data = self._make_request('/api/v2/get-stats') - if data: - stats = data.get('stats', {}) - statistics = Statistics( - total_transcodes=stats.get('totalTranscodes', 0), - space_saved=stats.get('spaceSaved', 0), - total_files_processed=stats.get('totalFilesProcessed', 0), - failed_transcodes=stats.get('failedTranscodes', 0), - processing_speed=stats.get('processingSpeed', 0), - eta=stats.get('eta') - ) - - return StatisticsStatus( - timestamp=timestamp, - statistics=statistics - ) - else: - return StatisticsStatus( - timestamp=timestamp, - error='Unable to fetch statistics' - ) - - def health_check(self) -> HealthStatus: - """Perform comprehensive health check.""" - timestamp = datetime.now().isoformat() - - # Server connectivity - server_status = self.get_server_status() - server_check = HealthCheck( - status=server_status.status, - healthy=server_status.status == 'good' - ) - - # Node connectivity - node_status = self.get_node_status() - nodes_healthy = ( - node_status.node_summary.online_nodes > 0 if node_status.node_summary else False - ) and not node_status.error - - nodes_check = HealthCheck( - status='online' if nodes_healthy else 'offline', - healthy=nodes_healthy, - online_count=node_status.node_summary.online_nodes if node_status.node_summary else 0, - total_count=node_status.node_summary.total_nodes if node_status.node_summary else 0 - ) - - checks = { - 'server': server_check, - 'nodes': nodes_check - } - - # Determine overall health - all_checks_healthy = all(check.healthy for check in checks.values()) - overall_status = 'healthy' if all_checks_healthy else 'unhealthy' - - return HealthStatus( - timestamp=timestamp, - overall_status=overall_status, - checks=checks - ) - - -def main(): - parser = argparse.ArgumentParser(description='Monitor Tdarr server via API') - parser.add_argument('--server', required=True, help='Tdarr server URL (e.g., http://10.10.0.43:8265)') - parser.add_argument('--check', choices=['all', 'status', 'queue', 'nodes', 'libraries', 'stats', 'health'], - default='health', help='Type of check to perform') - parser.add_argument('--timeout', type=int, default=30, help='Request timeout in seconds') - parser.add_argument('--output', choices=['json', 'pretty'], default='pretty', help='Output format') - parser.add_argument('--verbose', action='store_true', help='Enable verbose logging') - parser.add_argument('--detect-stuck', action='store_true', help='Enable stuck job detection') - parser.add_argument('--stuck-threshold', type=int, default=30, help='Minutes before job is considered stuck (default: 30)') - parser.add_argument('--memory-file', default='/mnt/NV2/Development/claude-home/logs/tdarr_memory.pkl', help='Path to memory state file') - parser.add_argument('--clear-memory', action='store_true', help='Clear memory state and exit') - parser.add_argument('--discord-webhook', - default='https://discord.com/api/webhooks/1404105821549498398/y2Ud1RK9rzFjv58xbypUfQNe3jrL7ZUq1FkQHa4_dfOHm2ylp93z0f4tY0O8Z-vQgKhD', - help='Discord webhook URL for notifications (default: configured webhook)') - parser.add_argument('--discord-alerts', action='store_true', help='Enable Discord alerts for stuck jobs and system status') - parser.add_argument('--discord-test', action='store_true', help='Send test Discord message and exit') - parser.add_argument('--log-file', default='/mnt/NV2/Development/claude-home/logs/tdarr_monitor.log', - help='Path to log file with rotation (default: /mnt/NV2/Development/claude-home/logs/tdarr_monitor.log)') - parser.add_argument('--log-level', choices=['DEBUG', 'INFO', 'WARNING', 'ERROR'], default='INFO', - help='Logging level (default: INFO)') - parser.add_argument('--no-log-file', action='store_true', help='Disable file logging, console only') - parser.add_argument('--clear-hung-workers', action='store_true', help='Clear hung workers via API call when stuck jobs are detected') - - args = parser.parse_args() - - if args.verbose: - logging.getLogger().setLevel(logging.DEBUG) - - # Handle clear memory command - if args.clear_memory: - if os.path.exists(args.memory_file): - os.remove(args.memory_file) - print(f"Memory state cleared: {args.memory_file}") - else: - print(f"Memory file does not exist: {args.memory_file}") - sys.exit(0) - - # Handle Discord test command - if args.discord_test: - print("Sending Discord test messages...") - notifier = DiscordNotifier(args.discord_webhook) - - # Test content message - content_success = notifier.send_content_message( - "๐Ÿงช **Tdarr Monitor Test** - Content message working correctly!" - ) - - # Test embed message - test_fields = [ - DiscordEmbedField("Test Field 1", "This is a test value", True), - DiscordEmbedField("Test Field 2", "Another test value", True), - ] - - embed_success = notifier.send_embed_message( - title="๐Ÿงช Tdarr Monitor Test", - description="This is a test embed message to verify Discord integration is working correctly.", - color=0x00ff00, # Green - fields=test_fields - ) - - if content_success and embed_success: - print("โœ… Discord test successful! Both content and embed messages sent.") - sys.exit(0) - else: - print("โŒ Discord test failed. Check webhook URL and permissions.") - sys.exit(1) - - # Initialize monitor - log_file = None if args.no_log_file else args.log_file - monitor = TdarrMonitor( - args.server, - args.timeout, - enable_stuck_detection=args.detect_stuck, - stuck_threshold_minutes=args.stuck_threshold, - memory_file=args.memory_file, - discord_webhook_url=args.discord_webhook, - enable_discord_alerts=args.discord_alerts, - log_file=log_file, - log_level=args.log_level, - clear_hung_workers=args.clear_hung_workers - ) - - # Perform requested check - monitor.logger.info(f"Starting Tdarr monitoring check: {args.check}, stuck_detection={'enabled' if args.detect_stuck else 'disabled'}, clear_workers={'enabled' if args.clear_hung_workers else 'disabled'}") - - result = None - if args.check == 'all': - result = { - 'server_status': monitor.get_server_status(), - 'node_status': monitor.get_node_status() - } - elif args.check == 'status': - result = monitor.get_server_status() - elif args.check == 'queue': - result = monitor.get_queue_status() - elif args.check == 'nodes': - result = monitor.get_node_status() - elif args.check == 'libraries': - result = monitor.get_library_status() - elif args.check == 'stats': - result = monitor.get_statistics() - elif args.check == 'health': - result = monitor.health_check() - - # Output results - if args.output == 'json': - # Convert dataclasses to dictionaries for JSON serialization - if args.check == 'all': - json_result = {} - for key, value in result.items(): - json_result[key] = asdict(value) - print(json.dumps(json_result, indent=2)) - else: - print(json.dumps(asdict(result), indent=2)) - else: - # Pretty print format - print(f"=== Tdarr Monitor Results - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} ===") - - if args.check == 'health' or (hasattr(result, 'overall_status') and result.overall_status): - health = result if hasattr(result, 'overall_status') else None - if health: - status = health.overall_status - print(f"Overall Status: {status.upper()}") - - if health.checks: - print("\nHealth Checks:") - for check_name, check_data in health.checks.items(): - status_icon = "โœ“" if check_data.healthy else "โœ—" - print(f" {status_icon} {check_name.title()}: {asdict(check_data)}") - - # Display stuck jobs if present - if args.detect_stuck: - if hasattr(result, 'stuck_jobs') and result.stuck_jobs: - print(f"\n=== STUCK JOBS DETECTED ({len(result.stuck_jobs)}) ===") - for stuck_job in result.stuck_jobs: - print(f"๐Ÿšจ {stuck_job.worker_snapshot.node_id}:{stuck_job.worker_snapshot.worker_id}") - print(f" File: {stuck_job.worker_snapshot.file}") - print(f" Progress: {stuck_job.worker_snapshot.percentage}%") - print(f" Status: {stuck_job.worker_snapshot.status}") - print(f" Stuck for: {stuck_job.stuck_duration_minutes:.1f} minutes") - print() - elif args.check in ['nodes', 'all']: - # Check all results for stuck jobs if 'all' is selected - stuck_found = False - if args.check == 'all' and isinstance(result, dict): - for section, data in result.items(): - if hasattr(data, 'stuck_jobs') and data.stuck_jobs: - if not stuck_found: - print(f"\n=== STUCK JOBS DETECTED ===") - stuck_found = True - for stuck_job in data.stuck_jobs: - print(f"๐Ÿšจ {stuck_job.worker_snapshot.node_id}:{stuck_job.worker_snapshot.worker_id}") - print(f" File: {stuck_job.worker_snapshot.file}") - print(f" Progress: {stuck_job.worker_snapshot.percentage}%") - print(f" Status: {stuck_job.worker_snapshot.status}") - print(f" Stuck for: {stuck_job.stuck_duration_minutes:.1f} minutes") - print() - - if not stuck_found: - print(f"\nโœ… No stuck jobs detected (threshold: {args.stuck_threshold} minutes)") - - if args.check == 'all': - for section, data in result.items(): - print(f"\n=== {section.replace('_', ' ').title()} ===") - # Don't print stuck_jobs in JSON format as we already displayed them above - if hasattr(data, 'stuck_jobs'): - data_dict = asdict(data) - data_dict.pop('stuck_jobs', None) - print(json.dumps(data_dict, indent=2)) - else: - print(json.dumps(asdict(data), indent=2)) - elif args.check != 'health': - # Don't print stuck_jobs in JSON format as we already displayed them above - if hasattr(result, 'stuck_jobs'): - result_dict = asdict(result) - result_dict.pop('stuck_jobs', None) - print(json.dumps(result_dict, indent=2)) - else: - print(json.dumps(asdict(result), indent=2)) - - # Exit with appropriate code - if result: - # Check for unhealthy status in health check - if isinstance(result, HealthStatus) and result.overall_status == 'unhealthy': - sys.exit(1) - # Check for errors in individual status objects (all status classes except HealthStatus have error attribute) - elif (isinstance(result, (ServerStatus, QueueStatus, NodeStatus, LibraryStatus, StatisticsStatus)) - and result.error): - sys.exit(1) - # Check for errors in 'all' results - elif isinstance(result, dict): - for status_obj in result.values(): - if (isinstance(status_obj, (ServerStatus, QueueStatus, NodeStatus, LibraryStatus, StatisticsStatus)) - and status_obj.error): - sys.exit(1) - - sys.exit(0) - - -if __name__ == '__main__': - main() \ No newline at end of file diff --git a/vm-management/CONTEXT.md b/vm-management/CONTEXT.md index 5be2f1e..85667d9 100644 --- a/vm-management/CONTEXT.md +++ b/vm-management/CONTEXT.md @@ -3,6 +3,15 @@ ## Overview Virtual machine management for home lab environments with focus on automated provisioning, infrastructure as code, and security-first configuration. This context covers VM lifecycle management, Proxmox integration, and standardized deployment patterns. +## Proxmox Host +- **Version**: PVE 8.4.16 (upgraded from 7.4-20 on 2026-02-19) +- **Kernel**: 6.8.12-18-pve +- **IP**: 10.10.0.11 +- **SSH**: `ssh -i ~/.ssh/homelab_rsa root@10.10.0.11` +- **Storage**: local (100GB dir), local-lvm (2.3TB thin), home-truenas (17TB CIFS at 10.10.0.35) +- **Networking**: vmbr0 (10.10.0.x/24 via eno1), vmbr1 (10.0.0.x/24 via eno2, Matter/IoT) +- **Upgrade plan**: Phase 2 (PVE 8โ†’9) pending โ€” see `proxmox-upgrades/proxmox-7-to-9-upgrade-plan.md` + ## Architecture Patterns ### Infrastructure as Code (IaC) Approach diff --git a/vm-management/proxmox-upgrades/proxmox-7-to-9-upgrade-plan.md b/vm-management/proxmox-upgrades/proxmox-7-to-9-upgrade-plan.md index cc8d477..d07ae3e 100644 --- a/vm-management/proxmox-upgrades/proxmox-7-to-9-upgrade-plan.md +++ b/vm-management/proxmox-upgrades/proxmox-7-to-9-upgrade-plan.md @@ -2,18 +2,26 @@ ## Executive Summary -**Current State**: Proxmox VE 7.1-7 (kernel 5.13.19-2-pve) +**Current State**: Proxmox VE 8.4.16 (kernel 6.8.12-18-pve) โ€” Phase 1 complete **Target State**: Proxmox VE 9.1 (latest) **Upgrade Path**: Two-phase upgrade (7โ†’8โ†’9) - direct upgrade not supported **Total Timeline**: 3-4 weeks (including stabilization periods) **Total Downtime**: ~4 hours (2 hours per phase) +### Phase 1 Status: COMPLETED (2026-02-19) +- Upgraded from PVE 7.4-20 โ†’ PVE 8.4.16 +- Kernel: 5.13.19-2-pve โ†’ 6.8.12-18-pve +- Total downtime: ~45 minutes (upgrade + reboot + service startup) +- All services validated and running +- Stabilization period: monitoring through early March 2026 + ## Infrastructure Overview -**Production Services** (8 LXC + 17 VMs): -- **Critical**: Paper Dynasty/Major Domo (VMs 115, 110), Gitea (LXC 225), n8n (LXC 210), Home Assistant (VM 109) -- **Important**: Media services (Plex 107, Tdarr 113, arr-stack 221), OpenClaw (224), Databases (112) -- **Lower Priority**: Game servers, development containers +**Production Services** (7 LXC + 7 VMs) โ€” cleaned up 2026-02-19: +- **Critical**: Paper Dynasty/Major Domo (VM 115), Discord bots (VM 110), Gitea (LXC 225), n8n (LXC 210), Home Assistant (VM 109), Databases (VM 112), docker-home/Pi-hole 1 (VM 106) +- **Important**: Claude Discord Coordinator (LXC 301), arr-stack (LXC 221), Uptime Kuma (LXC 227), Foundry VTT (LXC 223), Memos (LXC 222) +- **Stopped/Investigate**: docker-vpn (VM 105, decommissioning), docker-home-servers (VM 116, needs investigation) +- **Removed (2026-02-19)**: 108 (ansible), 224 (openclaw), 300 (openclaw-migrated), 101/102/104/111/211 (game servers), 107 (plex), 113 (tdarr - moved to .226), 114 (duplicate arr-stack), 117 (unused), 100/103 (old templates) **Key Constraints**: - Home Assistant VM 109 requires dual network (vmbr1 for Matter support) @@ -29,26 +37,33 @@ #### 1. Comprehensive Backups -**Priority 1 - Production Services**: +**All production guests** (14 total after cleanup): ```bash -# Backup critical services to TrueNAS -vzdump 210 --mode snapshot --dumpdir /mnt/truenas/proxmox --compress zstd # n8n -vzdump 115 --mode snapshot --dumpdir /mnt/truenas/proxmox --compress zstd # docker-sba -vzdump 112 --mode snapshot --dumpdir /mnt/truenas/proxmox --compress zstd # databases -vzdump 110 --mode snapshot --dumpdir /mnt/truenas/proxmox --compress zstd # discord-bots -vzdump 225 --mode snapshot --dumpdir /mnt/truenas/proxmox --compress zstd # gitea -vzdump 109 --mode snapshot --dumpdir /mnt/truenas/proxmox --compress zstd # homeassistant -``` - -**Priority 2 - All Remaining VMs/LXCs**: -```bash -vzdump --all --mode snapshot --dumpdir /mnt/truenas/proxmox --compress zstd +# Backup all to TrueNAS (PVE storage: home-truenas, mount: /mnt/pve/home-truenas) +# VMs +vzdump 106 --mode snapshot --storage home-truenas --compress zstd # docker-home (pihole1, NPM) +vzdump 109 --mode snapshot --storage home-truenas --compress zstd # homeassistant +vzdump 110 --mode snapshot --storage home-truenas --compress zstd # discord-bots +vzdump 112 --mode snapshot --storage home-truenas --compress zstd # databases +vzdump 115 --mode snapshot --storage home-truenas --compress zstd # docker-sba (Paper Dynasty) +# LXCs +vzdump 210 --mode snapshot --storage home-truenas --compress zstd # n8n +vzdump 221 --mode snapshot --storage home-truenas --compress zstd # arr-stack +vzdump 222 --mode snapshot --storage home-truenas --compress zstd # memos +vzdump 223 --mode snapshot --storage home-truenas --compress zstd # foundry +vzdump 225 --mode snapshot --storage home-truenas --compress zstd # gitea +vzdump 227 --mode snapshot --storage home-truenas --compress zstd # uptime-kuma +vzdump 301 --mode snapshot --storage home-truenas --compress zstd # claude-discord-coordinator +# Optional (stopped/investigate) +# vzdump 105 --mode snapshot --storage home-truenas --compress zstd # docker-vpn (decommissioning) +# vzdump 116 --mode snapshot --storage home-truenas --compress zstd # docker-home-servers (investigate) ``` **Backup Proxmox Configuration**: ```bash -tar -czf /mnt/truenas/proxmox/pve-config-$(date +%Y%m%d).tar.gz /etc/pve/ -cp /etc/network/interfaces /mnt/truenas/proxmox/interfaces.backup +# Already completed 2026-02-19 โ€” refresh before upgrade +tar -czf /mnt/pve/home-truenas/dump/pve-config/pve-config-$(date +%Y%m%d).tar.gz /etc/pve/ +cp /etc/network/interfaces /mnt/pve/home-truenas/dump/pve-config/interfaces.backup.$(date +%Y%m%d) ``` **Expected**: 2-4 hours, ~500GB-1TB storage required @@ -120,24 +135,34 @@ pvesm status ### Post-Upgrade Validation -**Start Services in Dependency Order**: +**Start Services in Dependency Order** (stagger with 30s delays per Phase 1 lessons): ```bash # Databases first -pvesh create /nodes/proxmox/qemu/112/status/start +pvesh create /nodes/proxmox/qemu/112/status/start # databases-bots +sleep 30 -# Infrastructure +# Infrastructure + DNS +pvesh create /nodes/proxmox/qemu/106/status/start # docker-home (pihole1, NPM) pvesh create /nodes/proxmox/lxc/225/status/start # gitea pvesh create /nodes/proxmox/lxc/210/status/start # n8n +pvesh create /nodes/proxmox/lxc/227/status/start # uptime-kuma +sleep 30 # Applications pvesh create /nodes/proxmox/qemu/115/status/start # docker-sba (Paper Dynasty) pvesh create /nodes/proxmox/qemu/110/status/start # discord-bots -pvesh create /nodes/proxmox/lxc/224/status/start # openclaw +pvesh create /nodes/proxmox/lxc/301/status/start # claude-discord-coordinator +sleep 30 + +# Restart Pi-hole container proactively (UDP DNS fix from Phase 1) +qm guest exec 106 -- docker restart pihole +sleep 10 # Media & Others pvesh create /nodes/proxmox/qemu/109/status/start # homeassistant -pvesh create /nodes/proxmox/qemu/107/status/start # plex pvesh create /nodes/proxmox/lxc/221/status/start # arr-stack +pvesh create /nodes/proxmox/lxc/222/status/start # memos +pvesh create /nodes/proxmox/lxc/223/status/start # foundry-lxc ``` **Service Validation Checklist**: @@ -159,6 +184,30 @@ Monitor for: - Service uptime - Error logs +### Phase 1 Lessons Learned (2026-02-19) + +**Issues encountered:** +1. **I/O storm on boot**: All 15 guests starting simultaneously caused massive I/O delay (~50% for several minutes). Consider staggering guest startup with delays between groups. +2. **Pi-hole 1 UDP DNS failed after boot**: Docker iptables NAT rules weren't fully set up. Required container restart. TCP DNS worked immediately โ€” only UDP was affected. +3. **Home Assistant IP changed**: HA on VM 109 got a new DHCP address (10.10.0.215 instead of previous). Need DHCP reservation to prevent this. +4. **Local machine DNS failover**: Desktop was configured with only one Pi-hole DNS server (10.10.0.226). When Proxmox guests were shut down, Pi-hole on physical server at .226 should have kept working but didn't resolve initially. Added both Pi-holes as DNS servers. +5. **Some VMs ignored ACPI shutdown**: VMs 105 and 112 required `--forceStop` flag. +6. **Several guests had onboot=1**: Many guests auto-started before we could bring them up in dependency order. Not harmful but unexpected. + +**What went well:** +- `pve7to8 --full` checker caught everything โ€” zero surprises during upgrade +- `DEBIAN_FRONTEND=noninteractive apt dist-upgrade -y -o Dpkg::Options::='--force-confnew'` worked cleanly +- Reboot took ~4 minutes (longer than expected but completed without issues) +- All backups on TrueNAS were intact and accessible post-upgrade +- Local disk space dropped from 57% to 14% after upgrade (old kernel/packages cleaned up) + +**Recommendations for Phase 2:** +- Stagger guest startup: add `sleep 30` between dependency groups +- Restart Pi-hole Docker container proactively after boot +- Set DHCP reservation for HA VM before Phase 2 +- Switch local DNS to public resolvers (1.1.1.1) before shutting down guests +- Disable onboot for all guests before upgrade, re-enable after validation + --- ## Phase 2: Proxmox 8.4 โ†’ 9.1 Upgrade @@ -169,25 +218,29 @@ Monitor for: ```bash # Verify systemd version in each LXC (must be > 230) -for ct in 108 210 211 221 222 223 224 225; do +for ct in 108 210 211 221 222 223 224 225 227 300 301; do echo "=== LXC $ct ===" pct exec $ct -- systemd --version | head -1 done ``` -**Action Required**: If any LXC shows systemd < 230: -```bash -pct enter -apt update && apt dist-upgrade -y -do-release-upgrade # Upgrade Ubuntu to compatible version -``` +**Pre-verified 2026-02-19** (all pass, updated after cleanup): +| LXC | Name | systemd | Status | +|-----|------|---------|--------| +| 210 | n8n | 245 | Pass | +| 221 | arr-stack | 245 | Pass | +| 222 | memos | 245 | Pass | +| 223 | foundry | 245 | Pass | +| 225 | gitea | 245 | Pass | +| 227 | uptime-kuma | 249 | Pass | +| 301 | claude-discord-coord | 249 | Pass | -**Expected**: All Ubuntu 20.04+ LXCs should be compatible (systemd 245+) +**Expected**: All compatible. Re-verify before Phase 2 in case any LXC OS was changed. #### 2. Fresh Backup Set ```bash -vzdump --all --mode snapshot --dumpdir /mnt/truenas/proxmox/pve9-upgrade --compress zstd -tar -czf /mnt/truenas/proxmox/pve8-config-$(date +%Y%m%d).tar.gz /etc/pve/ +vzdump --all --mode snapshot --storage home-truenas --compress zstd +tar -czf /mnt/pve/home-truenas/dump/pve-config/pve8-config-$(date +%Y%m%d).tar.gz /etc/pve/ ``` #### 3. Run PVE 8-to-9 Checker @@ -350,7 +403,27 @@ pvesh get /cluster/resources ## Verification Checklist -After each upgrade phase: +### Phase 1 (PVE 7โ†’8) โ€” Completed 2026-02-19 + +- [x] Proxmox version correct: pve-manager/8.4.16 +- [x] Kernel version updated: 6.8.12-18-pve +- [x] All PVE services running (pve-cluster, pvedaemon, pveproxy, pvestatd) +- [x] Storage accessible: local, local-lvm, home-truenas all active +- [x] Network functional +- [x] All VMs/LXCs visible in UI +- [x] Critical VMs/LXCs started successfully +- [x] Discord bots responding (confirmed on .88) +- [x] Databases accessible (VM 112 running) +- [x] n8n workflows โ€” HTTP 200 +- [x] Gitea accessible โ€” HTTP 200 +- [x] Home Assistant functional โ€” HTTP 200 (new IP: 10.10.0.215) +- [x] Jellyfin streaming โ€” HTTP 302 +- [x] Uptime Kuma โ€” HTTP 302 +- [x] Pi-hole 1 DNS resolving (after container restart) +- [x] Pi-hole 2 DNS resolving +- [x] Web UI functional โ€” HTTP 200 + +### Phase 2 (PVE 8โ†’9) โ€” Pending - [ ] Proxmox version correct (`pveversion`) - [ ] Kernel version updated (`uname -r`)