From 10c9e0d854a41748ce64f6eb1c0ac04fe2347233 Mon Sep 17 00:00:00 2001 From: Cal Corum Date: Tue, 12 Aug 2025 23:20:15 -0500 Subject: [PATCH] CLAUDE: Migrate to technology-first documentation architecture MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Complete restructure from patterns/examples/reference to technology-focused directories: β€’ Created technology-specific directories with comprehensive documentation: - /tdarr/ - Transcoding automation with gaming-aware scheduling - /docker/ - Container management with GPU acceleration patterns - /vm-management/ - Virtual machine automation and cloud-init - /networking/ - SSH infrastructure, reverse proxy, and security - /monitoring/ - System health checks and Discord notifications - /databases/ - Database patterns and troubleshooting - /development/ - Programming language patterns (bash, nodejs, python, vuejs) β€’ Enhanced CLAUDE.md with intelligent context loading: - Technology-first loading rules for automatic context provision - Troubleshooting keyword triggers for emergency scenarios - Documentation maintenance protocols with automated reminders - Context window management for optimal documentation updates β€’ Preserved valuable content from .claude/tmp/: - SSH security improvements and server inventory - Tdarr CIFS troubleshooting and Docker iptables solutions - Operational scripts with proper technology classification β€’ Benefits achieved: - Self-contained technology directories with complete context - Automatic loading of relevant documentation based on keywords - Emergency-ready troubleshooting with comprehensive guides - Scalable structure for future technology additions - Eliminated context bloat through targeted loading πŸ€– Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude --- .claude/settings.json | 7 + .gitignore | 3 +- CLAUDE.md | 315 +++-- .../README.md => databases/CONTEXT.md | 0 databases/troubleshooting.md | 316 +++++ .../README.md => development/bash-CONTEXT.md | 0 .../bash-troubleshooting.md | 0 .../python => development}/debugging.md | 0 .../nodejs-CONTEXT.md | 0 .../python-CONTEXT.md | 0 .../service-management.md | 0 .../README.md => development/vuejs-CONTEXT.md | 0 .../python => development}/web-frameworks.md | 0 docker/CONTEXT.md | 331 +++++ .../examples}/crash-analysis-summary.md | 0 .../examples}/distributed-transcoding.md | 0 ...docker-iptables-troubleshooting-session.md | 262 ++++ .../examples}/gpu-acceleration.md | 0 .../examples}/multi-stage-builds.md | 0 .../examples}/nvidia-gpu-troubleshooting.md | 0 .../examples}/nvidia-troubleshooting.md | 0 .../examples}/tdarr-container-fixes.md | 0 .../tdarr-monitoring-configuration.md | 0 .../examples}/tdarr-node-configurations.md | 0 .../tdarr-node-local/docker-compose-cpu.yml | 0 .../tdarr-node-local/docker-compose-gpu.yml | 0 .../start-tdarr-mapped-node.sh | 0 .../examples}/tdarr-server-setup/README.md | 0 .../tdarr-server-setup/docker-compose.yml | 0 .../examples}/tdarr-troubleshooting.md | 0 .../examples}/troubleshooting.md | 0 docker/troubleshooting.md | 466 +++++++ legacy/old-scripts-README.md | 172 +++ monitoring/CONTEXT.md | 142 ++ monitoring/examples/cron-job-management.md | 326 +++++ .../scripts}/README.md | 0 .../scripts}/setup-discord-monitoring.md | 0 .../scripts}/tdarr-timeout-monitor.sh | 0 monitoring/scripts/tdarr_monitor.py | 1234 +++++++++++++++++ .../scripts}/windows-desktop/README.md | 0 .../windows-reboot-monitor.ps1 | 0 .../windows-reboot-task-shutdown.xml | 0 
.../windows-reboot-task-startup.xml | 0 .../windows-setup-instructions.md | 0 monitoring/troubleshooting.md | 414 ++++++ networking/CONTEXT.md | 309 +++++ .../examples}/cifs-mount-resilience-fixes.md | 0 .../examples}/nas-mount-configuration.md | 0 .../network-filesystem-limitations.md | 0 .../examples}/nginx-config.md | 0 networking/examples/security_improvements.md | 99 ++ networking/examples/server_inventory.yaml | 70 + .../examples}/ssh-homelab-setup.md | 0 .../examples}/ssh-key-management.md | 0 .../examples}/ssh-troubleshooting.md | 0 .../examples}/troubleshooting.md | 0 networking/scripts/ssh_key_maintenance.sh | 114 ++ networking/troubleshooting.md | 496 +++++++ patterns/docker/README.md | 26 - patterns/networking/README.md | 32 - patterns/vm-management/README.md | 66 - scripts/monitoring/tdarr_monitor.py | 498 ------- scripts/tdarr-manager | 6 - tdarr/CONTEXT.md | 152 ++ .../tdarr-cifs-troubleshooting-2025-08-11.md | 143 ++ tdarr/examples/tdarr-node-configurations.md | 183 +++ .../tdarr-node-local/docker-compose-cpu.yml | 28 + .../tdarr-node-local/docker-compose-gpu.yml | 45 + .../start-tdarr-mapped-node.sh | 83 ++ tdarr/examples/tdarr-server-setup/README.md | 69 + .../tdarr-server-setup/docker-compose.yml | 37 + tdarr/scripts/CONTEXT.md | 212 +++ {scripts/tdarr => tdarr/scripts}/README.md | 0 .../scripts}/start-tdarr-gpu-podman-clean.sh | 0 .../scripts}/stop-tdarr-gpu-podman.sh | 0 .../scripts}/tdarr-cron-check-configurable.sh | 0 .../scripts}/tdarr-schedule-manager.sh | 0 .../scripts}/tdarr-schedule.conf | 0 tdarr/troubleshooting.md | 272 ++++ vm-management/CONTEXT.md | 296 ++++ .../examples}/proxmox-automation.md | 0 .../examples}/troubleshooting.md | 0 .../scripts}/README.md | 0 .../scripts}/cloud-init-user-data.yaml | 0 .../scripts}/vm-post-install.sh | 0 vm-management/troubleshooting.md | 652 +++++++++ 86 files changed, 7123 insertions(+), 753 deletions(-) create mode 100644 .claude/settings.json rename patterns/databases/README.md => databases/CONTEXT.md (100%) create mode 100644 databases/troubleshooting.md rename patterns/bash/README.md => development/bash-CONTEXT.md (100%) rename reference/bash/troubleshooting.md => development/bash-troubleshooting.md (100%) rename {reference/python => development}/debugging.md (100%) rename patterns/nodejs/README.md => development/nodejs-CONTEXT.md (100%) rename patterns/python/README.md => development/python-CONTEXT.md (100%) rename {examples/bash => development}/service-management.md (100%) rename patterns/vuejs/README.md => development/vuejs-CONTEXT.md (100%) rename {examples/python => development}/web-frameworks.md (100%) create mode 100644 docker/CONTEXT.md rename {reference/docker => docker/examples}/crash-analysis-summary.md (100%) rename {patterns/docker => docker/examples}/distributed-transcoding.md (100%) create mode 100644 docker/examples/docker-iptables-troubleshooting-session.md rename {patterns/docker => docker/examples}/gpu-acceleration.md (100%) rename {examples/docker => docker/examples}/multi-stage-builds.md (100%) rename {reference/docker => docker/examples}/nvidia-gpu-troubleshooting.md (100%) rename {reference/docker => docker/examples}/nvidia-troubleshooting.md (100%) rename {reference/docker => docker/examples}/tdarr-container-fixes.md (100%) rename {reference/docker => docker/examples}/tdarr-monitoring-configuration.md (100%) rename {examples/docker => docker/examples}/tdarr-node-configurations.md (100%) rename {examples/docker => docker/examples}/tdarr-node-local/docker-compose-cpu.yml (100%) rename 
{examples/docker => docker/examples}/tdarr-node-local/docker-compose-gpu.yml (100%) rename {examples/docker => docker/examples}/tdarr-node-local/start-tdarr-mapped-node.sh (100%) rename {examples/docker => docker/examples}/tdarr-server-setup/README.md (100%) rename {examples/docker => docker/examples}/tdarr-server-setup/docker-compose.yml (100%) rename {reference/docker => docker/examples}/tdarr-troubleshooting.md (100%) rename {reference/docker => docker/examples}/troubleshooting.md (100%) create mode 100644 docker/troubleshooting.md create mode 100644 legacy/old-scripts-README.md create mode 100644 monitoring/CONTEXT.md create mode 100644 monitoring/examples/cron-job-management.md rename {scripts/monitoring => monitoring/scripts}/README.md (100%) rename {scripts/monitoring => monitoring/scripts}/setup-discord-monitoring.md (100%) rename {scripts/monitoring => monitoring/scripts}/tdarr-timeout-monitor.sh (100%) create mode 100755 monitoring/scripts/tdarr_monitor.py rename {scripts/monitoring => monitoring/scripts}/windows-desktop/README.md (100%) rename {scripts/monitoring => monitoring/scripts}/windows-desktop/windows-reboot-monitor.ps1 (100%) rename {scripts/monitoring => monitoring/scripts}/windows-desktop/windows-reboot-task-shutdown.xml (100%) rename {scripts/monitoring => monitoring/scripts}/windows-desktop/windows-reboot-task-startup.xml (100%) rename {scripts/monitoring => monitoring/scripts}/windows-desktop/windows-setup-instructions.md (100%) create mode 100644 monitoring/troubleshooting.md create mode 100644 networking/CONTEXT.md rename {reference/networking => networking/examples}/cifs-mount-resilience-fixes.md (100%) rename {reference/networking => networking/examples}/nas-mount-configuration.md (100%) rename {reference/storage => networking/examples}/network-filesystem-limitations.md (100%) rename {examples/networking => networking/examples}/nginx-config.md (100%) create mode 100644 networking/examples/security_improvements.md create mode 100644 networking/examples/server_inventory.yaml rename {examples/networking => networking/examples}/ssh-homelab-setup.md (100%) rename {patterns/networking => networking/examples}/ssh-key-management.md (100%) rename {reference/networking => networking/examples}/ssh-troubleshooting.md (100%) rename {reference/networking => networking/examples}/troubleshooting.md (100%) create mode 100755 networking/scripts/ssh_key_maintenance.sh create mode 100644 networking/troubleshooting.md delete mode 100644 patterns/docker/README.md delete mode 100644 patterns/networking/README.md delete mode 100644 patterns/vm-management/README.md delete mode 100755 scripts/monitoring/tdarr_monitor.py delete mode 100755 scripts/tdarr-manager create mode 100644 tdarr/CONTEXT.md create mode 100644 tdarr/examples/tdarr-cifs-troubleshooting-2025-08-11.md create mode 100644 tdarr/examples/tdarr-node-configurations.md create mode 100644 tdarr/examples/tdarr-node-local/docker-compose-cpu.yml create mode 100644 tdarr/examples/tdarr-node-local/docker-compose-gpu.yml create mode 100755 tdarr/examples/tdarr-node-local/start-tdarr-mapped-node.sh create mode 100644 tdarr/examples/tdarr-server-setup/README.md create mode 100644 tdarr/examples/tdarr-server-setup/docker-compose.yml create mode 100644 tdarr/scripts/CONTEXT.md rename {scripts/tdarr => tdarr/scripts}/README.md (100%) rename {scripts/tdarr => tdarr/scripts}/start-tdarr-gpu-podman-clean.sh (100%) rename {scripts/tdarr => tdarr/scripts}/stop-tdarr-gpu-podman.sh (100%) rename {scripts/tdarr => 
tdarr/scripts}/tdarr-cron-check-configurable.sh (100%) rename {scripts/tdarr => tdarr/scripts}/tdarr-schedule-manager.sh (100%) rename {scripts/tdarr => tdarr/scripts}/tdarr-schedule.conf (100%) create mode 100644 tdarr/troubleshooting.md create mode 100644 vm-management/CONTEXT.md rename {examples/vm-management => vm-management/examples}/proxmox-automation.md (100%) rename {reference/vm-management => vm-management/examples}/troubleshooting.md (100%) rename {scripts/vm-management => vm-management/scripts}/README.md (100%) rename {scripts/vm-management => vm-management/scripts}/cloud-init-user-data.yaml (100%) rename {scripts/vm-management => vm-management/scripts}/vm-post-install.sh (100%) create mode 100644 vm-management/troubleshooting.md diff --git a/.claude/settings.json b/.claude/settings.json new file mode 100644 index 0000000..5701dc9 --- /dev/null +++ b/.claude/settings.json @@ -0,0 +1,7 @@ +{ + "notifications_disabled": true, + "allowed_working_directories": [ + "/mnt/NV2/Development/claude-home", + "/mnt/media" + ] +} \ No newline at end of file diff --git a/.gitignore b/.gitignore index dd64d9e..b5b53c6 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ .claude/tmp/ -tmp/ \ No newline at end of file +tmp/ +__pycache__ \ No newline at end of file diff --git a/CLAUDE.md b/CLAUDE.md index a9f52a2..e71912b 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -8,130 +8,98 @@ - If creating a temporary file will help achieve your goal, please create the file in the .claude/tmp/ directory and clean up when you're done. - Prefer editing an existing file to creating a new one. - Following a complex task or series of tasks, prompt the user to save any key learnings from the session. +- **Documentation Maintenance Reminder**: At the end of coding sessions, proactively ask: "Should I update our documentation to reflect the changes we made today?" Focus on CONTEXT.md files, troubleshooting guides, and any new patterns discovered. +- **Context Window Management**: When approaching 25% context window remaining, prioritize documentation updates before auto-summarization occurs. Ask: "We're approaching context limits - should I update our documentation now to capture today's work before we lose context?" 
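+
+A minimal self-check sketch for the structure described below (directory names are assumptions based on this repository's current layout):
+
+```bash
+#!/bin/bash
+# Verify each technology directory carries the documentation the loading rules expect
+for tech in tdarr docker vm-management networking monitoring databases; do
+    for doc in CONTEXT.md troubleshooting.md; do
+        [ -f "$tech/$doc" ] || echo "MISSING: $tech/$doc"
+    done
+    # Script directories should document themselves as well
+    if [ -d "$tech/scripts" ] && [ ! -f "$tech/scripts/CONTEXT.md" ]; then
+        echo "MISSING: $tech/scripts/CONTEXT.md"
+    fi
+done
+```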
## Automatic Context Loading Rules -### File Extension Triggers -When working with files, automatically load relevant documentation: +### Technology-First Loading Rules +When working with specific technologies, automatically load their dedicated context: -**Python (.py, .pyx, .pyi)** -- Load: `patterns/python/` -- Load: `reference/python/` -- If Django/Flask detected: Load `examples/python/web-frameworks.md` -- If requests/httpx detected: Load `examples/python/api-clients.md` +**Tdarr Keywords** +- "tdarr", "transcode", "ffmpeg", "gpu transcoding", "nvenc", "scheduler", "api" + - Load: `tdarr/CONTEXT.md` (technology overview and patterns) + - Load: `tdarr/troubleshooting.md` (error handling and debugging) + - If working in `/tdarr/scripts/`: Load `tdarr/scripts/CONTEXT.md` (script-specific documentation) + - Note: Gaming-aware scheduling system with configurable time windows available + - Note: Comprehensive API monitoring available via `tdarr_monitor.py` with dataclass-based status tracking -**JavaScript/Node.js (.js, .mjs, .ts)** -- Load: `patterns/nodejs/` -- Load: `reference/nodejs/` -- If package.json exists: Load `examples/nodejs/package-management.md` +**Docker Keywords** +- "docker", "container", "image", "compose", "kubernetes", "k8s", "dockerfile", "podman" + - Load: `docker/CONTEXT.md` (technology overview and patterns) + - Load: `docker/troubleshooting.md` (error handling and debugging) + - If working in `/docker/scripts/`: Load `docker/scripts/CONTEXT.md` (script-specific documentation) -**Vue.js (.vue, vite.config.*, nuxt.config.*)** -- Load: `patterns/vuejs/` -- Load: `reference/vuejs/` -- Load: `examples/vuejs/component-patterns.md` +**VM Management Keywords** +- "virtual machine", "vm", "proxmox", "kvm", "hypervisor", "guest", "virtualization" + - Load: `vm-management/CONTEXT.md` (technology overview and patterns) + - Load: `vm-management/troubleshooting.md` (error handling and debugging) + - If working in `/vm-management/scripts/`: Load `vm-management/scripts/CONTEXT.md` (script-specific documentation) -**Shell Scripts (.sh, .bash, .zsh)** -- Load: `patterns/bash/` -- Load: `reference/bash/` -- If systemd mentioned: Load `examples/bash/service-management.md` +**Networking Keywords** +- "network", "nginx", "proxy", "load balancer", "dns", "port", "firewall", "ssh", "ssl", "tls" + - Load: `networking/CONTEXT.md` (technology overview and patterns) + - Load: `networking/troubleshooting.md` (error handling and debugging) + - If working in `/networking/scripts/`: Load `networking/scripts/CONTEXT.md` (script-specific documentation) -**Docker (Dockerfile, docker-compose.yml, .dockerignore)** -- Load: `patterns/docker/` -- Load: `reference/docker/` -- Load: `examples/docker/multi-stage-builds.md` +**Monitoring Keywords** +- "monitoring", "alert", "notification", "discord", "health check", "status", "uptime", "windows reboot", "system monitor" + - Load: `monitoring/CONTEXT.md` (technology overview and patterns) + - Load: `monitoring/troubleshooting.md` (error handling and debugging) + - If working in `/monitoring/scripts/`: Load `monitoring/scripts/CONTEXT.md` (script-specific documentation) + - Note: Windows desktop monitoring with Discord notifications available + - Note: Comprehensive Tdarr API monitoring with dataclass-based status tracking ### Directory Context Triggers When working in specific directories: -**Docker-related directories (/docker/, /containers/, /compose/)** -- Load: `patterns/docker/` -- Load: `examples/docker/` -- Load: `reference/docker/troubleshooting.md` 
+**Technology directories (/tdarr/, /docker/, /vm-management/, /networking/, /monitoring/)** +- Load: `{technology}/CONTEXT.md` (technology overview) +- Load: `{technology}/troubleshooting.md` (debugging info) -**Database directories (/db/, /database/, /mysql/, /postgres/, /mongo/)** -- Load: `patterns/databases/` -- Load: `examples/databases/` -- Load: `reference/databases/` - -**Network directories (/network/, /networking/, /nginx/, /traefik/)** -- Load: `patterns/networking/` -- Load: `examples/networking/` -- Load: `reference/networking/troubleshooting.md` - -**VM directories (/vm/, /virtual/, /proxmox/, /kvm/)** -- Load: `patterns/vm-management/` -- Load: `examples/vm-management/` -- Load: `reference/vm-management/` - -**Scripts directory (/scripts/, /scripts/*/)** -- Load: `patterns/` (relevant to script type) -- Load: `reference/` (relevant troubleshooting guides) -- Load: `scripts/*/README.md` (subsystem-specific documentation) +**Script subdirectories (/tdarr/scripts/, /docker/scripts/, etc.)** +- Load: `{technology}/CONTEXT.md` (parent technology context) +- Load: `{technology}/scripts/CONTEXT.md` (script-specific context) +- Load: `{technology}/troubleshooting.md` (debugging info) - Context: Active operational scripts - treat as production code -- Note: Windows desktop monitoring system available in `scripts/monitoring/windows-desktop/` -### Keyword Triggers -When user mentions specific terms, automatically load relevant docs: +**Legacy directories (for backward compatibility)** +- `/scripts/tdarr/` β†’ Load Tdarr context files +- `/scripts/monitoring/` β†’ Load Monitoring context files +- `/patterns/`, `/examples/`, `/reference/` β†’ Load as before until migration complete -**Troubleshooting Keywords** -- "debug", "error", "fail", "broken", "not working", "issue" - - Load: `reference/{relevant-tech}/troubleshooting.md` - - Load: `examples/{relevant-tech}/debugging.md` +### File Extension Triggers +For programming languages, load general development context: -**Configuration Keywords** -- "config", "configure", "setup", "install", "deploy" - - Load: `patterns/{relevant-tech}/` - - Load: `examples/{relevant-tech}/configuration.md` +**Python (.py, .pyx, .pyi)** +- Load: `development/python-CONTEXT.md` (Python patterns and best practices) +- If Django/Flask detected: Load `development/web-frameworks-CONTEXT.md` +- If requests/httpx detected: Load `development/api-clients-CONTEXT.md` -**Performance Keywords** -- "slow", "performance", "optimize", "memory", "cpu" - - Load: `reference/{relevant-tech}/performance.md` - - Load: `examples/{relevant-tech}/optimization.md` +**JavaScript/Node.js (.js, .mjs, .ts)** +- Load: `development/nodejs-CONTEXT.md` (Node.js patterns and best practices) +- If package.json exists: Load `development/package-management-CONTEXT.md` -**Security Keywords** -- "secure", "ssl", "tls", "certificate", "auth", "firewall" - - Load: `patterns/networking/security.md` - - Load: `reference/networking/security.md` +**Shell Scripts (.sh, .bash, .zsh)** +- Load: `development/bash-CONTEXT.md` (Bash scripting patterns) +- If systemd mentioned: Load `development/service-management-CONTEXT.md` -**Database Keywords** -- "database", "db", "sql", "mysql", "postgres", "mongo", "redis" - - Load: `patterns/databases/` - - Load: `examples/databases/` +### Troubleshooting Keywords +For troubleshooting scenarios, always load both context and troubleshooting files: -**Container Keywords** -- "docker", "container", "image", "compose", "kubernetes", "k8s" - - Load: `patterns/docker/` - 
- Load: `examples/docker/` +**General Troubleshooting Keywords** +- "shutdown", "stop", "emergency", "reset", "recovery", "crash", "broken", "not working", "error", "issue", "problem", "debug", "troubleshoot", "fix" + - If Tdarr context detected: Load `tdarr/CONTEXT.md` AND `tdarr/troubleshooting.md` + - If Docker context detected: Load `docker/CONTEXT.md` AND `docker/troubleshooting.md` + - If VM context detected: Load `vm-management/CONTEXT.md` AND `vm-management/troubleshooting.md` + - If Network context detected: Load `networking/CONTEXT.md` AND `networking/troubleshooting.md` + - If Monitoring context detected: Load `monitoring/CONTEXT.md` AND `monitoring/troubleshooting.md` -**Network Keywords** -- "network", "nginx", "proxy", "load balancer", "dns", "port", "firewall" - - Load: `patterns/networking/` - - Load: `examples/networking/` - -**SSH Keywords** -- "ssh", "key", "authentication", "authorized_keys", "ssh-copy-id" - - Load: `patterns/networking/ssh-key-management.md` - - Load: `examples/networking/ssh-homelab-setup.md` - - Load: `reference/networking/ssh-troubleshooting.md` - -**VM Keywords** -- "virtual machine", "vm", "proxmox", "kvm", "hypervisor", "guest" - - Load: `patterns/vm-management/` - - Load: `examples/vm-management/` - -**Tdarr Keywords** -- "tdarr", "transcode", "ffmpeg", "gpu transcoding", "nvenc", "forEach error", "gaming detection", "scheduler", "monitoring", "api" - - Load: `reference/docker/tdarr-troubleshooting.md` - - Load: `patterns/docker/distributed-transcoding.md` - - Load: `scripts/tdarr/README.md` (for automation and scheduling) - - Load: `scripts/monitoring/README.md` (for monitoring and health checks) - - Note: Gaming-aware scheduling system with configurable time windows available - - Note: Comprehensive API monitoring available via `tdarr_monitor.py` with dataclass-based status tracking - -**Windows Monitoring Keywords** -- "windows reboot", "discord notification", "system monitor", "windows desktop", "power outage", "windows update" - - Load: `scripts/monitoring/windows-desktop/README.md` - - Note: Complete Windows desktop monitoring with Discord notifications for reboots and system events +**Specific Tdarr Troubleshooting Keywords** +- "forEach error", "staging timeout", "gaming detection", "plugin error", "container stop", "node disconnect", "cache cleanup", "shutdown tdarr", "stop tdarr", "emergency tdarr", "reset tdarr" + - Load: `tdarr/CONTEXT.md` (technology overview) + - Load: `tdarr/troubleshooting.md` (specific solutions including Emergency Recovery section) + - If working in `/tdarr/scripts/`: Load `tdarr/scripts/CONTEXT.md` ### Priority Rules 1. **File extension triggers** take highest priority @@ -141,33 +109,132 @@ When user mentions specific terms, automatically load relevant docs: 5. 
Always prefer specific over general (e.g., `vuejs/` over `nodejs/`) ### Context Loading Behavior -- Load pattern files first for overview -- Load relevant examples for implementation details -- Load reference files for troubleshooting and edge cases -- Maximum of 3 documentation files per trigger to maintain efficiency -- If context becomes too large, prioritize most recent/specific files +- **Technology context first**: Load CONTEXT.md for overview and patterns +- **Troubleshooting context**: ALWAYS load troubleshooting.md for error scenarios and emergency procedures +- **Script-specific context**: Load scripts/CONTEXT.md when working in script directories +- **Examples last**: Load examples for implementation details +- **Critical rule**: For any troubleshooting scenario, load BOTH context and troubleshooting files to ensure complete information +- Maximum of 3-4 documentation files per trigger to maintain efficiency while ensuring comprehensive coverage ## Documentation Structure ``` -/patterns/ # Technology overviews and best practices -/examples/ # Complete working implementations -/reference/ # Troubleshooting, cheat sheets, fallback info -/scripts/ # Active scripts and utilities for home lab operations - β”œβ”€β”€ tdarr/ # Tdarr automation with gaming-aware scheduling - β”œβ”€β”€ monitoring/ # System monitoring and alerting - β”‚ β”œβ”€β”€ tdarr_monitor.py # Comprehensive Tdarr API monitoring with dataclasses - β”‚ └── windows-desktop/ # Windows reboot monitoring with Discord notifications - └── / # Other organized automation subsystems -``` +/tdarr/ # Tdarr transcoding automation + β”œβ”€β”€ CONTEXT.md # Technology overview, patterns, best practices + β”œβ”€β”€ troubleshooting.md # Error handling and debugging + β”œβ”€β”€ examples/ # Working configurations and templates + └── scripts/ # Active automation scripts + β”œβ”€β”€ CONTEXT.md # Script-specific documentation + β”œβ”€β”€ monitoring.py # Comprehensive API monitoring with dataclasses + └── scheduler.py # Gaming-aware scheduling system -Each pattern file should reference relevant examples and reference materials. 
+/docker/ # Container orchestration and management + β”œβ”€β”€ CONTEXT.md # Technology overview, patterns, best practices + β”œβ”€β”€ troubleshooting.md # Error handling and debugging + β”œβ”€β”€ examples/ # Working configurations and templates + └── scripts/ # Active automation scripts + └── CONTEXT.md # Script-specific documentation + +/vm-management/ # Virtual machine operations + β”œβ”€β”€ CONTEXT.md # Technology overview, patterns, best practices + β”œβ”€β”€ troubleshooting.md # Error handling and debugging + β”œβ”€β”€ examples/ # Working configurations and templates + └── scripts/ # Active automation scripts + └── CONTEXT.md # Script-specific documentation + +/networking/ # Network configuration and SSH management + β”œβ”€β”€ CONTEXT.md # Technology overview, patterns, best practices + β”œβ”€β”€ troubleshooting.md # Error handling and debugging + β”œβ”€β”€ examples/ # Working configurations and templates + └── scripts/ # Active automation scripts + └── CONTEXT.md # Script-specific documentation + +/monitoring/ # System monitoring and alerting + β”œβ”€β”€ CONTEXT.md # Technology overview, patterns, best practices + β”œβ”€β”€ troubleshooting.md # Error handling and debugging + β”œβ”€β”€ examples/ # Working configurations and templates + └── scripts/ # Active automation scripts + β”œβ”€β”€ CONTEXT.md # Script-specific documentation + └── windows-desktop/ # Windows reboot monitoring with Discord notifications + +/development/ # Programming language patterns and tools + β”œβ”€β”€ python-CONTEXT.md # Python development patterns + β”œβ”€β”€ nodejs-CONTEXT.md # Node.js development patterns + └── bash-CONTEXT.md # Shell scripting patterns + +/legacy/ # Backward compatibility during migration + β”œβ”€β”€ patterns/ # Old patterns structure (temporary) + β”œβ”€β”€ examples/ # Old examples structure (temporary) + └── reference/ # Old reference structure (temporary) +``` ### Directory Usage Guidelines -- `/scripts/` - Contains actively used scripts for home lab management and operations - - Organized by subsystem (e.g., `tdarr/`, `networking/`, `vm-management/`) - - Each subsystem includes its own README.md with complete documentation -- `/examples/` - Contains example configurations and template scripts for reference -- `/patterns/` - Best practices and architectural guidance -- `/reference/` - Troubleshooting guides and technical references +- Each technology directory is self-contained with its own context, troubleshooting, examples, and scripts +- `CONTEXT.md` files provide technology overview, patterns, and best practices for Claude +- `troubleshooting.md` files contain error handling and debugging information +- `/scripts/` subdirectories contain active operational code with their own `CONTEXT.md` +- `/examples/` subdirectories contain template configurations and reference implementations +- `/development/` contains general programming language patterns that apply across technologies +- `/legacy/` provides backward compatibility during the migration from the old structure + +## Documentation Maintenance Protocol + +### Automated Maintenance Triggers +Claude Code should automatically prompt for documentation updates when: + +1. **New Technology Integration**: When working with a technology that doesn't have a dedicated directory + - Prompt: "I notice we're working with [technology] but don't have a dedicated `/[technology]/` directory. Should I create the technology-first structure with CONTEXT.md and troubleshooting.md files?" + +2. 
**New Error Patterns Discovered**: When encountering and solving new issues + - Prompt: "We just resolved a [technology] issue that isn't documented. Should I add this solution to `[technology]/troubleshooting.md`?" + +3. **New Scripts or Operational Procedures**: When creating new automation or workflows + - Prompt: "I created new scripts/procedures for [technology]. Should I update `[technology]/scripts/CONTEXT.md` and add any new operational patterns?" + +4. **Session End with Significant Changes**: When completing complex tasks + - Prompt: "We made significant changes to [technology] systems. Should I update our documentation to reflect the new patterns, configurations, or troubleshooting procedures we discovered?" + +### Documentation Update Checklist +When "update our documentation" is requested, systematically check: + +**Technology-Specific Updates**: +- [ ] Update `[technology]/CONTEXT.md` with new patterns or architectural changes +- [ ] Add new troubleshooting scenarios to `[technology]/troubleshooting.md` +- [ ] Update `[technology]/scripts/CONTEXT.md` for new operational procedures +- [ ] Add working examples to `[technology]/examples/` if new configurations were created + +**Cross-Technology Updates**: +- [ ] Update main CLAUDE.md loading rules if new keywords or triggers are needed +- [ ] Add new technology directories to the Documentation Structure section +- [ ] Update Directory Usage Guidelines if new organizational patterns emerge + +**Legacy Cleanup**: +- [ ] Check if any old patterns/examples/reference files can be migrated to technology directories +- [ ] Update or remove outdated information that conflicts with new approaches + +### Self-Maintenance Features + +**Loading Rule Validation**: Periodically verify that: +- All technology directories have corresponding keyword triggers +- Troubleshooting keywords include all common error scenarios +- File paths in loading rules match actual directory structure + +**Documentation Completeness Check**: Each technology directory should have: +- `CONTEXT.md` (overview, patterns, best practices) +- `troubleshooting.md` (error scenarios, emergency procedures) +- `examples/` (working configurations) +- `scripts/CONTEXT.md` (if operational scripts exist) + +**Keyword Coverage Analysis**: Ensure loading rules cover: +- Technology names and common aliases +- Error types and troubleshooting scenarios +- Operational keywords (start, stop, configure, monitor) +- Emergency keywords (shutdown, reset, recovery) + +### Warning Triggers +Claude Code should warn when: +- Working extensively with a technology that lacks dedicated documentation structure +- Solving problems that aren't covered in existing troubleshooting guides +- Creating scripts or procedures without corresponding CONTEXT.md documentation +- Encountering loading rules that reference non-existent files diff --git a/patterns/databases/README.md b/databases/CONTEXT.md similarity index 100% rename from patterns/databases/README.md rename to databases/CONTEXT.md diff --git a/databases/troubleshooting.md b/databases/troubleshooting.md new file mode 100644 index 0000000..dd5734f --- /dev/null +++ b/databases/troubleshooting.md @@ -0,0 +1,316 @@ +# Database Troubleshooting Guide + +## Connection Issues + +### Cannot Connect to Database +**Symptoms**: Connection refused, timeout errors, authentication failures +**Diagnosis**: +```bash +# Test basic connectivity +telnet db-server 3306 # MySQL +telnet db-server 5432 # PostgreSQL +nc -zv db-server 6379 # Redis + +# Check database 
service status
+systemctl status mysql
+systemctl status postgresql
+systemctl status redis-server
+```
+
+**Solutions**:
+```bash
+# Restart database services
+sudo systemctl restart mysql
+sudo systemctl restart postgresql
+
+# Check configuration files
+sudo nano /etc/mysql/mysql.conf.d/mysqld.cnf
+sudo nano /etc/postgresql/*/main/postgresql.conf
+
+# Verify port bindings
+ss -tlnp | grep :3306  # MySQL
+ss -tlnp | grep :5432  # PostgreSQL
+```
+
+## Performance Issues
+
+### Slow Query Performance
+**Symptoms**: Long-running queries, high CPU usage, timeouts
+**Diagnosis**:
+```sql
+-- MySQL
+SHOW PROCESSLIST;
+SHOW ENGINE INNODB STATUS;
+EXPLAIN SELECT * FROM table WHERE condition;
+
+-- PostgreSQL
+SELECT * FROM pg_stat_activity;
+EXPLAIN ANALYZE SELECT * FROM table WHERE condition;
+```
+
+**Solutions**:
+```sql
+-- Add missing indexes
+CREATE INDEX idx_column ON table(column);
+
+-- Analyze table statistics
+ANALYZE TABLE table_name;  -- MySQL
+ANALYZE table_name;        -- PostgreSQL
+
+-- Optimize queries
+-- Use LIMIT for large result sets
+-- Add WHERE clauses to filter results
+-- Use appropriate JOIN types
+```
+
+### Memory and Resource Issues
+**Symptoms**: Out of memory errors, swap usage, slow performance
+**Diagnosis**:
+```bash
+# Check memory usage
+free -h
+ps aux | grep mysql
+ps aux | grep postgres
+
+# Database-specific memory usage
+mysqladmin -u root -p status
+sudo -u postgres psql -c "SELECT * FROM pg_stat_database;"
+```
+
+**Solutions**:
+```bash
+# Adjust database memory settings
+# MySQL - /etc/mysql/mysql.conf.d/mysqld.cnf
+innodb_buffer_pool_size = 2G
+key_buffer_size = 256M
+
+# PostgreSQL - /etc/postgresql/*/main/postgresql.conf
+shared_buffers = 256MB
+effective_cache_size = 2GB
+work_mem = 4MB
+```
+
+## Data Integrity Issues
+
+### Corruption Detection and Recovery
+**Symptoms**: Table corruption errors, data inconsistencies
+**Diagnosis**:
+```bash
+# MySQL
+mysql -u root -p -e "CHECK TABLE table_name;"
+mysqlcheck -u root -p --all-databases
+
+# PostgreSQL - check for corruption in logs
+tail -f /var/log/postgresql/postgresql-*.log
+```
+
+**Solutions**:
+```bash
+# MySQL table repair
+mysql -u root -p -e "REPAIR TABLE table_name;"
+mysqlcheck -u root -p --auto-repair database_name
+
+# PostgreSQL consistency check - run VACUUM and REINDEX
+psql -U postgres -d database_name -c "VACUUM FULL table_name;"
+psql -U postgres -d database_name -c "REINDEX TABLE table_name;"
+```
+
+## Backup and Recovery Issues
+
+### Backup Failures
+**Symptoms**: Backup scripts failing, incomplete backups
+**Diagnosis**:
+```bash
+# Check backup script logs
+tail -f /var/log/backup.log
+
+# Test backup commands manually
+mysqldump -u root -p database_name > test_backup.sql
+pg_dump -U postgres database_name > test_backup.sql
+
+# Check disk space
+df -h /backup/location/
+```
+
+**Solutions**:
+```bash
+# Fix backup script permissions
+chmod +x /path/to/backup-script.sh
+chown backup-user:backup-group /backup/location/
+
+# Automated backup script example
+#!/bin/bash
+BACKUP_DIR="/backups/mysql"
+DATE=$(date +%Y%m%d_%H%M%S)
+
+mysqldump -u root -p"$MYSQL_PASSWORD" --all-databases > \
+  "$BACKUP_DIR/full_backup_$DATE.sql"
+
+# Compress and rotate backups
+gzip "$BACKUP_DIR/full_backup_$DATE.sql"
+find "$BACKUP_DIR" -name "*.gz" -mtime +7 -delete
+```
+
+## Authentication and Security Issues
+
+### Access Denied Errors
+**Symptoms**: Authentication failures, permission errors
+**Diagnosis**:
+```sql
+-- MySQL
+SELECT user, host FROM mysql.user;
+SHOW GRANTS FOR 'username'@'host';
+
+-- PostgreSQL (psql meta-commands)
+\du  -- List users
+\l   -- List databases
+```
+
+**Solutions**:
+```sql
+-- MySQL user management
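+-- Tip: where possible, prefer least-privilege grants (e.g. SELECT, INSERT,
+-- UPDATE on a specific schema) over ALL PRIVILEGES; the examples below use
+-- ALL for brevity.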
+CREATE USER 'newuser'@'localhost' IDENTIFIED BY 'password';
+GRANT ALL PRIVILEGES ON database.* TO 'newuser'@'localhost';
+FLUSH PRIVILEGES;
+
+-- PostgreSQL user management
+CREATE USER newuser WITH PASSWORD 'password';
+GRANT ALL PRIVILEGES ON DATABASE database_name TO newuser;
+```
+
+## Replication Issues
+
+### Master-Slave Replication Problems
+**Symptoms**: Replication lag, sync errors, slave disconnection
+**Diagnosis**:
+```sql
+-- MySQL Master
+SHOW MASTER STATUS;
+
+-- MySQL Slave
+SHOW SLAVE STATUS\G
+
+-- Check replication lag via the Seconds_Behind_Master field
+-- in the SHOW SLAVE STATUS\G output
+```
+
+**Solutions**:
+```sql
+-- Reset replication
+STOP SLAVE;
+RESET SLAVE;
+CHANGE MASTER TO MASTER_LOG_FILE='mysql-bin.000001', MASTER_LOG_POS=4;
+START SLAVE;
+
+-- Fix replication errors
+SET GLOBAL sql_slave_skip_counter = 1;
+START SLAVE;
+```
+
+## Storage and Disk Issues
+
+### Disk Space Problems
+**Symptoms**: Out of disk space errors, database growth
+**Diagnosis**:
+```bash
+# Check database sizes
+du -sh /var/lib/mysql/*
+du -sh /var/lib/postgresql/*/main/*
+
+# Find large tables (run via the mysql client)
+mysql -u root -p -e "SELECT table_schema, table_name,
+    ROUND((data_length + index_length) / 1024 / 1024, 2) AS 'Size (MB)'
+  FROM information_schema.tables
+  ORDER BY (data_length + index_length) DESC;"
+```
+
+**Solutions**:
+```sql
+-- Clean up large tables
+DELETE FROM log_table WHERE created_date < DATE_SUB(NOW(), INTERVAL 30 DAY);
+OPTIMIZE TABLE log_table;
+
+-- Enable log rotation for MySQL binary logs
+SET GLOBAL expire_logs_days = 7;
+PURGE BINARY LOGS BEFORE DATE(NOW() - INTERVAL 7 DAY);
+```
+
+## Emergency Recovery
+
+### Database Won't Start
+**Recovery Steps**:
+```bash
+# Check error logs
+tail -f /var/log/mysql/error.log
+tail -f /var/log/postgresql/postgresql-*.log
+
+# Try safe mode start
+sudo mysqld_safe --skip-grant-tables &
+
+# Recovery from backup
+mysql -u root -p < backup_file.sql
+psql -U postgres database_name < backup_file.sql
+```
+
+### Complete Data Loss Recovery
+**Recovery Procedure**:
+```bash
+# Stop database service
+sudo systemctl stop mysql
+
+# Restore from backup
+cd /var/lib/mysql
+sudo rm -rf *
+sudo tar -xzf /backups/mysql_full_backup.tar.gz
+
+# Fix permissions
+sudo chown -R mysql:mysql /var/lib/mysql
+sudo chmod 755 /var/lib/mysql
+
+# Start database
+sudo systemctl start mysql
+```
+
+## Monitoring and Prevention
+
+### Database Health Monitoring
+```bash
+#!/bin/bash
+# db-health-check.sh
+
+# Check if database is responding
+if ! mysqladmin -u root -p"$MYSQL_PASSWORD" ping >/dev/null 2>&1; then
+    echo "ALERT: MySQL not responding" | send_alert
+fi
+
+# Check disk space
+DISK_USAGE=$(df /var/lib/mysql | awk 'NR==2 {print $5}' | sed 's/%//')
+if [ "$DISK_USAGE" -gt 80 ]; then
+    echo "ALERT: Database disk usage at ${DISK_USAGE}%" | send_alert
+fi
+
+# Check for long-running queries
+LONG_QUERIES=$(mysql -u root -p"$MYSQL_PASSWORD" -e "SHOW PROCESSLIST" | grep -c "Query.*[0-9][0-9][0-9]")
+if [ "$LONG_QUERIES" -gt 5 ]; then
+    echo "ALERT: $LONG_QUERIES long-running queries detected" | send_alert
+fi
+```
+
+### Automated Maintenance
+```bash
+#!/bin/bash
+# Daily maintenance script
+
+# Optimize tables
+mysqlcheck -u root -p"$MYSQL_PASSWORD" --auto-repair --optimize --all-databases
+
+# Update table statistics
+mysql -u root -p"$MYSQL_PASSWORD" -e "FLUSH TABLES; ANALYZE TABLE table_name;"
+
+# Backup rotation
+find /backups -name "*.sql.gz" -mtime +30 -delete
+```
+
+This troubleshooting guide provides systematic approaches to resolving common database issues in home lab environments.
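+
+A sketch of how these scripts might be scheduled (the paths and times here are assumptions, not part of the scripts above):
+
+```bash
+# /etc/cron.d/db-maintenance - hourly health check, nightly maintenance
+0 * * * *   root  /usr/local/bin/db-health-check.sh
+30 3 * * *  root  /usr/local/bin/db-daily-maintenance.sh
+```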
\ No newline at end of file diff --git a/patterns/bash/README.md b/development/bash-CONTEXT.md similarity index 100% rename from patterns/bash/README.md rename to development/bash-CONTEXT.md diff --git a/reference/bash/troubleshooting.md b/development/bash-troubleshooting.md similarity index 100% rename from reference/bash/troubleshooting.md rename to development/bash-troubleshooting.md diff --git a/reference/python/debugging.md b/development/debugging.md similarity index 100% rename from reference/python/debugging.md rename to development/debugging.md diff --git a/patterns/nodejs/README.md b/development/nodejs-CONTEXT.md similarity index 100% rename from patterns/nodejs/README.md rename to development/nodejs-CONTEXT.md diff --git a/patterns/python/README.md b/development/python-CONTEXT.md similarity index 100% rename from patterns/python/README.md rename to development/python-CONTEXT.md diff --git a/examples/bash/service-management.md b/development/service-management.md similarity index 100% rename from examples/bash/service-management.md rename to development/service-management.md diff --git a/patterns/vuejs/README.md b/development/vuejs-CONTEXT.md similarity index 100% rename from patterns/vuejs/README.md rename to development/vuejs-CONTEXT.md diff --git a/examples/python/web-frameworks.md b/development/web-frameworks.md similarity index 100% rename from examples/python/web-frameworks.md rename to development/web-frameworks.md diff --git a/docker/CONTEXT.md b/docker/CONTEXT.md new file mode 100644 index 0000000..3c44222 --- /dev/null +++ b/docker/CONTEXT.md @@ -0,0 +1,331 @@ +# Docker Container Technology - Technology Context + +## Overview +Docker containerization for home lab environments with focus on performance optimization, GPU acceleration, and distributed workloads. This context covers container architecture patterns, security practices, and production deployment strategies. + +## Architecture Patterns + +### Container Design Principles +1. **Single Responsibility**: One service per container +2. **Immutable Infrastructure**: Treat containers as replaceable units +3. **Resource Isolation**: Use container limits and cgroups +4. **Security First**: Run as non-root, minimal attack surface +5. **Configuration Management**: Environment variables and external configs + +### Multi-Stage Build Pattern +**Purpose**: Minimize production image size and attack surface +```dockerfile +# Build stage +FROM node:18 AS builder +WORKDIR /app +COPY package*.json ./ +RUN npm ci --only=production + +# Production stage +FROM node:18-alpine AS production +WORKDIR /app +COPY --from=builder /app/node_modules ./node_modules +COPY . . +USER 1000 +EXPOSE 3000 +CMD ["node", "server.js"] +``` + +### Distributed Application Architecture +**Pattern**: Server-Node separation with specialized workloads + +``` +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ Control Plane β”‚ β”‚ Worker Nodes β”‚ +β”‚ β”‚ β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ +β”‚ - Web Interface│◄──►│ β”‚ Node 1 β”‚ β”‚ Node 2 β”‚ ... 
β”‚ +β”‚ - Job Queue β”‚ β”‚ β”‚ GPU+CPU β”‚ β”‚ GPU+CPU β”‚ β”‚ +β”‚ - Coordination β”‚ β”‚ β”‚Local SSDβ”‚ β”‚Local SSDβ”‚ β”‚ +β”‚ β”‚ β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ β”‚ + └──────── Shared Storage β”€β”€β”€β”€β”€β”€β”˜ + (NAS/SAN for persistence) +``` + +## Container Runtime Platforms + +### Docker vs Podman Comparison +**Docker**: Traditional daemon-based approach +- Requires Docker daemon running as root +- Centralized container management +- Established ecosystem and tooling + +**Podman** (Recommended for GPU workloads): +- Daemonless architecture +- Better GPU integration with NVIDIA +- Rootless containers for enhanced security +- Direct systemd integration + +### GPU Acceleration Support +**NVIDIA Container Toolkit Integration**: +```bash +# Podman GPU configuration (recommended) +podman run -d --name gpu-workload \ + --device nvidia.com/gpu=all \ + -e NVIDIA_DRIVER_CAPABILITIES=all \ + -e NVIDIA_VISIBLE_DEVICES=all \ + myapp:latest + +# Docker GPU configuration +docker run -d --name gpu-workload \ + --gpus all \ + -e NVIDIA_DRIVER_CAPABILITIES=all \ + myapp:latest +``` + +## Performance Optimization Patterns + +### Hybrid Storage Strategy +**Pattern**: Balance performance and persistence for different data types + +```yaml +volumes: + # Local storage (SSD/NVMe) - High Performance + - ./app/data:/app/data # Database - frequent I/O + - ./app/configs:/app/configs # Config - startup performance + - ./app/logs:/app/logs # Logs - continuous writing + - ./cache:/cache # Work directories - temp processing + + # Network storage (NAS) - Persistence & Backup + - /mnt/nas/backups:/app/backups # Backups - infrequent access + - /mnt/nas/media:/media:ro # Source data - read-only +``` + +**Benefits**: +- **Local Operations**: 100x faster database performance vs network +- **Network Reliability**: Critical data protected on redundant storage +- **Cost Optimization**: Expensive fast storage only where needed + +### Cache Optimization Hierarchy +```bash +# Performance tiers for different workload types +/dev/shm/cache/ # RAM disk - fastest, volatile, limited size +/mnt/nvme/cache/ # NVMe SSD - 3-7GB/s, persistent, recommended +/mnt/ssd/cache/ # SATA SSD - 500MB/s, good balance +/mnt/nas/cache/ # Network - 100MB/s, legacy compatibility +``` + +### Resource Management +**Container Limits** (prevent resource exhaustion): +```yaml +deploy: + resources: + limits: + memory: 8G + cpus: '6' + reservations: + memory: 4G + cpus: '2' +``` + +**Networking Optimization**: +```yaml +# Host networking for performance-critical applications +network_mode: host + +# Bridge networking with port mapping (default) +network_mode: bridge +ports: + - "8080:8080" +``` + +## Security Patterns + +### Container Hardening +```dockerfile +# Use minimal base images +FROM alpine:3.18 + +# Run as non-root user +RUN addgroup -g 1000 appuser && \ + adduser -u 1000 -G appuser -s /bin/sh -D appuser +USER 1000 + +# Set secure permissions +COPY --chown=appuser:appuser . /app +``` + +### Environment Security +```bash +# Secrets management (avoid environment variables for secrets) +podman secret create db_password password.txt +podman run --secret db_password myapp:latest + +# Network isolation +podman network create --driver bridge isolated-net +podman run --network isolated-net myapp:latest +``` + +### Image Security +1. 
**Vulnerability Scanning**: Regular image scans with tools like Trivy +2. **Version Pinning**: Use specific tags, avoid `latest` +3. **Minimal Images**: Distroless or Alpine base images +4. **Layer Optimization**: Minimize layers, combine RUN commands + +## Development Workflows + +### Local Development Pattern +```yaml +# docker-compose.dev.yml +version: "3.8" +services: + app: + build: . + volumes: + - .:/app # Code hot-reload + - /app/node_modules # Preserve dependencies + environment: + - NODE_ENV=development + ports: + - "3000:3000" +``` + +### Production Deployment Pattern +```bash +# Production container with health checks +podman run -d --name production-app \ + --restart unless-stopped \ + --health-cmd="curl -f http://localhost:3000/health || exit 1" \ + --health-interval=30s \ + --health-timeout=10s \ + --health-retries=3 \ + -p 3000:3000 \ + myapp:v1.2.3 +``` + +## Monitoring and Observability + +### Health Check Implementation +```dockerfile +# Application health check +HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \ + CMD curl -f http://localhost:3000/health || exit 1 +``` + +### Log Management +```bash +# Structured logging with log rotation +podman run -d --name app \ + --log-driver journald \ + --log-opt max-size=10m \ + --log-opt max-file=3 \ + myapp:latest + +# Centralized logging +podman logs -f app | logger -t myapp +``` + +### Resource Monitoring +```bash +# Real-time container metrics +podman stats --no-stream app + +# Historical resource usage +podman exec app cat /sys/fs/cgroup/memory/memory.usage_in_bytes +``` + +## Common Implementation Patterns + +### Database Containers +```yaml +# Persistent database with backup strategy +services: + postgres: + image: postgres:15-alpine + environment: + POSTGRES_DB: myapp + POSTGRES_USER: appuser + POSTGRES_PASSWORD_FILE: /run/secrets/db_password + volumes: + - postgres_data:/var/lib/postgresql/data # Persistent data + - ./backups:/backups # Backup mount + secrets: + - db_password +``` + +### Web Application Containers +```yaml +# Multi-tier web application +services: + frontend: + image: nginx:alpine + volumes: + - ./nginx.conf:/etc/nginx/nginx.conf:ro + ports: + - "80:80" + - "443:443" + depends_on: + - backend + + backend: + build: ./api + environment: + - DATABASE_URL=postgresql://appuser@postgres/myapp + depends_on: + - postgres +``` + +### GPU-Accelerated Workloads +```bash +# GPU transcoding/processing container +podman run -d --name gpu-processor \ + --device nvidia.com/gpu=all \ + -e NVIDIA_DRIVER_CAPABILITIES=compute,video \ + -v "/fast-storage:/cache" \ + -v "/media:/input:ro" \ + -v "/output:/output" \ + gpu-app:latest +``` + +## Best Practices + +### Production Deployment +1. **Use specific image tags**: Never use `latest` in production +2. **Implement health checks**: Application and infrastructure monitoring +3. **Resource limits**: Prevent resource exhaustion +4. **Backup strategy**: Regular backups of persistent data +5. **Security scanning**: Regular vulnerability assessments + +### Development Guidelines +1. **Multi-stage builds**: Separate build and runtime environments +2. **Environment parity**: Keep dev/staging/prod similar +3. **Configuration externalization**: Use environment variables and secrets +4. **Dependency management**: Pin versions, use lock files +5. **Testing strategy**: Unit, integration, and container tests + +### Operational Excellence +1. **Log aggregation**: Centralized logging strategy +2. **Metrics collection**: Application and infrastructure metrics +3. 
**Alerting**: Proactive monitoring and alerting +4. **Documentation**: Container documentation and runbooks +5. **Disaster recovery**: Backup and recovery procedures + +## Migration Patterns + +### Legacy Application Containerization +1. **Assessment**: Identify dependencies and requirements +2. **Dockerfile creation**: Start with appropriate base image +3. **Configuration externalization**: Move configs to environment variables +4. **Data persistence**: Identify and volume mount data directories +5. **Testing**: Validate functionality in containerized environment + +### Platform Migration (Docker to Podman) +```bash +# Export Docker container configuration +docker inspect mycontainer > container-config.json + +# Convert to Podman run command +podman run -d --name mycontainer \ + --memory 4g \ + --cpus 2 \ + -v /host/path:/container/path \ + myimage:tag +``` + +This technology context provides comprehensive guidance for implementing Docker containerization strategies in home lab and production environments. \ No newline at end of file diff --git a/reference/docker/crash-analysis-summary.md b/docker/examples/crash-analysis-summary.md similarity index 100% rename from reference/docker/crash-analysis-summary.md rename to docker/examples/crash-analysis-summary.md diff --git a/patterns/docker/distributed-transcoding.md b/docker/examples/distributed-transcoding.md similarity index 100% rename from patterns/docker/distributed-transcoding.md rename to docker/examples/distributed-transcoding.md diff --git a/docker/examples/docker-iptables-troubleshooting-session.md b/docker/examples/docker-iptables-troubleshooting-session.md new file mode 100644 index 0000000..3b8e541 --- /dev/null +++ b/docker/examples/docker-iptables-troubleshooting-session.md @@ -0,0 +1,262 @@ +# Docker iptables/nftables Backend Troubleshooting Session + +## Session Context +- **Date**: August 8, 2025 +- **System**: Nobara PC (Fedora-based gaming distro) +- **User**: cal +- **Working Directory**: `/mnt/NV2/Development/claude-home` +- **Goal**: Get Docker working to run Tdarr Node container + +## System Information +```bash +# OS Details +uname -a +# Linux nobara-pc 6.15.5-200.nobara.fc42.x86_64 #1 SMP PREEMPT_DYNAMIC Sun Jul 6 11:56:20 UTC 2025 x86_64 GNU/Linux + +# Hardware +# AMD Ryzen 7 7800X3D 8-Core Processor +# 62GB RAM +# NVIDIA GeForce RTX 4080 SUPER + +# Distribution +# Nobara (Fedora 42-based) +``` + +## Problem Summary +Docker daemon fails to start with persistent error: +``` +failed to start daemon: Error initializing network controller: error obtaining controller instance: failed to register "bridge" driver: failed to create NAT chain DOCKER: COMMAND_FAILED: INVALID_IPV: 'ipv4' is not a valid backend or is unavailable +``` + +## Root Cause Analysis + +### Initial Discovery +1. **Missing iptables**: Docker couldn't find `iptables` command in PATH +2. **Backend conflict**: System using nftables but Docker expects iptables-legacy +3. 
**Package inconsistency**: `iptables-nft` package installed but binary missing initially + +### Key Findings +- `dnf list installed | grep -i iptables` initially returned nothing +- `firewalld` and `nftables` services were both inactive +- `iptables-nft` package was installed but `/usr/bin/iptables` didn't exist +- After reinstall, iptables worked but used nftables backend +- NAT table incompatible: `iptables v1.8.11 (nf_tables): table 'nat' is incompatible, use 'nft' tool.` + +## Troubleshooting Steps Performed + +### Step 1: Package Investigation +```bash +# Check installed iptables packages +dnf list installed | grep -i iptables +# Result: No matching packages (surprising!) + +# Check service status +systemctl status nftables # inactive (dead) +firewall-cmd --get-backend-type # firewalld not running + +# Check if iptables binary exists +which iptables # not found +/usr/bin/iptables --version # No such file or directory +``` + +### Step 2: Package Reinstallation +```bash +# Reinstall iptables-nft package +sudo dnf reinstall -y iptables-nft + +# Verify installation +rpm -ql iptables-nft | grep bin +# Shows /usr/bin/iptables should exist + +# Test after reinstall +iptables --version +# Result: iptables v1.8.11 (nf_tables) - SUCCESS! +``` + +### Step 3: Backend Compatibility Testing +```bash +# Test NAT table access +sudo iptables -t nat -L +# Error: iptables v1.8.11 (nf_tables): table `nat' is incompatible, use 'nft' tool. +``` + +### Step 4: Legacy Backend Installation +```bash +# Install iptables-legacy +sudo dnf install -y iptables-legacy iptables-legacy-libs + +# Set up alternatives system +sudo alternatives --install /usr/bin/iptables iptables /usr/bin/iptables-legacy 10 +sudo alternatives --install /usr/bin/ip6tables ip6tables /usr/bin/ip6tables-legacy 10 + +# Test NAT table with legacy backend +sudo iptables -t nat -L +# SUCCESS: Shows empty NAT chains +``` + +### Step 5: Docker Restart Attempts +```bash +# Remove NVIDIA daemon.json config (potential conflict) +sudo rm -f /etc/docker/daemon.json + +# Load NAT kernel module explicitly +sudo modprobe iptable_nat + +# Try starting firewalld (in case Docker needs it) +sudo systemctl enable --now firewalld + +# Multiple restart attempts +sudo systemctl start docker +# ALL FAILED with same NAT chain error +``` + +## Current State +- βœ… iptables-legacy installed and configured +- βœ… NAT table accessible via `iptables -t nat -L` +- βœ… All required kernel modules should be available +- ❌ Docker still fails with NAT chain creation error +- ❌ Same error persists despite backend switch + +## Analysis of Persistent Issue + +### Potential Causes +1. **Kernel State Contamination**: nftables rules/chains may still be active in kernel memory +2. **Module Loading Order**: iptables vs nftables modules loaded in conflicting order +3. **Docker Caching**: Docker may be caching the old backend detection +4. **Firewall Integration**: Docker + firewalld interaction on Fedora/Nobara +5. **System-Level Backend Selection**: Some system-wide iptables backend lock + +### Evidence Supporting Kernel State Theory +- Error message is identical across all restart attempts +- iptables command works fine manually +- NAT table shows properly but Docker can't create chains +- Issue persists despite configuration changes + +## Next Session Action Plan + +### Immediate Steps After System Reboot +1. **Verify Backend Status**: + ```bash + iptables --version # Should show legacy + sudo iptables -t nat -L # Should show clean NAT table + ``` + +2. 
**Check Kernel Modules**:
   ```bash
   lsmod | grep -E "(iptable|nf_|ip_tables)"
   # modprobe -l was removed from modern kmod; list available modules instead
   find /lib/modules/$(uname -r) -name '*.ko*' | grep -E "(iptable|nf_table)"
   ```

3. **Test Docker Start**:
   ```bash
   sudo systemctl start docker
   docker --version
   ```

### If Issue Persists After Reboot

#### Alternative Approach 1: Docker Configuration Override
```bash
# Create daemon.json to disable iptables management
# (with "iptables": false, container NAT must be handled manually)
sudo mkdir -p /etc/docker
cat <<'EOF' | sudo tee /etc/docker/daemon.json
{
  "iptables": false
}
EOF

# Capture current firewall state for comparison
sudo iptables-save > /tmp/iptables-state.txt
sudo nft list ruleset > /tmp/nft-state.txt

# Docker troubleshooting
sudo dockerd --debug --log-level=debug > /tmp/docker-debug.log 2>&1 &
# Kill after 30 seconds and examine log

# System journal deep dive
journalctl -u docker.service --since="1 hour ago" -o verbose > /tmp/docker-journal.log
```

## Known Working Configuration Target

### Expected Working State
- **iptables**: Legacy backend active
- **Docker**: Running with NAT chain creation successful
- **Network**: Docker bridge network functional
- **Containers**: Can start and access network

### Tdarr Node Test Command
```bash
cd ~/docker/tdarr-node
# Update IP in compose file first:
# serverIP=<tdarr-server-ip>
docker-compose -f tdarr-node-basic.yml up -d
```

## Related Documentation Created
- `/patterns/docker/gpu-acceleration.md` - GPU troubleshooting patterns
- `/reference/docker/nvidia-troubleshooting.md` - NVIDIA container toolkit
- `/examples/docker/tdarr-node-local/` - Working configurations

## System Context Notes
- This is a gaming-focused Nobara distribution
- May have different default networking than standard Fedora
- NVIDIA drivers already working (nvidia-smi functional)
- System has been used for other Docker containers successfully in the past
- Recent NVIDIA container toolkit installation may have triggered the issue

## Success Criteria for Next Session
1. βœ… Docker service starts without errors
2. βœ… `docker ps` command works
3. βœ… Simple container can run: `docker run --rm hello-world`
4. βœ… Tdarr node container can start (even if it can't connect to the server yet)
5. βœ… Network connectivity from containers works

## Escalation Options
If standard troubleshooting fails:
1. **Nobara Community**: Check Nobara Discord/forums for similar issues
2. **Docker Desktop**: Use different Docker implementation
3. **Podman Migration**: Switch to podman as a Docker replacement
4. **System Reinstall**: Fresh OS install (nuclear option)
5.
**Container Alternatives**: LXC/systemd containers instead of Docker + +## Files to Check Next Session +- `/etc/docker/daemon.json` - Docker configuration +- `/var/log/docker.log` - Docker service logs +- `~/.docker/config.json` - User Docker config +- `/proc/sys/net/ipv4/ip_forward` - IP forwarding enabled +- `/etc/systemd/system/docker.service.d/` - Service overrides + +--- +*End of troubleshooting session log* \ No newline at end of file diff --git a/patterns/docker/gpu-acceleration.md b/docker/examples/gpu-acceleration.md similarity index 100% rename from patterns/docker/gpu-acceleration.md rename to docker/examples/gpu-acceleration.md diff --git a/examples/docker/multi-stage-builds.md b/docker/examples/multi-stage-builds.md similarity index 100% rename from examples/docker/multi-stage-builds.md rename to docker/examples/multi-stage-builds.md diff --git a/reference/docker/nvidia-gpu-troubleshooting.md b/docker/examples/nvidia-gpu-troubleshooting.md similarity index 100% rename from reference/docker/nvidia-gpu-troubleshooting.md rename to docker/examples/nvidia-gpu-troubleshooting.md diff --git a/reference/docker/nvidia-troubleshooting.md b/docker/examples/nvidia-troubleshooting.md similarity index 100% rename from reference/docker/nvidia-troubleshooting.md rename to docker/examples/nvidia-troubleshooting.md diff --git a/reference/docker/tdarr-container-fixes.md b/docker/examples/tdarr-container-fixes.md similarity index 100% rename from reference/docker/tdarr-container-fixes.md rename to docker/examples/tdarr-container-fixes.md diff --git a/reference/docker/tdarr-monitoring-configuration.md b/docker/examples/tdarr-monitoring-configuration.md similarity index 100% rename from reference/docker/tdarr-monitoring-configuration.md rename to docker/examples/tdarr-monitoring-configuration.md diff --git a/examples/docker/tdarr-node-configurations.md b/docker/examples/tdarr-node-configurations.md similarity index 100% rename from examples/docker/tdarr-node-configurations.md rename to docker/examples/tdarr-node-configurations.md diff --git a/examples/docker/tdarr-node-local/docker-compose-cpu.yml b/docker/examples/tdarr-node-local/docker-compose-cpu.yml similarity index 100% rename from examples/docker/tdarr-node-local/docker-compose-cpu.yml rename to docker/examples/tdarr-node-local/docker-compose-cpu.yml diff --git a/examples/docker/tdarr-node-local/docker-compose-gpu.yml b/docker/examples/tdarr-node-local/docker-compose-gpu.yml similarity index 100% rename from examples/docker/tdarr-node-local/docker-compose-gpu.yml rename to docker/examples/tdarr-node-local/docker-compose-gpu.yml diff --git a/examples/docker/tdarr-node-local/start-tdarr-mapped-node.sh b/docker/examples/tdarr-node-local/start-tdarr-mapped-node.sh similarity index 100% rename from examples/docker/tdarr-node-local/start-tdarr-mapped-node.sh rename to docker/examples/tdarr-node-local/start-tdarr-mapped-node.sh diff --git a/examples/docker/tdarr-server-setup/README.md b/docker/examples/tdarr-server-setup/README.md similarity index 100% rename from examples/docker/tdarr-server-setup/README.md rename to docker/examples/tdarr-server-setup/README.md diff --git a/examples/docker/tdarr-server-setup/docker-compose.yml b/docker/examples/tdarr-server-setup/docker-compose.yml similarity index 100% rename from examples/docker/tdarr-server-setup/docker-compose.yml rename to docker/examples/tdarr-server-setup/docker-compose.yml diff --git a/reference/docker/tdarr-troubleshooting.md b/docker/examples/tdarr-troubleshooting.md similarity index 
100% rename from reference/docker/tdarr-troubleshooting.md rename to docker/examples/tdarr-troubleshooting.md diff --git a/reference/docker/troubleshooting.md b/docker/examples/troubleshooting.md similarity index 100% rename from reference/docker/troubleshooting.md rename to docker/examples/troubleshooting.md diff --git a/docker/troubleshooting.md b/docker/troubleshooting.md new file mode 100644 index 0000000..4349f3d --- /dev/null +++ b/docker/troubleshooting.md @@ -0,0 +1,466 @@ +# Docker Container Troubleshooting Guide + +## Container Startup Issues + +### Container Won't Start +**Check container logs first**: +```bash +# Docker +docker logs <container_name> +docker logs --tail 50 -f <container_name> + +# Podman +podman logs <container_name> +podman logs --tail 50 -f <container_name> +``` + +### Common Startup Failures + +#### Port Conflicts +**Symptoms**: `bind: address already in use` error +**Solution**: +```bash +# Find conflicting process +sudo netstat -tulpn | grep <port> +docker ps | grep <port> + +# Change port mapping +docker run -p 8081:8080 myapp # Use different host port +``` + +#### Permission Errors +**Symptoms**: `permission denied` when accessing files/volumes +**Solutions**: +```bash +# Check file ownership +ls -la /host/volume/path + +# Fix ownership (match container user) +sudo chown -R 1000:1000 /host/volume/path + +# Use correct UID/GID in container +docker run -e PUID=1000 -e PGID=1000 myapp +``` + +#### Missing Environment Variables +**Symptoms**: Application fails with configuration errors +**Diagnostic**: +```bash +# Check container environment +docker exec -it <container_name> env +docker exec -it <container_name> printenv + +# Verify required variables are set +docker inspect <container_name> | grep -A 20 "Env" +``` + +#### Resource Constraints +**Symptoms**: Container killed or OOM errors +**Solutions**: +```bash +# Check resource usage +docker stats + +# Increase memory limit +docker run -m 4g myapp + +# Check system resources +free -h +df -h +``` + +### Debug Running Containers +```bash +# Access container shell +docker exec -it <container_name> /bin/bash +docker exec -it <container_name> /bin/sh # if bash not available + +# Check container processes +docker exec <container_name> ps aux + +# Check container filesystem +docker exec <container_name> ls -la /app +``` + +## Build Issues + +### Build Failures +**Clear build cache when encountering issues**: +```bash +# Docker +docker system prune -a +docker builder prune + +# Podman +podman system prune -a +podman image prune -a +``` + +### Verbose Build Output +```bash +# Docker +docker build --progress=plain --no-cache . + +# Podman +podman build --layers=false . +``` + +### Common Build Problems + +#### COPY/ADD Errors +**Issue**: Files not found during build +**Solutions**: +```dockerfile +# Check .dockerignore file +# Verify file paths relative to build context +COPY ./src /app/src # βœ… Correct +COPY /absolute/path /app # ❌ Wrong - no absolute paths +``` + +#### Package Installation Failures +**Issue**: apt/yum/dnf package installation fails +**Solutions**: +```dockerfile +# Update package lists first +RUN apt-get update && apt-get install -y package-name + +# Combine RUN commands to reduce layers +RUN apt-get update && \ + apt-get install -y package1 package2 && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* +``` + +#### Network Issues During Build +**Issue**: Cannot reach package repositories +**Solutions**: +```bash +# Use host networking to bypass bridge/DNS issues +docker build --network host . + +# Use custom DNS +docker build --dns 8.8.8.8 . 
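+ +# Persistent alternative (a sketch; the resolver addresses are assumptions): +# set daemon-wide DNS in /etc/docker/daemon.json so every build/container inherits it, e.g. +# {"dns": ["8.8.8.8", "1.1.1.1"]} +# then restart the daemon: +# sudo systemctl restart docker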
+``` + +## GPU Container Issues + +### NVIDIA GPU Support Problems + +#### Docker Desktop vs Podman on Fedora/Nobara +**Issue**: Docker Desktop has GPU compatibility issues on Fedora-based systems +**Symptoms**: +- `CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected` +- `unknown or invalid runtime name: nvidia` +- Device nodes exist but CUDA fails to initialize + +**Solution**: Use Podman instead of Docker on Fedora systems +```bash +# Verify host GPU works +nvidia-smi + +# Test with Podman (recommended) +podman run --rm --device nvidia.com/gpu=all ubuntu:20.04 nvidia-smi + +# Test with Docker (may fail on Fedora) +docker run --rm --gpus all ubuntu:20.04 nvidia-smi +``` + +#### GPU Container Configuration +**Working Podman GPU template**: +```bash +podman run -d --name gpu-container \ + --device nvidia.com/gpu=all \ + --restart unless-stopped \ + -e NVIDIA_DRIVER_CAPABILITIES=all \ + -e NVIDIA_VISIBLE_DEVICES=all \ + myapp:latest +``` + +**Working Docker GPU template**: +```bash +docker run -d --name gpu-container \ + --gpus all \ + --restart unless-stopped \ + -e NVIDIA_DRIVER_CAPABILITIES=all \ + -e NVIDIA_VISIBLE_DEVICES=all \ + myapp:latest +``` + +#### GPU Troubleshooting Steps +1. **Verify Host GPU Access**: + ```bash + nvidia-smi # Should show GPU info + lsmod | grep nvidia # Should show nvidia modules + ls -la /dev/nvidia* # Should show device files + ``` + +2. **Check NVIDIA Container Toolkit**: + ```bash + rpm -qa | grep nvidia-container-toolkit # Fedora/RHEL + dpkg -l | grep nvidia-container-toolkit # Ubuntu/Debian + nvidia-ctk --version + ``` + +3. **Test GPU in Container**: + ```bash + # Should show GPU information + podman exec gpu-container nvidia-smi + + # Test CUDA/NVML from Python (requires the nvidia-ml-py package in the image) + podman exec gpu-container python3 -c "import pynvml; pynvml.nvmlInit(); print(pynvml.nvmlDeviceGetCount())" + ``` + +#### Platform-Specific GPU Notes +**Fedora/Nobara/RHEL**: +- βœ… Podman: Works out-of-the-box with GPU support +- ❌ Docker Desktop: Known GPU integration issues +- Solution: Use Podman for GPU workloads + +**Ubuntu/Debian**: +- βœ… Docker: Generally works well with proper NVIDIA toolkit setup +- βœ… Podman: Also works well +- Solution: Either runtime typically works + +## Performance Issues + +### Resource Monitoring +**Real-time resource usage**: +```bash +# Overall container stats +docker stats +podman stats + +# Inside container analysis +docker exec <container_name> top +docker exec <container_name> free -h +docker exec <container_name> df -h + +# Network usage +docker exec <container_name> netstat -i +``` + +### Image Size Optimization +**Analyze image layers**: +```bash +# Check image sizes +docker images --format "table {{.Repository}}\t{{.Tag}}\t{{.Size}}" + +# Analyze layer history +docker history <image> + +# Find large files in container +docker exec <container_name> du -sh /* | sort -hr +``` + +**Optimization strategies**: +```dockerfile +# Use multi-stage builds +FROM node:18 AS builder +# ... build steps ... 
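+# (hypothetical build steps for a typical Node app -- adjust to your project:) +# WORKDIR /app +# COPY package*.json ./ +# RUN npm ci +# COPY . . +# RUN npm run build # emits /app/dist, consumed by the stage below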
+ +FROM node:18-alpine AS production +COPY --from=builder /app/dist /app +# Smaller final image + +# Combine RUN commands +RUN apt-get update && \ + apt-get install -y package && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +# Use .dockerignore +# .dockerignore +node_modules +.git +*.log +``` + +### Storage Performance Issues +**Slow volume performance**: +```bash +# Test volume I/O performance +docker exec <container_name> dd if=/dev/zero of=/volume/test bs=1M count=1000 + +# Check volume mount options +docker inspect <container_name> | grep -A 10 "Mounts" + +# Consider using tmpfs for temporary data +docker run --tmpfs /tmp myapp +``` + +## Network Debugging + +### Network Connectivity Issues +**Inspect network configuration**: +```bash +# List networks +docker network ls +podman network ls + +# Inspect specific network +docker network inspect <network_name> + +# Check container networking +docker exec <container_name> ip addr show +docker exec <container_name> ip route show +``` + +### Service Discovery Problems +**Test connectivity between containers**: +```bash +# Test by container name (same network) +docker exec container1 ping container2 + +# Test by IP address +docker exec container1 ping 172.17.0.3 + +# Check DNS resolution +docker exec container1 nslookup container2 +``` + +### Port Binding Issues +**Verify port mappings**: +```bash +# Check exposed ports +docker port <container_name> + +# Test external connectivity +curl localhost:8080 + +# Check if port is bound to all interfaces +netstat -tulpn | grep :8080 +``` + +## Emergency Recovery + +### Complete Container Reset +**Remove all containers and start fresh**: +```bash +# Stop all containers +docker stop $(docker ps -q) +podman stop --all + +# Remove all containers +docker container prune -f +podman container prune -f + +# Remove all images +docker image prune -a -f +podman image prune -a -f + +# Remove all volumes (CAUTION: data loss) +docker volume prune -f +podman volume prune -f + +# Complete system cleanup +docker system prune -a --volumes -f +podman system prune -a --volumes -f +``` + +### Container Recovery +**Recover from corrupted container**: +```bash +# Create backup of container data +docker cp <container_name>:/important/data ./backup/ + +# Export container filesystem +docker export <container_name> > container-backup.tar + +# Import and restart +docker import container-backup.tar new-image:latest +docker run -d --name new-container new-image:latest +``` + +### Data Recovery +**Recover data from volumes**: +```bash +# List volumes +docker volume ls + +# Inspect volume location +docker volume inspect <volume_name> + +# Access volume data directly +sudo ls -la /var/lib/docker/volumes/<volume_name>/_data + +# Mount volume to temporary container +docker run --rm -v <volume_name>:/data alpine ls -la /data +``` + +## Health Check Issues + +### Container Health Checks +**Implement health checks**: +```dockerfile +# Dockerfile health check +HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \ + CMD curl -f http://localhost:3000/health || exit 1 +``` + +**Debug health check failures**: +```bash +# Check health status +docker inspect <container_name> | grep -A 10 Health + +# Manual health check test +docker exec <container_name> curl -f http://localhost:3000/health + +# Check health check logs +docker events --filter container=<container_name> +``` + +## Log Analysis + +### Log Management +**View and manage container logs**: +```bash +# View recent logs +docker logs --tail 100 <container_name> + +# Follow logs in real-time +docker logs -f <container_name> + +# Logs with timestamps +docker logs -t <container_name> + +# Search logs for errors +docker logs <container_name> 2>&1 | grep ERROR +``` + +### Log Rotation Issues
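+ +**Daemon-wide defaults (optional)**: rotation can also be set once in `daemon.json` so every newly created container inherits it; a sketch (sizes are assumptions, and existing containers keep their old settings): +```bash +sudo tee /etc/docker/daemon.json <<'EOF' +{ + "log-driver": "json-file", + "log-opts": { "max-size": "10m", "max-file": "3" } +} +EOF +sudo systemctl restart docker +``` +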
+**Configure log rotation to prevent disk filling**: +```bash +# Run with log size limits +docker run --log-opt max-size=10m --log-opt max-file=3 myapp + +# Check log file sizes +sudo du -sh /var/lib/docker/containers/*/ +``` + +## Platform-Specific Issues + +### Fedora/Nobara/RHEL Systems +- **GPU Support**: Use Podman instead of Docker Desktop +- **SELinux**: May require volume relabeling (`:z`/`:Z` volume suffixes) +- **Firewall**: Configure firewalld for container networking + +### Ubuntu/Debian Systems +- **AppArmor**: May restrict container operations +- **Snap Docker**: May have permission issues vs native package + +### General Linux Issues +- **cgroups v2**: Some older containers need cgroups v1 +- **User namespaces**: May cause UID/GID mapping issues +- **systemd**: Integration differences between Docker/Podman + +## Prevention Best Practices + +1. **Resource Limits**: Always set memory and CPU limits +2. **Health Checks**: Implement application health monitoring +3. **Log Rotation**: Configure to prevent disk space issues +4. **Security Scanning**: Regular vulnerability scans +5. **Backup Strategy**: Regular data and configuration backups +6. **Testing**: Test containers in staging before production +7. **Documentation**: Document container configurations and dependencies + +This troubleshooting guide covers the most common Docker and Podman container issues encountered in home lab and production environments. \ No newline at end of file diff --git a/legacy/old-scripts-README.md b/legacy/old-scripts-README.md new file mode 100644 index 0000000..53f69d3 --- /dev/null +++ b/legacy/old-scripts-README.md @@ -0,0 +1,172 @@ +# Scripts Directory + +This directory contains operational scripts and utilities for home lab management and automation. + +## Directory Structure + +``` +scripts/ +β”œβ”€β”€ README.md # This documentation +β”œβ”€β”€ tdarr_monitor.py # Enhanced Tdarr monitoring with Discord alerts +β”œβ”€β”€ tdarr/ # Tdarr automation and scheduling +β”œβ”€β”€ monitoring/ # System monitoring and alerting +└── <subsystem>/ # Other organized automation subsystems +``` + +## Scripts Overview + +### `tdarr_monitor.py` - Enhanced Tdarr Monitoring + +**Description**: Comprehensive Tdarr monitoring script with stuck job detection and Discord notifications. 
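+ +The Discord side is a plain webhook POST; independent of the script, a minimal test message can be sent with curl (the webhook URL below is a placeholder): + +```bash +curl -X POST "https://discord.com/api/webhooks/<id>/<token>" \ + -H "Content-Type: application/json" \ + -d '{"content": "Tdarr monitor test message"}' +``` +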
+ +**Features**: +- πŸ“Š Complete Tdarr system monitoring (server, nodes, queue, libraries) +- 🧠 Short-term memory for stuck job detection +- 🚨 Discord notifications with rich embeds +- πŸ’Ύ Persistent state management +- βš™οΈ Configurable thresholds and alerts + +**Quick Start**: +```bash +# Basic monitoring +python3 scripts/tdarr_monitor.py --server http://10.10.0.43:8265 --check all + +# Enable stuck job detection with 15-minute threshold +python3 scripts/tdarr_monitor.py --server http://10.10.0.43:8265 \ + --check nodes --detect-stuck --stuck-threshold 15 + +# Full monitoring with Discord alerts (uses default webhook) +python3 scripts/tdarr_monitor.py --server http://10.10.0.43:8265 \ + --check all --detect-stuck --discord-alerts + +# Test Discord integration (uses default webhook) +python3 scripts/tdarr_monitor.py --server http://10.10.0.43:8265 --discord-test +``` + +**CLI Options**: +``` +--server Tdarr server URL (required) +--check Type of check: all, status, queue, nodes, libraries, stats, health +--timeout Request timeout in seconds (default: 30) +--output Output format: json, pretty (default: pretty) +--verbose Enable verbose logging +--detect-stuck Enable stuck job detection +--stuck-threshold Minutes before job considered stuck (default: 30) +--memory-file Path to memory state file (default: .claude/tmp/tdarr_memory.pkl) +--clear-memory Clear memory state and exit +--discord-webhook Discord webhook URL for notifications (default: configured) +--discord-alerts Enable Discord alerts for stuck jobs +--discord-test Send test Discord message and exit +``` + +**Memory Management**: +- **Persistent State**: Worker snapshots saved to `.claude/tmp/tdarr_memory.pkl` +- **Automatic Cleanup**: Removes tracking for disappeared workers +- **Error Recovery**: Graceful handling of corrupted memory files + +**Discord Features**: +- **Two Message Types**: Simple content messages and rich embeds +- **Stuck Job Alerts**: Detailed embed notifications with file info, progress, duration +- **System Status**: Health summaries with node details and color-coded status +- **Customizable**: Colors, fields, titles, descriptions fully configurable +- **Error Handling**: Graceful failures without breaking monitoring + +**Integration Examples**: + +*Cron Job for Regular Monitoring*: +```bash +# Check every 15 minutes, alert on stuck jobs over 30 minutes +*/15 * * * * cd /path/to/claude-home && python3 scripts/tdarr_monitor.py \ + --server http://10.10.0.43:8265 --check nodes --detect-stuck --discord-alerts +``` + +*Systemd Service + Timer* (a `Type=oneshot` service is driven by a separate `.timer` unit): +```ini +# tdarr-monitor.service +[Unit] +Description=Tdarr Monitor +After=network.target + +[Service] +Type=oneshot +ExecStart=/usr/bin/python3 /path/to/claude-home/scripts/tdarr_monitor.py \ + --server http://10.10.0.43:8265 --check all --detect-stuck --discord-alerts +WorkingDirectory=/path/to/claude-home +User=your-user + +# tdarr-monitor.timer +[Unit] +Description=Run Tdarr Monitor every 15 minutes + +[Timer] +OnCalendar=*:0/15 +Persistent=true + +[Install] +WantedBy=timers.target +``` + +**API Data Classes**: +The script uses strongly-typed dataclasses for all API responses: +- `ServerStatus` - Server health and version info +- `NodeStatus` - Node details with stuck job tracking +- `QueueStatus` - Transcoding queue statistics +- `LibraryStatus` - Library scan progress +- `StatisticsStatus` - Overall system statistics +- `HealthStatus` - Comprehensive health check results + +**Error Handling**: +- Network timeouts and connection errors +- API endpoint failures +- JSON parsing errors +- Discord webhook failures +- Memory state corruption +- Missing dependencies + 
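+ +*Avoiding Overlapping Runs* (a sketch, not part of the script; paths reuse the cron example above): +```bash +#!/usr/bin/env bash +# flock -n exits immediately if a previous run still holds the lock, +# so a slow API call cannot stack concurrent monitor instances under cron +exec flock -n /tmp/tdarr_monitor.lock \ + python3 /path/to/claude-home/scripts/tdarr_monitor.py \ + --server http://10.10.0.43:8265 --check nodes --detect-stuck --discord-alerts +``` +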
+ +**Dependencies**: +- `requests` - HTTP client for API calls +- `pickle` - State serialization (Python standard library) +- `requests` is the only external dependency; everything else is standard library + +--- + +## Development Guidelines + +### Adding New Scripts + +1. **Location**: Place scripts in appropriate subdirectories by function +2. **Documentation**: Include comprehensive docstrings and usage examples +3. **Error Handling**: Implement robust error handling and logging +4. **Configuration**: Use CLI arguments and/or config files for flexibility +5. **Testing**: Include test functionality where applicable + +### Naming Conventions + +- Use descriptive names: `tdarr_monitor.py` not `monitor.py` +- Use underscores for Python scripts: `system_health.py` +- Use hyphens for shell scripts: `backup-system.sh` + +### Directory Organization + +Create subdirectories for related functionality: +``` +scripts/ +β”œβ”€β”€ monitoring/ # System monitoring scripts +β”œβ”€β”€ backup/ # Backup and restore utilities +β”œβ”€β”€ network/ # Network management tools +β”œβ”€β”€ containers/ # Docker/Podman management +└── maintenance/ # System maintenance tasks +``` + +--- + +## Future Enhancements + +### Planned Features +- **Email Notifications**: SMTP integration for email alerts +- **Prometheus Metrics**: Export metrics for Grafana dashboards +- **Webhook Actions**: Trigger external actions on stuck jobs +- **Multi-Server Support**: Monitor multiple Tdarr instances +- **Configuration Files**: YAML/JSON config file support + +### Contributing +1. Follow existing code style and patterns +2. Add comprehensive documentation +3. Include error handling and logging +4. Test thoroughly before committing +5. Update this README with new scripts \ No newline at end of file diff --git a/monitoring/CONTEXT.md b/monitoring/CONTEXT.md new file mode 100644 index 0000000..64d0d53 --- /dev/null +++ b/monitoring/CONTEXT.md @@ -0,0 +1,142 @@ +# System Monitoring and Alerting - Technology Context + +## Overview +Comprehensive monitoring and alerting system for home lab infrastructure, focused on automated health checks, Discord notifications, and proactive system maintenance. 
+ +## Architecture Patterns + +### Distributed Monitoring Strategy +**Pattern**: Service-specific monitoring with centralized alerting +- **Tdarr Monitoring**: API-based transcoding health checks +- **Windows Desktop Monitoring**: Reboot detection and system events +- **Network Monitoring**: Connectivity and service availability +- **Container Monitoring**: Docker/Podman health and resource usage + +### Alert Management +**Pattern**: Structured notifications with actionable information +```bash +# Discord webhook integration +curl -X POST "$DISCORD_WEBHOOK" \ + -H "Content-Type: application/json" \ + -d '{ + "content": "**System Alert**\n```\nService: Tdarr\nIssue: Staging timeout\nAction: Automatic cleanup performed\n```\n<@user_id>" + }' +``` + +## Core Monitoring Components + +### Tdarr System Monitoring +**Purpose**: Monitor transcoding pipeline health and performance +**Location**: `scripts/tdarr_monitor.py` + +**Key Features**: +- API-based status monitoring with dataclass structures +- Staging section timeout detection and cleanup +- Discord notifications with professional formatting +- Log rotation and retention management + +### Windows Desktop Monitoring +**Purpose**: Track Windows system reboots and power events +**Location**: `scripts/windows-desktop/` + +**Components**: +- PowerShell monitoring script +- Scheduled task automation +- Discord notification integration +- System event correlation + +### Network and Service Monitoring +**Purpose**: Monitor critical infrastructure availability +**Implementation**: +```bash +# Service health check pattern +SERVICES="https://homelab.local http://nas.homelab.local" +for service in $SERVICES; do + if curl -sSf --max-time 10 "$service" >/dev/null 2>&1; then + echo "βœ… $service: Available" + else + echo "❌ $service: Failed" | send_alert + fi +done +``` + +## Automation Patterns + +### Cron-Based Scheduling +**Pattern**: Regular health checks with intelligent alerting +```bash +# Monitoring schedule examples +*/20 * * * * /path/to/tdarr-timeout-monitor.sh # Every 20 minutes +0 */6 * * * /path/to/cleanup-temp-dirs.sh # Every 6 hours +0 2 * * * /path/to/backup-monitor.sh # Daily at 2 AM +``` + +### Event-Driven Monitoring +**Pattern**: Reactive monitoring for critical events +- **System Startup**: Windows boot detection +- **Service Failures**: Container restart alerts +- **Resource Exhaustion**: Disk space warnings +- **Security Events**: Failed login attempts + +## Data Collection and Analysis + +### Log Management +**Pattern**: Centralized logging with rotation +```bash +# Log rotation configuration +LOG_FILE="/var/log/homelab-monitor.log" +MAX_SIZE="10M" +RETENTION_DAYS=30 + +# Rotate logs when size exceeded +if [ $(stat -c%s "$LOG_FILE") -gt $((10*1024*1024)) ]; then + mv "$LOG_FILE" "$LOG_FILE.$(date +%Y%m%d)" + touch "$LOG_FILE" +fi +``` + +### Metrics Collection +**Pattern**: Time-series data for trend analysis +- **System Metrics**: CPU, memory, disk usage +- **Service Metrics**: Response times, error rates +- **Application Metrics**: Transcoding progress, queue sizes +- **Network Metrics**: Bandwidth usage, latency + +## Alert Integration + +### Discord Notification System +**Pattern**: Rich, actionable notifications +```markdown +# Professional alert format +**πŸ”§ System Maintenance** +Service: Tdarr Transcoding +Issue: 3 files timed out in staging +Resolution: Automatic cleanup completed +Status: System operational + +Manual review recommended <@user_id> +``` + +### Alert Escalation +**Pattern**: Tiered alerting based on 
severity +1. **Info**: Routine maintenance completed +2. **Warning**: Service degradation detected +3. **Critical**: Service failure requiring immediate attention +4. **Emergency**: System-wide failure requiring manual intervention + +## Best Practices Implementation + +### Monitoring Strategy +1. **Proactive**: Monitor trends to predict issues +2. **Reactive**: Alert on current failures +3. **Preventive**: Automated cleanup and maintenance +4. **Comprehensive**: Cover all critical services +5. **Actionable**: Provide clear resolution paths + +### Performance Optimization +1. **Efficient Polling**: Balance monitoring frequency with resource usage +2. **Smart Alerting**: Avoid alert fatigue with intelligent filtering +3. **Resource Management**: Monitor the monitoring system itself +4. **Scalable Architecture**: Design for growth and additional services + +This technology context provides the foundation for implementing comprehensive monitoring and alerting in home lab environments. \ No newline at end of file diff --git a/monitoring/examples/cron-job-management.md b/monitoring/examples/cron-job-management.md new file mode 100644 index 0000000..df56f14 --- /dev/null +++ b/monitoring/examples/cron-job-management.md @@ -0,0 +1,326 @@ +# Cron Job Management Patterns + +This document outlines the cron job patterns and management strategies used in the home lab environment. + +## Current Cron Schedule + +### Overview +```bash +# Monthly maintenance +0 2 1 * * /home/cal/bin/ssh_key_maintenance.sh + +# Tdarr monitoring and management +*/10 * * * * python3 /mnt/NV2/Development/claude-home/scripts/tdarr_monitor.py --server http://10.10.0.43:8265 --check nodes --detect-stuck --discord-alerts >/dev/null 2>&1 +0 */6 * * * find "/mnt/NV2/tdarr-cache/nobara-pc-gpu-unmapped/temp/" -name "tdarr-workDir2-*" -type d -mmin +360 -exec rm -rf {} \; 2>/dev/null || true +0 3 * * * find "/mnt/NV2/tdarr-cache/nobara-pc-gpu-unmapped/media" -name "*.temp" -o -name "*.tdarr" -mtime +1 -delete 2>/dev/null || true + +# Disabled/legacy jobs +#*/20 * * * * /mnt/NV2/Development/claude-home/scripts/monitoring/tdarr-timeout-monitor.sh +``` + +## Job Categories + +### 1. System Maintenance +**SSH Key Maintenance** +- **Schedule**: `0 2 1 * *` (Monthly, 1st at 2 AM) +- **Purpose**: Maintain SSH key security and rotation +- **Location**: `/home/cal/bin/ssh_key_maintenance.sh` +- **Priority**: High (security-critical) + +### 2. Monitoring & Alerting +**Tdarr System Monitoring** +- **Schedule**: `*/10 * * * *` (Every 10 minutes) +- **Purpose**: Monitor Tdarr nodes, detect stuck jobs, send Discord alerts +- **Features**: + - Stuck job detection (30-minute threshold) + - Discord notifications with rich embeds + - Persistent memory state tracking +- **Script**: `/mnt/NV2/Development/claude-home/scripts/tdarr_monitor.py` +- **Output**: Silent (`>/dev/null 2>&1`) + +### 3. Cleanup & Housekeeping +**Tdarr Work Directory Cleanup** +- **Schedule**: `0 */6 * * *` (Every 6 hours) +- **Purpose**: Remove stale Tdarr work directories +- **Target**: `/mnt/NV2/tdarr-cache/nobara-pc-gpu-unmapped/temp/` +- **Pattern**: `tdarr-workDir2-*` directories +- **Age threshold**: 6 hours (`-mmin +360`) + +**Failed Tdarr Job Cleanup** +- **Schedule**: `0 3 * * *` (Daily at 3 AM) +- **Purpose**: Remove failed transcode artifacts +- **Target**: `/mnt/NV2/tdarr-cache/nobara-pc-gpu-unmapped/media/` +- **Patterns**: `*.temp` and `*.tdarr` files +- **Age threshold**: 24 hours (`-mtime +1`) + +## Design Patterns + +### 1. 
Absolute Paths +**Always use absolute paths in cron jobs** +```bash +# Good +*/10 * * * * python3 /full/path/to/script.py + +# Bad - relative paths don't work in cron +*/10 * * * * python3 scripts/script.py +``` + +### 2. Error Handling +**Standard error suppression pattern** +```bash +command 2>/dev/null || true +``` +- Suppresses stderr to prevent cron emails +- `|| true` ensures job always exits successfully + +### 3. Time-based Cleanup +**Safe age thresholds for different content types** +- **Work directories**: 6 hours (short-lived, safe for active jobs) +- **Temp files**: 24 hours (allows for long transcodes) +- **Log files**: 7-30 days (depending on importance) + +### 4. Resource-aware Scheduling +**Avoid resource conflicts** +```bash +# System maintenance at low-usage times +0 2 1 * * maintenance_script.sh + +# Cleanup during off-peak hours +0 3 * * * cleanup_script.sh + +# Monitoring with high frequency during active hours +*/10 * * * * monitor_script.py +``` + +## Management Workflow + +### Adding New Cron Jobs + +1. **Backup current crontab** + ```bash + crontab -l > /tmp/crontab_backup_$(date +%Y%m%d) + ``` + +2. **Edit safely** + ```bash + crontab -l > /tmp/new_crontab + echo "# New job description" >> /tmp/new_crontab + echo "schedule command" >> /tmp/new_crontab + crontab /tmp/new_crontab + ``` + +3. **Verify installation** + ```bash + crontab -l + ``` + +### Proper HERE Document (EOF) Usage + +**When building cron files with HERE documents, use proper EOF formatting:** + +#### βœ… **Correct Format** +```bash +cat > /tmp/new_crontab << 'EOF' +0 2 1 * * /home/cal/bin/ssh_key_maintenance.sh +# Tdarr monitoring every 10 minutes +*/10 * * * * python3 /path/to/script.py --args +EOF +``` + +#### ❌ **Common Mistakes** +```bash +# BAD - Causes "EOF not found" errors +cat >> /tmp/crontab << 'EOF' +new_cron_job +EOF + +# Results in malformed file with literal "EOF < /dev/null" lines +``` + +#### **Key Rules for EOF in Cron Files** + +1. **Use `cat >` not `cat >>`** for building complete files + ```bash + # Good - overwrites file cleanly + cat > /tmp/crontab << 'EOF' + + # Bad - appends and can create malformed files + cat >> /tmp/crontab << 'EOF' + ``` + +2. **Quote the EOF delimiter** to prevent variable expansion + ```bash + # Good - literal content + cat > file << 'EOF' + + # Can cause issues with special characters + cat > file << EOF + ``` + +3. **Clean up malformed files** before installing + ```bash + # Remove EOF artifacts and empty lines + head -n -1 /tmp/crontab > /tmp/clean_crontab + + # Or use grep to remove EOF lines + grep -v "^EOF" /tmp/crontab > /tmp/clean_crontab + ``` + +4. 
**Alternative approach - direct echo method** + ```bash + crontab -l > /tmp/current_crontab + echo "# New job comment" >> /tmp/current_crontab + echo "*/10 * * * * /path/to/command" >> /tmp/current_crontab + crontab /tmp/current_crontab + ``` + +#### **Debugging EOF Issues** + +```bash +# Check for EOF artifacts in crontab file +cat -n /tmp/crontab | grep EOF + +# Validate crontab syntax before installing +crontab -T /tmp/crontab # Some systems support this + +# Manual cleanup if needed +sed '/^EOF/d' /tmp/crontab > /tmp/clean_crontab +``` + +### Testing Cron Jobs + +**Test command syntax first** +```bash +# Test the actual command before scheduling +python3 /full/path/to/script.py --test + +# Check file permissions +ls -la /path/to/script + +# Verify paths exist +ls -la /target/directory/ +``` + +**Test with minimal frequency** +```bash +# Start with 5-minute intervals for testing +*/5 * * * * /path/to/new/script.sh + +# Monitor logs +tail -f /var/log/syslog | grep CRON +``` + +### Monitoring Cron Jobs + +**Check cron logs** +```bash +# System cron logs +sudo journalctl -u cron -f + +# User cron logs +grep CRON /var/log/syslog | grep $(whoami) +``` + +**Verify job execution** +```bash +# Check if cleanup actually ran +ls -la /target/cleanup/directory/ + +# Monitor script logs +tail -f /path/to/script/logs/ +``` + +## Security Considerations + +### 1. Path Security +- Use absolute paths to prevent PATH manipulation +- Ensure scripts are owned by correct user +- Set appropriate permissions (750 for scripts) + +### 2. Command Injection Prevention +```bash +# Good - quoted paths +find "/path/with spaces/" -name "pattern" + +# Bad - unquoted paths vulnerable to injection +find /path/with spaces/ -name pattern +``` + +### 3. Resource Limits +- Prevent runaway processes with `timeout` +- Use `ionice` for I/O intensive cleanup jobs +- Consider `nice` for CPU-intensive tasks + +## Troubleshooting + +### Common Issues + +**Job not running** +1. Check cron service: `sudo systemctl status cron` +2. Verify crontab syntax: `crontab -l` +3. Check file permissions and paths +4. Review cron logs for errors + +**Environment differences** +- Cron runs with minimal environment +- Set PATH explicitly if needed +- Use absolute paths for all commands + +**Silent failures** +- Remove `2>/dev/null` temporarily for debugging +- Add logging to scripts +- Check script exit codes + +### Debugging Commands +```bash +# Test cron environment +* * * * * env > /tmp/cron_env.txt + +# Test script in cron-like environment +env -i /bin/bash -c 'your_command_here' + +# Monitor real-time execution +sudo tail -f /var/log/syslog | grep CRON +``` + +## Best Practices + +### 1. Documentation +- Comment all cron jobs with purpose and schedule +- Document in this patterns file +- Include contact info for complex jobs + +### 2. Maintenance +- Regular review of active jobs (quarterly) +- Remove obsolete jobs promptly +- Update absolute paths when moving scripts + +### 3. Monitoring +- Implement health checks for critical jobs +- Use Discord/email notifications for failures +- Monitor disk space usage from cleanup jobs + +### 4. 
Backup Strategy +- Backup crontab before changes +- Version control cron configurations +- Document restoration procedures + +## Future Enhancements + +### Planned Additions +- **Log rotation**: Automated cleanup of application logs +- **Health checks**: System resource monitoring +- **Backup verification**: Automated backup integrity checks +- **Certificate renewal**: SSL/TLS certificate automation + +### Migration Considerations +- **Systemd timers**: Consider migration for complex scheduling +- **Configuration management**: Ansible or similar for multi-host +- **Centralized logging**: Aggregated cron job monitoring + +--- + +## Related Documentation +- [Tdarr Monitoring Script](../scripts/README.md#tdarr_monitorpy---enhanced-tdarr-monitoring) +- [System Maintenance](../reference/system-maintenance.md) +- [Discord Integration](../examples/discord-notifications.md) \ No newline at end of file diff --git a/scripts/monitoring/README.md b/monitoring/scripts/README.md similarity index 100% rename from scripts/monitoring/README.md rename to monitoring/scripts/README.md diff --git a/scripts/monitoring/setup-discord-monitoring.md b/monitoring/scripts/setup-discord-monitoring.md similarity index 100% rename from scripts/monitoring/setup-discord-monitoring.md rename to monitoring/scripts/setup-discord-monitoring.md diff --git a/scripts/monitoring/tdarr-timeout-monitor.sh b/monitoring/scripts/tdarr-timeout-monitor.sh similarity index 100% rename from scripts/monitoring/tdarr-timeout-monitor.sh rename to monitoring/scripts/tdarr-timeout-monitor.sh diff --git a/monitoring/scripts/tdarr_monitor.py b/monitoring/scripts/tdarr_monitor.py new file mode 100755 index 0000000..db936f4 --- /dev/null +++ b/monitoring/scripts/tdarr_monitor.py @@ -0,0 +1,1234 @@ +#!/usr/bin/env python3 +""" +Tdarr API Monitoring Script with Stuck Job Detection and Discord Alerts + +Monitors Tdarr server via its web API endpoints: +- Server status and health +- Queue status and statistics +- Node status and performance +- Library scan progress +- Worker activity +- Stuck job detection with configurable timeouts +- Discord notifications for alerts and status updates + +Usage: + python3 tdarr_monitor.py --server http://10.10.0.43:8265 --check all + python3 tdarr_monitor.py --server http://10.10.0.43:8265 --check queue + python3 tdarr_monitor.py --server http://10.10.0.43:8265 --check nodes + + # Enable stuck job detection (30 minute threshold) + python3 tdarr_monitor.py --server http://10.10.0.43:8265 --check nodes --detect-stuck + + # Custom stuck threshold (15 minutes) + python3 tdarr_monitor.py --server http://10.10.0.43:8265 --check all --detect-stuck --stuck-threshold 15 + + # Enable Discord alerts for stuck jobs (uses default webhook) + python3 tdarr_monitor.py --server http://10.10.0.43:8265 --check nodes --detect-stuck --discord-alerts + + # Automatically clear hung workers when detected + python3 tdarr_monitor.py --server http://10.10.0.43:8265 --check nodes --detect-stuck --clear-hung-workers + + # Full monitoring with automatic clearing and Discord alerts + python3 tdarr_monitor.py --server http://10.10.0.43:8265 --check nodes --detect-stuck --clear-hung-workers --discord-alerts + + # Test Discord integration (uses default webhook) + python3 tdarr_monitor.py --server http://10.10.0.43:8265 --discord-test + + # Enable file logging with custom path and debug level + python3 tdarr_monitor.py --server http://10.10.0.43:8265 --check nodes --log-file /tmp/tdarr_debug.log --log-level DEBUG + + # Disable file logging (console 
only) + python3 tdarr_monitor.py --server http://10.10.0.43:8265 --check nodes --no-log-file + + # Clear memory state + python3 tdarr_monitor.py --server http://10.10.0.43:8265 --clear-memory +""" + +import argparse +import json +import logging +import logging.handlers +import sys +import os +import pickle +from dataclasses import dataclass, asdict +from datetime import datetime, timedelta +from typing import Dict, List, Optional, Any, Union +import requests +from urllib.parse import urljoin + + +@dataclass +class WorkerSnapshot: + worker_id: str + node_id: str + worker_type: str + file: str + percentage: float + status: str + fps: int + eta: str + timestamp: datetime + + +@dataclass +class StuckJob: + worker_snapshot: WorkerSnapshot + first_seen: datetime + stuck_duration_minutes: float + is_stuck: bool = True + + +@dataclass +class MemoryState: + worker_snapshots: Dict[str, WorkerSnapshot] + stuck_jobs: Dict[str, StuckJob] + last_updated: datetime + + +@dataclass +class ServerStatus: + timestamp: str + server_url: str + status: str + error: Optional[str] = None + version: Optional[str] = None + server_id: Optional[str] = None + uptime: Optional[str] = None + system_info: Optional[Dict[str, Any]] = None + + +@dataclass +class QueueStats: + total_files: int + queued: int + processing: int + completed: int + queue_items: List[Dict[str, Any]] + + +@dataclass +class QueueStatus: + timestamp: str + queue_stats: Optional[QueueStats] = None + error: Optional[str] = None + + +@dataclass +class NodeInfo: + id: Optional[str] + nodeName: Optional[str] + status: str + lastSeen: Optional[int] + version: Optional[str] + platform: Optional[str] + workers: Dict[str, int] + processing: List[Dict[str, Any]] + + +@dataclass +class NodeSummary: + total_nodes: int + online_nodes: int + offline_nodes: int + online_details: List[NodeInfo] + offline_details: List[NodeInfo] + + +@dataclass +class NodeStatus: + timestamp: str + nodes: List[Dict[str, Any]] + node_summary: Optional[NodeSummary] = None + stuck_jobs: List[StuckJob] = None + error: Optional[str] = None + + +@dataclass +class LibraryInfo: + name: Optional[str] + path: Optional[str] + file_count: int + scan_progress: int + last_scan: Optional[str] + is_scanning: bool + + +@dataclass +class ScanStatus: + total_libraries: int + total_files: int + scanning_libraries: int + + +@dataclass +class LibraryStatus: + timestamp: str + libraries: List[LibraryInfo] + scan_status: Optional[ScanStatus] = None + error: Optional[str] = None + + +@dataclass +class Statistics: + total_transcodes: int + space_saved: int + total_files_processed: int + failed_transcodes: int + processing_speed: int + eta: Optional[str] + + +@dataclass +class StatisticsStatus: + timestamp: str + statistics: Optional[Statistics] = None + error: Optional[str] = None + + +@dataclass +class HealthCheck: + status: str + healthy: bool + online_count: Optional[int] = None + total_count: Optional[int] = None + accessible: Optional[bool] = None + total_items: Optional[int] = None + + +@dataclass +class HealthStatus: + timestamp: str + overall_status: str + checks: Dict[str, HealthCheck] + + +@dataclass +class DiscordEmbedField: + name: str + value: str + inline: bool = False + + +@dataclass +class DiscordEmbed: + title: str + description: str + color: int + fields: List[DiscordEmbedField] = None + timestamp: str = None + + def __post_init__(self): + if self.fields is None: + self.fields = [] + if self.timestamp is None: + self.timestamp = datetime.utcnow().isoformat() + + +class DiscordNotifier: + def 
__init__(self, webhook_url: str, timeout: int = 10): + """Initialize Discord notifier with webhook URL.""" + self.webhook_url = webhook_url + self.timeout = timeout + self.session = requests.Session() + self.logger = logging.getLogger(f"{__name__}.DiscordNotifier") + + def send_content_message(self, content: str, username: str = "Tdarr Monitor") -> bool: + """Send a simple content message to Discord. + + Args: + content: The message content to send + username: Bot username to display + + Returns: + True if successful, False otherwise + """ + payload = { + "content": content, + "username": username + } + + return self._send_webhook(payload) + + def send_embed_message(self, + title: str, + description: str, + color: int = 0xff6b6b, # Red by default + fields: List[DiscordEmbedField] = None, + username: str = "Tdarr Monitor") -> bool: + """Send an embed message to Discord. + + Args: + title: Embed title + description: Embed description + color: Embed color (hex integer, default red) + fields: List of embed fields + username: Bot username to display + + Returns: + True if successful, False otherwise + """ + embed = DiscordEmbed( + title=title, + description=description, + color=color, + fields=fields or [] + ) + + payload = { + "username": username, + "embeds": [asdict(embed)] + } + + return self._send_webhook(payload) + + def send_stuck_job_alert(self, stuck_jobs: List[StuckJob]) -> bool: + """Send alert for stuck jobs using embed format. + + Args: + stuck_jobs: List of stuck jobs to report + + Returns: + True if successful, False otherwise + """ + if not stuck_jobs: + return True + + # Create embed fields for each stuck job + fields = [] + for i, stuck_job in enumerate(stuck_jobs[:10]): # Limit to 10 jobs (Discord embed field limit is 25) + ws = stuck_job.worker_snapshot + field_value = ( + f"**File:** {os.path.basename(ws.file)}\n" + f"**Progress:** {ws.percentage}%\n" + f"**Status:** {ws.status}\n" + f"**Duration:** {stuck_job.stuck_duration_minutes:.1f} minutes\n" + f"**Node:** {ws.node_id}" + ) + + fields.append(DiscordEmbedField( + name=f"🚨 Stuck Job {i+1}: {ws.worker_id}", + value=field_value, + inline=True + )) + + # Add summary field if there are more jobs + if len(stuck_jobs) > 10: + fields.append(DiscordEmbedField( + name="Additional Jobs", + value=f"... and {len(stuck_jobs) - 10} more stuck jobs", + inline=False + )) + + title = f"🚨 Tdarr Stuck Jobs Detected ({len(stuck_jobs)})" + description = ( + f"Detected {len(stuck_jobs)} stuck job{'s' if len(stuck_jobs) != 1 else ''} " + f"in your Tdarr system. These jobs may need manual intervention." + ) + + return self.send_embed_message( + title=title, + description=description, + color=0xff6b6b, # Red color for alerts + fields=fields + ) + + def send_system_status(self, + server_status: ServerStatus, + node_status: NodeStatus, + stuck_jobs: List[StuckJob] = None) -> bool: + """Send system status summary using embed format. 
+ + Args: + server_status: Server status information + node_status: Node status information + stuck_jobs: Optional stuck jobs list + + Returns: + True if successful, False otherwise + """ + # Determine overall health color + is_healthy = ( + server_status.status == "good" and + not server_status.error and + not node_status.error and + (not stuck_jobs or len(stuck_jobs) == 0) + ) + + color = 0x28a745 if is_healthy else 0xff6b6b # Green if healthy, red if not + + # Build description + description_parts = [ + f"**Server Status:** {server_status.status.title()}", + f"**Version:** {getattr(server_status, 'version', 'Unknown')}" + ] + + if node_status.node_summary: + description_parts.extend([ + f"**Total Nodes:** {node_status.node_summary.total_nodes}", + f"**Online Nodes:** {node_status.node_summary.online_nodes}", + f"**Offline Nodes:** {node_status.node_summary.offline_nodes}" + ]) + + if stuck_jobs: + description_parts.append(f"**Stuck Jobs:** {len(stuck_jobs)}") + + # Add node details as fields + fields = [] + if node_status.node_summary and node_status.node_summary.online_details: + for node in node_status.node_summary.online_details: + active_workers = len(node.processing) if node.processing else 0 + field_value = ( + f"**Status:** Online\n" + f"**Platform:** {node.platform or 'Unknown'}\n" + f"**Active Workers:** {active_workers}\n" + f"**CPU Workers:** {node.workers.get('cpu', 0)}\n" + f"**GPU Workers:** {node.workers.get('gpu', 0)}" + ) + + fields.append(DiscordEmbedField( + name=f"πŸ“‘ {node.nodeName or node.id}", + value=field_value, + inline=True + )) + + title = "πŸ“Š Tdarr System Status" + if not is_healthy: + title = "⚠️ Tdarr System Alert" + + return self.send_embed_message( + title=title, + description="\n".join(description_parts), + color=color, + fields=fields + ) + + def _send_webhook(self, payload: Dict[str, Any]) -> bool: + """Send payload to Discord webhook. 
+ + Args: + payload: JSON payload to send + + Returns: + True if successful, False otherwise + """ + try: + response = self.session.post( + self.webhook_url, + json=payload, + timeout=self.timeout + ) + response.raise_for_status() + self.logger.info("Discord notification sent successfully") + return True + + except requests.exceptions.RequestException as e: + self.logger.error(f"Failed to send Discord notification: {e}") + return False + except Exception as e: + self.logger.error(f"Unexpected error sending Discord notification: {e}") + return False + + +class StuckJobDetector: + def __init__(self, memory_file: str = ".claude/tmp/tdarr_memory.pkl", stuck_threshold_minutes: int = 30): + """Initialize stuck job detector with memory persistence.""" + self.memory_file = os.path.abspath(memory_file) # Use absolute path + self.stuck_threshold_minutes = stuck_threshold_minutes + self.logger = logging.getLogger(f"{__name__}.StuckJobDetector") + self.logger.debug(f"Using memory file: {self.memory_file}") + + # Ensure memory directory exists before loading/saving state + # (dirname of the absolute path is never empty, unlike the raw argument) + os.makedirs(os.path.dirname(self.memory_file), exist_ok=True) + self.memory_state = self._load_memory_state() + + def _load_memory_state(self) -> MemoryState: + """Load memory state from disk or create new one.""" + if os.path.exists(self.memory_file): + try: + with open(self.memory_file, 'rb') as f: + memory_state = pickle.load(f) + self.logger.debug(f"Loaded memory state: {len(memory_state.worker_snapshots)} workers, {len(memory_state.stuck_jobs)} stuck jobs") + return memory_state + except Exception as e: + self.logger.warning(f"Failed to load memory state: {e}, creating new state") + else: + self.logger.debug(f"Memory file {self.memory_file} does not exist, creating new state") + + return MemoryState( + worker_snapshots={}, + stuck_jobs={}, + last_updated=datetime.now() + ) + + def _save_memory_state(self): + """Save memory state to disk.""" + try: + with open(self.memory_file, 'wb') as f: + pickle.dump(self.memory_state, f) + except Exception as e: + self.logger.error(f"Failed to save memory state: {e}") + + def _create_worker_key(self, node_id: str, worker_id: str) -> str: + """Create unique key for worker identification.""" + return f"{node_id}:{worker_id}" + + def _is_worker_stuck(self, current: WorkerSnapshot, previous: WorkerSnapshot) -> bool: + """Check if worker is stuck based on comparison with previous snapshot.""" + worker_key = f"{current.node_id}:{current.worker_id}" + + # Check each condition individually for detailed logging + file_same = current.file == previous.file + percentage_same = current.percentage == previous.percentage + status_same = current.status == previous.status + fps_same = current.fps == previous.fps + eta_same = current.eta == previous.eta + + is_stuck = file_same and percentage_same and status_same and fps_same and eta_same + + # Log detailed comparison info + self.logger.debug(f"Worker {worker_key} stuck check:") + self.logger.debug(f" File: '{current.file}' == '{previous.file}' = {file_same}") + self.logger.debug(f" Percentage: {current.percentage}% == {previous.percentage}% = {percentage_same}") + self.logger.debug(f" Status: '{current.status}' == '{previous.status}' = {status_same}") + self.logger.debug(f" FPS: {current.fps} == {previous.fps} = {fps_same}") + self.logger.debug(f" ETA: '{current.eta}' == '{previous.eta}' = {eta_same}") + self.logger.debug(f" β†’ Result: {'STUCK' if is_stuck else 'NOT STUCK'}") + + # Log INFO level when we detect changes (worker making progress) + if not is_stuck: + if not percentage_same: + 
self.logger.info(f"Worker {worker_key} making progress: {previous.percentage}% β†’ {current.percentage}%") + elif not status_same: + self.logger.info(f"Worker {worker_key} status changed: '{previous.status}' β†’ '{current.status}'") + elif not file_same: + self.logger.info(f"Worker {worker_key} file changed: '{previous.file}' β†’ '{current.file}'") + + return is_stuck + + def update_workers(self, nodes_data: Dict[str, Any]) -> List[StuckJob]: + """Update worker snapshots and detect stuck jobs.""" + current_time = datetime.now() + current_workers = {} + detected_stuck_jobs = [] + + # Extract current worker states from nodes data + for node_id, node_data in nodes_data.items(): + workers = node_data.get('workers', {}) + for worker_id, worker_data in workers.items(): + worker_key = self._create_worker_key(node_id, worker_id) + + # Create current snapshot + current_snapshot = WorkerSnapshot( + worker_id=worker_id, + node_id=node_id, + worker_type=worker_data.get('workerType', 'unknown'), + file=worker_data.get('file', ''), + percentage=worker_data.get('percentage', -1), + status=worker_data.get('status', ''), + fps=worker_data.get('fps', 0), + eta=worker_data.get('ETA', ''), + timestamp=current_time + ) + + current_workers[worker_key] = current_snapshot + + # Log all workers being tracked + self.logger.debug(f"Tracking worker {worker_key}: {current_snapshot.status} at {current_snapshot.percentage}% on '{current_snapshot.file}'") + + # Check if worker was previously tracked + if worker_key in self.memory_state.worker_snapshots: + previous_snapshot = self.memory_state.worker_snapshots[worker_key] + + # Check if worker is stuck + if self._is_worker_stuck(current_snapshot, previous_snapshot): + # Calculate how long it's been stuck + time_since_previous = (current_time - previous_snapshot.timestamp).total_seconds() / 60 + self.logger.debug(f"Worker {worker_key} has been stuck for {time_since_previous:.1f} minutes since last check") + self.logger.debug(f"Worker {worker_key} checking stuck_jobs dict: {list(self.memory_state.stuck_jobs.keys())}") + + if worker_key in self.memory_state.stuck_jobs: + # Already known stuck job, update duration + stuck_job = self.memory_state.stuck_jobs[worker_key] + stuck_duration = current_time - stuck_job.first_seen + stuck_job.stuck_duration_minutes = stuck_duration.total_seconds() / 60 + stuck_job.worker_snapshot = current_snapshot + + self.logger.debug(f"Worker {worker_key} known stuck job - duration: {stuck_job.stuck_duration_minutes:.1f} min, threshold: {self.stuck_threshold_minutes} min") + if stuck_job.stuck_duration_minutes >= self.stuck_threshold_minutes: + self.logger.debug(f"Worker {worker_key} EXCEEDS threshold - adding to detected stuck jobs") + detected_stuck_jobs.append(stuck_job) + else: + self.logger.debug(f"Worker {worker_key} below threshold - not flagging yet") + else: + # New stuck job detected - add to memory immediately to start tracking + first_seen = previous_snapshot.timestamp + stuck_duration = current_time - first_seen + stuck_duration_minutes = stuck_duration.total_seconds() / 60 + + self.logger.debug(f"Worker {worker_key} NEW stuck job - first_seen: {first_seen}, current: {current_time}") + self.logger.debug(f"Worker {worker_key} NEW stuck job - duration: {stuck_duration_minutes:.1f} min, threshold: {self.stuck_threshold_minutes} min") + + # Create stuck job entry immediately to track duration across runs + stuck_job = StuckJob( + worker_snapshot=current_snapshot, + first_seen=first_seen, + stuck_duration_minutes=stuck_duration_minutes, + 
is_stuck=True + ) + self.memory_state.stuck_jobs[worker_key] = stuck_job + + if stuck_duration_minutes >= self.stuck_threshold_minutes: + self.logger.debug(f"Worker {worker_key} NEW stuck job EXCEEDS threshold - flagging for clearing") + detected_stuck_jobs.append(stuck_job) + else: + self.logger.debug(f"Worker {worker_key} NEW stuck job below threshold - tracking in memory") + else: + # Worker is not stuck, remove from stuck jobs if present + if worker_key in self.memory_state.stuck_jobs: + del self.memory_state.stuck_jobs[worker_key] + self.logger.info(f"Worker {worker_key} is no longer stuck") + else: + # New worker, start tracking it + self.logger.info(f"New worker detected: {worker_key} - {current_snapshot.status} at {current_snapshot.percentage}% on '{current_snapshot.file}'") + + # Clean up stuck jobs for workers that no longer exist + stuck_jobs_to_remove = [] + for worker_key in self.memory_state.stuck_jobs: + if worker_key not in current_workers: + stuck_jobs_to_remove.append(worker_key) + + for worker_key in stuck_jobs_to_remove: + del self.memory_state.stuck_jobs[worker_key] + self.logger.info(f"Removed stuck job tracking for missing worker: {worker_key}") + + # Update memory state + self.memory_state.worker_snapshots = current_workers + self.memory_state.last_updated = current_time + + # Save to disk + self._save_memory_state() + + return detected_stuck_jobs + + def get_stuck_jobs(self) -> List[StuckJob]: + """Get current list of stuck jobs.""" + return list(self.memory_state.stuck_jobs.values()) + + def clear_memory(self): + """Clear all memory state.""" + self.memory_state = MemoryState( + worker_snapshots={}, + stuck_jobs={}, + last_updated=datetime.now() + ) + self._save_memory_state() + self.logger.info("Memory state cleared") + + +class TdarrMonitor: + def __init__(self, server_url: str, timeout: int = 30, enable_stuck_detection: bool = False, + stuck_threshold_minutes: int = 30, memory_file: str = ".claude/tmp/tdarr_memory.pkl", + discord_webhook_url: str = None, enable_discord_alerts: bool = False, + log_file: Optional[str] = None, log_level: str = "INFO", clear_hung_workers: bool = False): + """Initialize Tdarr monitor with server URL.""" + self.server_url = server_url.rstrip('/') + self.timeout = timeout + self.session = requests.Session() + self.enable_stuck_detection = enable_stuck_detection + self.enable_discord_alerts = enable_discord_alerts + self.clear_hung_workers_enabled = clear_hung_workers + + # Configure logging first + self._setup_logging(log_file, log_level) + self.logger = logging.getLogger(__name__) + + # Initialize stuck job detector if enabled + self.stuck_detector = None + if enable_stuck_detection: + self.stuck_detector = StuckJobDetector(memory_file, stuck_threshold_minutes) + + # Initialize Discord notifier if enabled + self.discord_notifier = None + if enable_discord_alerts: + if discord_webhook_url: + self.discord_notifier = DiscordNotifier(discord_webhook_url) + else: + self.logger.warning("Discord alerts enabled but no webhook URL provided") + + def _setup_logging(self, log_file: Optional[str] = None, log_level: str = "INFO"): + """Configure logging with optional file rotation.""" + # Clear any existing handlers + root_logger = logging.getLogger() + root_logger.handlers.clear() + + # Set log level + level = getattr(logging, log_level.upper(), logging.INFO) + root_logger.setLevel(level) + + # Create formatter + formatter = logging.Formatter( + '%(asctime)s - %(name)s - %(levelname)s - %(message)s', + datefmt='%Y-%m-%d %H:%M:%S' + ) + + # 
Console handler (for interactive use) + console_handler = logging.StreamHandler(sys.stdout) + console_handler.setFormatter(formatter) + root_logger.addHandler(console_handler) + + # File handler with rotation (if log_file specified) + if log_file: + # Ensure log directory exists + log_dir = os.path.dirname(log_file) + if log_dir: + os.makedirs(log_dir, exist_ok=True) + + # Rotating file handler: 10MB max, keep 5 backup files + file_handler = logging.handlers.RotatingFileHandler( + log_file, + maxBytes=10 * 1024 * 1024, # 10MB + backupCount=5, + encoding='utf-8' + ) + file_handler.setFormatter(formatter) + root_logger.addHandler(file_handler) + + def _make_request(self, endpoint: str) -> Optional[Dict[str, Any]]: + """Make HTTP request to Tdarr API endpoint.""" + url = urljoin(self.server_url, endpoint) + + try: + response = self.session.get(url, timeout=self.timeout) + response.raise_for_status() + return response.json() + + except requests.exceptions.RequestException as e: + self.logger.error(f"Request failed for {url}: {e}") + return None + except json.JSONDecodeError as e: + self.logger.error(f"JSON decode failed for {url}: {e}") + return None + + def clear_hung_workers(self, stuck_jobs: Optional[List[StuckJob]] = None) -> bool: + """Clear hung workers via Tdarr API using kill-worker endpoint. + + Args: + stuck_jobs: List of StuckJob objects to clear. Each contains worker and node information. + + Returns: + True if all workers cleared successfully, False otherwise + """ + if not stuck_jobs: + self.logger.info("No stuck jobs provided for clearing hung workers") + return True + + success_count = 0 + total_count = len(stuck_jobs) + + for stuck_job in stuck_jobs: + worker_snapshot = stuck_job.worker_snapshot + try: + # Use the kill-worker endpoint with correct payload format + endpoint = '/api/v2/kill-worker' + payload = { + "data": { + "nodeID": worker_snapshot.node_id, + "workerID": worker_snapshot.worker_id + } + } + + url = urljoin(self.server_url, endpoint) + response = self.session.post(url, json=payload, timeout=self.timeout) + response.raise_for_status() + + self.logger.info(f"Successfully killed hung worker: {worker_snapshot.node_id}:{worker_snapshot.worker_id}") + success_count += 1 + + except requests.exceptions.RequestException as e: + self.logger.error(f"Failed to kill worker {worker_snapshot.node_id}:{worker_snapshot.worker_id}: {e}") + except Exception as e: + self.logger.error(f"Unexpected error killing worker {worker_snapshot.node_id}:{worker_snapshot.worker_id}: {e}") + + self.logger.info(f"Cleared {success_count}/{total_count} hung workers") + return success_count == total_count + + def get_server_status(self) -> ServerStatus: + """Get overall server status and configuration.""" + timestamp = datetime.now().isoformat() + + # Try to get server info from API + data = self._make_request('/api/v2/status') + if data: + server_status = data.get('status', 'unknown') + self.logger.info(f"Server check completed: status={server_status}, version={data.get('version', 'unknown')}") + return ServerStatus( + timestamp=timestamp, + server_url=self.server_url, + status=server_status, + version=data.get('version'), + uptime=data.get('uptime') + ) + else: + self.logger.error("Server check failed: Unable to connect to Tdarr server") + return ServerStatus( + timestamp=timestamp, + server_url=self.server_url, + status='offline', + error='Unable to connect to Tdarr server' + ) + + def get_queue_status(self) -> QueueStatus: + """Get transcoding queue status and statistics.""" + timestamp = 
datetime.now().isoformat() + + # Get queue information + data = self._make_request('/api/v2/get-queue') + if data: + queue_data = data.get('queue', []) + + # Calculate queue statistics + total_files = len(queue_data) + queued_files = len([f for f in queue_data if f.get('status') == 'Queued']) + processing_files = len([f for f in queue_data if f.get('status') == 'Processing']) + completed_files = len([f for f in queue_data if f.get('status') == 'Completed']) + + queue_stats = QueueStats( + total_files=total_files, + queued=queued_files, + processing=processing_files, + completed=completed_files, + queue_items=queue_data[:10] # First 10 items for details + ) + + return QueueStatus( + timestamp=timestamp, + queue_stats=queue_stats + ) + else: + return QueueStatus( + timestamp=timestamp, + error='Unable to fetch queue data' + ) + + def get_node_status(self) -> NodeStatus: + """Get status of all connected nodes.""" + timestamp = datetime.now().isoformat() + + # Get nodes information (using correct endpoint) + data = self._make_request('/api/v2/get-nodes') + if data: + # Handle the actual data structure returned by Tdarr API + nodes_dict = data if isinstance(data, dict) else {} + nodes = [] + + # Process node information + online_nodes = [] + offline_nodes = [] + + for node_id, node_data in nodes_dict.items(): + node_info = NodeInfo( + id=node_id, + nodeName=node_data.get('nodeName'), + status='online', # Assume online if in response + lastSeen=None, + version=node_data.get('config', {}).get('version'), + platform=node_data.get('config', {}).get('platform_arch_isdocker'), + workers={ + 'cpu': len([w for w in node_data.get('workers', {}).values() if 'cpu' in w.get('workerType', '').lower()]), + 'gpu': len([w for w in node_data.get('workers', {}).values() if 'gpu' in w.get('workerType', '').lower()]) + }, + processing=list(node_data.get('workers', {}).values()) + ) + + online_nodes.append(node_info) + nodes.append(node_data) + + # Check for stuck jobs if detection is enabled + stuck_jobs = [] + if self.stuck_detector: + try: + stuck_jobs = self.stuck_detector.update_workers(nodes_dict) + if stuck_jobs: + self.logger.warning(f"Detected {len(stuck_jobs)} stuck jobs") + for stuck_job in stuck_jobs: + self.logger.warning( + f"Stuck job: {stuck_job.worker_snapshot.node_id}:{stuck_job.worker_snapshot.worker_id} " + f"on file '{stuck_job.worker_snapshot.file}' " + f"at {stuck_job.worker_snapshot.percentage}% for {stuck_job.stuck_duration_minutes:.1f} minutes" + ) + + # Clear hung workers if enabled + if self.clear_hung_workers_enabled: + try: + clear_success = self.clear_hung_workers(stuck_jobs) + if clear_success: + self.logger.info(f"Successfully cleared {len(stuck_jobs)} hung workers") + else: + self.logger.warning("Some hung workers could not be cleared") + except Exception as e: + self.logger.error(f"Error clearing hung workers: {e}") + + # Send Discord notification for stuck jobs + if self.discord_notifier: + try: + self.discord_notifier.send_stuck_job_alert(stuck_jobs) + except Exception as e: + self.logger.error(f"Failed to send Discord stuck job alert: {e}") + + except Exception as e: + self.logger.error(f"Error in stuck job detection: {e}") + + node_summary = NodeSummary( + total_nodes=len(nodes), + online_nodes=len(online_nodes), + offline_nodes=len(offline_nodes), + online_details=online_nodes, + offline_details=offline_nodes + ) + + # Log successful node check with summary + if stuck_jobs: + self.logger.info(f"Node check completed: {len(nodes)} nodes online, {len(stuck_jobs)} stuck jobs 
detected") + else: + self.logger.info(f"Node check completed: {len(nodes)} nodes online, no stuck jobs detected") + + return NodeStatus( + timestamp=timestamp, + nodes=nodes, + node_summary=node_summary, + stuck_jobs=stuck_jobs + ) + else: + self.logger.error("Node check failed: Unable to fetch node data") + return NodeStatus( + timestamp=timestamp, + nodes=[], + error='Unable to fetch node data' + ) + + def get_library_status(self) -> LibraryStatus: + """Get library scan status and file statistics.""" + timestamp = datetime.now().isoformat() + + # Get library information + data = self._make_request('/api/v2/get-libraries') + if data: + libraries = data.get('libraries', []) + + library_stats = [] + total_files = 0 + + for lib in libraries: + lib_info = LibraryInfo( + name=lib.get('name'), + path=lib.get('path'), + file_count=lib.get('totalFiles', 0), + scan_progress=lib.get('scanProgress', 0), + last_scan=lib.get('lastScan'), + is_scanning=lib.get('isScanning', False) + ) + library_stats.append(lib_info) + total_files += lib_info.file_count + + scan_status = ScanStatus( + total_libraries=len(libraries), + total_files=total_files, + scanning_libraries=len([l for l in library_stats if l.is_scanning]) + ) + + return LibraryStatus( + timestamp=timestamp, + libraries=library_stats, + scan_status=scan_status + ) + else: + return LibraryStatus( + timestamp=timestamp, + libraries=[], + error='Unable to fetch library data' + ) + + def get_statistics(self) -> StatisticsStatus: + """Get overall Tdarr statistics and health metrics.""" + timestamp = datetime.now().isoformat() + + # Get statistics + data = self._make_request('/api/v2/get-stats') + if data: + stats = data.get('stats', {}) + statistics = Statistics( + total_transcodes=stats.get('totalTranscodes', 0), + space_saved=stats.get('spaceSaved', 0), + total_files_processed=stats.get('totalFilesProcessed', 0), + failed_transcodes=stats.get('failedTranscodes', 0), + processing_speed=stats.get('processingSpeed', 0), + eta=stats.get('eta') + ) + + return StatisticsStatus( + timestamp=timestamp, + statistics=statistics + ) + else: + return StatisticsStatus( + timestamp=timestamp, + error='Unable to fetch statistics' + ) + + def health_check(self) -> HealthStatus: + """Perform comprehensive health check.""" + timestamp = datetime.now().isoformat() + + # Server connectivity + server_status = self.get_server_status() + server_check = HealthCheck( + status=server_status.status, + healthy=server_status.status == 'good' + ) + + # Node connectivity + node_status = self.get_node_status() + nodes_healthy = ( + node_status.node_summary.online_nodes > 0 if node_status.node_summary else False + ) and not node_status.error + + nodes_check = HealthCheck( + status='online' if nodes_healthy else 'offline', + healthy=nodes_healthy, + online_count=node_status.node_summary.online_nodes if node_status.node_summary else 0, + total_count=node_status.node_summary.total_nodes if node_status.node_summary else 0 + ) + + # Queue status + queue_status = self.get_queue_status() + queue_healthy = not queue_status.error + queue_check = HealthCheck( + status='accessible' if queue_healthy else 'error', + healthy=queue_healthy, + accessible=queue_healthy, + total_items=queue_status.queue_stats.total_files if queue_status.queue_stats else 0 + ) + + checks = { + 'server': server_check, + 'nodes': nodes_check, + 'queue': queue_check + } + + # Determine overall health + all_checks_healthy = all(check.healthy for check in checks.values()) + overall_status = 'healthy' if all_checks_healthy 
else 'unhealthy'
+
+        return HealthStatus(
+            timestamp=timestamp,
+            overall_status=overall_status,
+            checks=checks
+        )
+
+
+def main():
+    parser = argparse.ArgumentParser(description='Monitor Tdarr server via API')
+    parser.add_argument('--server', required=True, help='Tdarr server URL (e.g., http://10.10.0.43:8265)')
+    parser.add_argument('--check', choices=['all', 'status', 'queue', 'nodes', 'libraries', 'stats', 'health'],
+                        default='health', help='Type of check to perform')
+    parser.add_argument('--timeout', type=int, default=30, help='Request timeout in seconds')
+    parser.add_argument('--output', choices=['json', 'pretty'], default='pretty', help='Output format')
+    parser.add_argument('--verbose', action='store_true', help='Enable verbose logging')
+    parser.add_argument('--detect-stuck', action='store_true', help='Enable stuck job detection')
+    parser.add_argument('--stuck-threshold', type=int, default=30, help='Minutes before job is considered stuck (default: 30)')
+    parser.add_argument('--memory-file', default='.claude/tmp/tdarr_memory.pkl', help='Path to memory state file')
+    parser.add_argument('--clear-memory', action='store_true', help='Clear memory state and exit')
+    # Read the webhook from the environment rather than hard-coding it: a webhook
+    # URL embeds a secret token and should not be committed to source control.
+    parser.add_argument('--discord-webhook',
+                        default=os.environ.get('DISCORD_WEBHOOK_URL'),
+                        help='Discord webhook URL for notifications (default: DISCORD_WEBHOOK_URL env var)')
+    parser.add_argument('--discord-alerts', action='store_true', help='Enable Discord alerts for stuck jobs and system status')
+    parser.add_argument('--discord-test', action='store_true', help='Send test Discord message and exit')
+    parser.add_argument('--log-file', default='./scripts/logs/tdarr_monitor.log',
+                        help='Path to log file with rotation (default: ./scripts/logs/tdarr_monitor.log)')
+    parser.add_argument('--log-level', choices=['DEBUG', 'INFO', 'WARNING', 'ERROR'], default='INFO',
+                        help='Logging level (default: INFO)')
+    parser.add_argument('--no-log-file', action='store_true', help='Disable file logging, console only')
+    parser.add_argument('--clear-hung-workers', action='store_true', help='Clear hung workers via API call when stuck jobs are detected')
+
+    args = parser.parse_args()
+
+    if args.verbose:
+        logging.getLogger().setLevel(logging.DEBUG)
+
+    # Handle clear memory command
+    if args.clear_memory:
+        if os.path.exists(args.memory_file):
+            os.remove(args.memory_file)
+            print(f"Memory state cleared: {args.memory_file}")
+        else:
+            print(f"Memory file does not exist: {args.memory_file}")
+        sys.exit(0)
+
+    # Handle Discord test command
+    if args.discord_test:
+        if not args.discord_webhook:
+            print("❌ No webhook configured. Set DISCORD_WEBHOOK_URL or pass --discord-webhook.")
+            sys.exit(1)
+        print("Sending Discord test messages...")
+        notifier = DiscordNotifier(args.discord_webhook)
+
+        # Test content message
+        content_success = notifier.send_content_message(
+            "πŸ§ͺ **Tdarr Monitor Test** - Content message working correctly!"
+        )
+
+        # Test embed message
+        test_fields = [
+            DiscordEmbedField("Test Field 1", "This is a test value", True),
+            DiscordEmbedField("Test Field 2", "Another test value", True),
+        ]
+
+        embed_success = notifier.send_embed_message(
+            title="πŸ§ͺ Tdarr Monitor Test",
+            description="This is a test embed message to verify Discord integration is working correctly.",
+            color=0x00ff00,  # Green
+            fields=test_fields
+        )
+
+        if content_success and embed_success:
+            print("βœ… Discord test successful! Both content and embed messages sent.")
+            sys.exit(0)
+        else:
+            print("❌ Discord test failed. 
Check webhook URL and permissions.") + sys.exit(1) + + # Initialize monitor + log_file = None if args.no_log_file else args.log_file + monitor = TdarrMonitor( + args.server, + args.timeout, + enable_stuck_detection=args.detect_stuck, + stuck_threshold_minutes=args.stuck_threshold, + memory_file=args.memory_file, + discord_webhook_url=args.discord_webhook, + enable_discord_alerts=args.discord_alerts, + log_file=log_file, + log_level=args.log_level, + clear_hung_workers=args.clear_hung_workers + ) + + # Perform requested check + monitor.logger.info(f"Starting Tdarr monitoring check: {args.check}, stuck_detection={'enabled' if args.detect_stuck else 'disabled'}, clear_workers={'enabled' if args.clear_hung_workers else 'disabled'}") + + result = None + if args.check == 'all': + result = { + 'server_status': monitor.get_server_status(), + 'queue_status': monitor.get_queue_status(), + 'node_status': monitor.get_node_status(), + 'library_status': monitor.get_library_status(), + 'statistics': monitor.get_statistics() + } + elif args.check == 'status': + result = monitor.get_server_status() + elif args.check == 'queue': + result = monitor.get_queue_status() + elif args.check == 'nodes': + result = monitor.get_node_status() + elif args.check == 'libraries': + result = monitor.get_library_status() + elif args.check == 'stats': + result = monitor.get_statistics() + elif args.check == 'health': + result = monitor.health_check() + + # Output results + if args.output == 'json': + # Convert dataclasses to dictionaries for JSON serialization + if args.check == 'all': + json_result = {} + for key, value in result.items(): + json_result[key] = asdict(value) + print(json.dumps(json_result, indent=2)) + else: + print(json.dumps(asdict(result), indent=2)) + else: + # Pretty print format + print(f"=== Tdarr Monitor Results - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} ===") + + if args.check == 'health' or (hasattr(result, 'overall_status') and result.overall_status): + health = result if hasattr(result, 'overall_status') else None + if health: + status = health.overall_status + print(f"Overall Status: {status.upper()}") + + if health.checks: + print("\nHealth Checks:") + for check_name, check_data in health.checks.items(): + status_icon = "βœ“" if check_data.healthy else "βœ—" + print(f" {status_icon} {check_name.title()}: {asdict(check_data)}") + + # Display stuck jobs if present + if args.detect_stuck: + if hasattr(result, 'stuck_jobs') and result.stuck_jobs: + print(f"\n=== STUCK JOBS DETECTED ({len(result.stuck_jobs)}) ===") + for stuck_job in result.stuck_jobs: + print(f"🚨 {stuck_job.worker_snapshot.node_id}:{stuck_job.worker_snapshot.worker_id}") + print(f" File: {stuck_job.worker_snapshot.file}") + print(f" Progress: {stuck_job.worker_snapshot.percentage}%") + print(f" Status: {stuck_job.worker_snapshot.status}") + print(f" Stuck for: {stuck_job.stuck_duration_minutes:.1f} minutes") + print() + elif args.check in ['nodes', 'all']: + # Check all results for stuck jobs if 'all' is selected + stuck_found = False + if args.check == 'all' and isinstance(result, dict): + for section, data in result.items(): + if hasattr(data, 'stuck_jobs') and data.stuck_jobs: + if not stuck_found: + print(f"\n=== STUCK JOBS DETECTED ===") + stuck_found = True + for stuck_job in data.stuck_jobs: + print(f"🚨 {stuck_job.worker_snapshot.node_id}:{stuck_job.worker_snapshot.worker_id}") + print(f" File: {stuck_job.worker_snapshot.file}") + print(f" Progress: {stuck_job.worker_snapshot.percentage}%") + print(f" Status: 
{stuck_job.worker_snapshot.status}") + print(f" Stuck for: {stuck_job.stuck_duration_minutes:.1f} minutes") + print() + + if not stuck_found: + print(f"\nβœ… No stuck jobs detected (threshold: {args.stuck_threshold} minutes)") + + if args.check == 'all': + for section, data in result.items(): + print(f"\n=== {section.replace('_', ' ').title()} ===") + # Don't print stuck_jobs in JSON format as we already displayed them above + if hasattr(data, 'stuck_jobs'): + data_dict = asdict(data) + data_dict.pop('stuck_jobs', None) + print(json.dumps(data_dict, indent=2)) + else: + print(json.dumps(asdict(data), indent=2)) + elif args.check != 'health': + # Don't print stuck_jobs in JSON format as we already displayed them above + if hasattr(result, 'stuck_jobs'): + result_dict = asdict(result) + result_dict.pop('stuck_jobs', None) + print(json.dumps(result_dict, indent=2)) + else: + print(json.dumps(asdict(result), indent=2)) + + # Exit with appropriate code + if result: + # Check for unhealthy status in health check + if isinstance(result, HealthStatus) and result.overall_status == 'unhealthy': + sys.exit(1) + # Check for errors in individual status objects (all status classes except HealthStatus have error attribute) + elif (isinstance(result, (ServerStatus, QueueStatus, NodeStatus, LibraryStatus, StatisticsStatus)) + and result.error): + sys.exit(1) + # Check for errors in 'all' results + elif isinstance(result, dict): + for status_obj in result.values(): + if (isinstance(status_obj, (ServerStatus, QueueStatus, NodeStatus, LibraryStatus, StatisticsStatus)) + and status_obj.error): + sys.exit(1) + + sys.exit(0) + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/scripts/monitoring/windows-desktop/README.md b/monitoring/scripts/windows-desktop/README.md similarity index 100% rename from scripts/monitoring/windows-desktop/README.md rename to monitoring/scripts/windows-desktop/README.md diff --git a/scripts/monitoring/windows-desktop/windows-reboot-monitor.ps1 b/monitoring/scripts/windows-desktop/windows-reboot-monitor.ps1 similarity index 100% rename from scripts/monitoring/windows-desktop/windows-reboot-monitor.ps1 rename to monitoring/scripts/windows-desktop/windows-reboot-monitor.ps1 diff --git a/scripts/monitoring/windows-desktop/windows-reboot-task-shutdown.xml b/monitoring/scripts/windows-desktop/windows-reboot-task-shutdown.xml similarity index 100% rename from scripts/monitoring/windows-desktop/windows-reboot-task-shutdown.xml rename to monitoring/scripts/windows-desktop/windows-reboot-task-shutdown.xml diff --git a/scripts/monitoring/windows-desktop/windows-reboot-task-startup.xml b/monitoring/scripts/windows-desktop/windows-reboot-task-startup.xml similarity index 100% rename from scripts/monitoring/windows-desktop/windows-reboot-task-startup.xml rename to monitoring/scripts/windows-desktop/windows-reboot-task-startup.xml diff --git a/scripts/monitoring/windows-desktop/windows-setup-instructions.md b/monitoring/scripts/windows-desktop/windows-setup-instructions.md similarity index 100% rename from scripts/monitoring/windows-desktop/windows-setup-instructions.md rename to monitoring/scripts/windows-desktop/windows-setup-instructions.md diff --git a/monitoring/troubleshooting.md b/monitoring/troubleshooting.md new file mode 100644 index 0000000..c29ac29 --- /dev/null +++ b/monitoring/troubleshooting.md @@ -0,0 +1,414 @@ +# Monitoring System Troubleshooting Guide + +## Discord Notification Issues + +### Webhook Not Working +**Symptoms**: No Discord messages 
received, connection errors
+**Diagnosis**:
+```bash
+# Test webhook manually
+curl -X POST "$DISCORD_WEBHOOK_URL" \
+  -H "Content-Type: application/json" \
+  -d '{"content": "Test message"}'
+
+# Check webhook URL format
+echo $DISCORD_WEBHOOK_URL | grep -E "https://discord.com/api/webhooks/[0-9]+/.+"
+```
+
+**Solutions**:
+```bash
+# Verify webhook URL is correct
+# Format: https://discord.com/api/webhooks/ID/TOKEN
+
+# Test with minimal payload
+curl -X POST "$DISCORD_WEBHOOK_URL" \
+  -H "Content-Type: application/json" \
+  -d '{"content": "βœ… Webhook working"}'
+
+# Check for JSON formatting issues
+echo '{"content": "test"}' | jq .  # Validate JSON
+```
+
+### Message Formatting Problems
+**Symptoms**: Malformed messages, broken markdown, missing user pings
+**Common Issues**:
+```bash
+# ❌ Broken JSON escaping
+{"content": "Error: "quotes" break JSON"}
+
+# βœ… Proper JSON escaping
+{"content": "Error: \"quotes\" properly escaped"}
+
+# ❌ User ping inside code block (doesn't work)
+{"content": "```\nIssue occurred <@user_id>\n```"}
+
+# βœ… User ping outside code block
+{"content": "```\nIssue occurred\n```\nManual intervention needed <@user_id>"}
+```
+
+## Tdarr Monitoring Issues
+
+### Script Not Running
+**Symptoms**: No monitoring alerts, script execution failures
+**Diagnosis**:
+```bash
+# Check cron job status
+crontab -l | grep tdarr-timeout-monitor
+systemctl status cron
+
+# Run script manually for debugging
+bash -x /path/to/tdarr-timeout-monitor.sh
+
+# Check script permissions
+ls -la /path/to/tdarr-timeout-monitor.sh
+```
+
+**Solutions**:
+```bash
+# Fix script permissions
+chmod +x /path/to/tdarr-timeout-monitor.sh
+
+# Reinstall cron job
+crontab -e
+# Add: */20 * * * * /full/path/to/tdarr-timeout-monitor.sh
+
+# Check script environment
+# Ensure PATH and variables are set correctly in script
+```
+
+### API Connection Failures
+**Symptoms**: Cannot connect to Tdarr server, timeout errors
+**Diagnosis**:
+```bash
+# Test Tdarr API manually
+curl -f "http://tdarr-server:8266/api/v2/status"
+
+# Check network connectivity
+ping tdarr-server
+nc -zv tdarr-server 8266
+
+# Verify SSH access to server
+ssh tdarr "docker ps | grep tdarr"
+```
+
+**Solutions**:
+```bash
+# Update server connection in script
+# Verify server IP and port are correct
+
+# Test API endpoints
+curl "http://10.10.0.43:8265/api/v2/status"  # Web port
+curl "http://10.10.0.43:8266/api/v2/status"  # Server port
+
+# Check Tdarr server logs
+ssh tdarr "docker logs tdarr | tail -20"
+```
+
+## Windows Desktop Monitoring Issues
+
+### PowerShell Script Not Running
+**Symptoms**: No reboot notifications from Windows systems
+**Diagnosis**:
+```powershell
+# Check scheduled task status
+Get-ScheduledTask -TaskName "Reboot*" | Get-ScheduledTaskInfo
+
+# Test script execution manually
+PowerShell -ExecutionPolicy Bypass -File "C:\path\to\windows-reboot-monitor.ps1"
+
+# Check PowerShell execution policy
+Get-ExecutionPolicy
+```
+
+**Solutions**:
+```powershell
+# Set execution policy
+Set-ExecutionPolicy -ExecutionPolicy RemoteSigned -Scope CurrentUser
+
+# Recreate scheduled tasks
+schtasks /Create /XML "C:\path\to\task.xml" /TN "RebootMonitor"
+
+# Check task trigger configuration
+(Get-ScheduledTask -TaskName "RebootMonitor").Triggers
+```
+
+### Network Access from Windows
+**Symptoms**: PowerShell cannot reach Discord webhook
+**Diagnosis**:
+```powershell
+# Test network connectivity
+Test-NetConnection discord.com -Port 443
+
+# Test webhook manually
+Invoke-RestMethod -Uri 
$webhookUrl -Method Post -Body '{"content":"test"}' -ContentType "application/json" + +# Check Windows firewall +Get-NetFirewallRule | Where-Object {$_.DisplayName -like "*PowerShell*"} +``` + +**Solutions**: +```powershell +# Allow PowerShell through firewall +New-NetFirewallRule -DisplayName "PowerShell Outbound" -Direction Outbound -Program "C:\Windows\System32\WindowsPowerShell\v1.0\powershell.exe" -Action Allow + +# Test with simplified request +$body = @{content="Test from Windows"} | ConvertTo-Json +Invoke-RestMethod -Uri $webhookUrl -Method Post -Body $body -ContentType "application/json" +``` + +## Log Management Issues + +### Log Files Growing Too Large +**Symptoms**: Disk space filling up, slow log access +**Diagnosis**: +```bash +# Check log file sizes +du -sh /var/log/homelab-* +du -sh /tmp/*monitor*.log + +# Check available disk space +df -h /var/log +df -h /tmp +``` + +**Solutions**: +```bash +# Implement log rotation +cat > /etc/logrotate.d/homelab-monitoring << 'EOF' +/var/log/homelab-*.log { + daily + missingok + rotate 7 + compress + notifempty + create 644 root root +} +EOF + +# Manual log cleanup +find /tmp -name "*monitor*.log" -size +10M -delete +truncate -s 0 /tmp/large-log-file.log +``` + +### Log Rotation Not Working +**Symptoms**: Old logs not being cleaned up +**Diagnosis**: +```bash +# Check logrotate status +systemctl status logrotate +cat /var/lib/logrotate/status + +# Test logrotate configuration +logrotate -d /etc/logrotate.d/homelab-monitoring +``` + +**Solutions**: +```bash +# Force log rotation +logrotate -f /etc/logrotate.d/homelab-monitoring + +# Fix logrotate configuration +sudo nano /etc/logrotate.d/homelab-monitoring +# Verify syntax and permissions +``` + +## Cron Job Issues + +### Scheduled Tasks Not Running +**Symptoms**: Scripts not executing at scheduled times +**Diagnosis**: +```bash +# Check cron service +systemctl status cron +systemctl status crond # RHEL/CentOS + +# View cron logs +grep CRON /var/log/syslog +journalctl -u cron + +# List all cron jobs +crontab -l +sudo crontab -l # System crontab +``` + +**Solutions**: +```bash +# Restart cron service +sudo systemctl restart cron + +# Fix cron job syntax +# Ensure absolute paths are used +# Example: */20 * * * * /full/path/to/script.sh + +# Check script permissions and execution +ls -la /path/to/script.sh +/path/to/script.sh # Test manual execution +``` + +### Environment Variables in Cron +**Symptoms**: Scripts work manually but fail in cron +**Diagnosis**: +```bash +# Create test cron job to check environment +* * * * * env > /tmp/cron-env.txt + +# Compare with shell environment +env > /tmp/shell-env.txt +diff /tmp/shell-env.txt /tmp/cron-env.txt +``` + +**Solutions**: +```bash +# Set PATH in crontab +PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin + +# Or set PATH in script +#!/bin/bash +export PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin" + +# Source environment if needed +source /etc/environment +``` + +## Network Monitoring Issues + +### False Positives +**Symptoms**: Alerts for services that are actually working +**Diagnosis**: +```bash +# Test monitoring checks manually +curl -sSf --max-time 10 "https://service.homelab.local" +ping -c1 -W5 10.10.0.100 + +# Check for intermittent network issues +for i in {1..10}; do ping -c1 host || echo "Fail $i"; done +``` + +**Solutions**: +```bash +# Adjust timeout values +curl --max-time 30 "$service" # Increase timeout + +# Add retry logic +for retry in {1..3}; do + if curl -sSf "$service" >/dev/null 
2>&1; then + break + elif [ $retry -eq 3 ]; then + send_alert "Service $service failed after 3 retries" + fi + sleep 5 +done +``` + +### Missing Alerts +**Symptoms**: Real failures not triggering notifications +**Diagnosis**: +```bash +# Verify monitoring script logic +bash -x monitoring-script.sh + +# Check if services are actually down +systemctl status service-name +curl -v service-url +``` + +**Solutions**: +```bash +# Lower detection thresholds +# Increase monitoring frequency +# Add redundant monitoring methods + +# Test alert mechanism +echo "Test alert" | send_alert_function +``` + +## System Resource Issues + +### Monitoring Overhead +**Symptoms**: High CPU/memory usage from monitoring scripts +**Diagnosis**: +```bash +# Monitor the monitoring scripts +top -p $(pgrep -f monitor) +ps aux | grep monitor + +# Check monitoring frequency +crontab -l | grep monitor +``` + +**Solutions**: +```bash +# Reduce monitoring frequency +# Change from */1 to */5 minutes + +# Optimize scripts +# Remove unnecessary commands +# Use efficient tools (prefer curl over wget, etc.) + +# Add resource limits +timeout 30 monitoring-script.sh +``` + +## Emergency Recovery + +### Complete Monitoring Failure +**Recovery Steps**: +```bash +# Restart all monitoring services +sudo systemctl restart cron +sudo systemctl restart rsyslog + +# Reinstall monitoring scripts +cd /path/to/scripts +./install-monitoring.sh + +# Test all components +./test-monitoring.sh +``` + +### Discord Integration Lost +**Quick Recovery**: +```bash +# Test webhook +curl -X POST "$BACKUP_WEBHOOK_URL" -H "Content-Type: application/json" -d '{"content": "Monitoring restored"}' + +# Switch to backup webhook if needed +export DISCORD_WEBHOOK_URL="$BACKUP_WEBHOOK_URL" +``` + +## Prevention and Best Practices + +### Monitoring Health Checks +```bash +#!/bin/bash +# monitor-the-monitors.sh +MONITORING_SCRIPTS="/path/to/tdarr-monitor.sh /path/to/network-monitor.sh" + +for script in $MONITORING_SCRIPTS; do + if [ ! -x "$script" ]; then + echo "ALERT: $script not executable" | send_alert + fi + + # Check if script has run recently + if [ $(($(date +%s) - $(stat -c %Y "$script.last_run" 2>/dev/null || echo 0))) -gt 3600 ]; then + echo "ALERT: $script hasn't run in over an hour" | send_alert + fi +done +``` + +### Backup Alerting Channels +```bash +# Multiple notification methods +send_alert() { + local message="$1" + + # Primary: Discord + curl -X POST "$DISCORD_WEBHOOK" -d "{\"content\":\"$message\"}" || \ + # Backup: Email + echo "$message" | mail -s "Homelab Alert" admin@domain.com || \ + # Last resort: Local log + echo "$(date): $message" >> /var/log/critical-alerts.log +} +``` + +This troubleshooting guide covers the most common monitoring system issues and provides systematic recovery procedures. \ No newline at end of file diff --git a/networking/CONTEXT.md b/networking/CONTEXT.md new file mode 100644 index 0000000..c348574 --- /dev/null +++ b/networking/CONTEXT.md @@ -0,0 +1,309 @@ +# Networking Infrastructure - Technology Context + +## Overview +Home lab networking infrastructure with focus on reverse proxy configuration, SSL/TLS management, SSH key management, and network security. This context covers service discovery, load balancing, and performance optimization patterns. 
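+
+A recurring task the sections below assume is keeping an eye on certificate expiry for the
+proxied services. A minimal sketch - the hostnames are illustrative examples from this lab
+and the warning threshold is an assumption, not a requirement:
+
+```bash
+#!/bin/bash
+# cert-expiry-check.sh - warn when a service certificate is close to expiry
+HOSTS="homelab.local proxmox.homelab.local nas.homelab.local"  # example hosts
+WARN_DAYS=21
+
+for host in $HOSTS; do
+    # Pull the certificate's notAfter date from a live TLS handshake
+    expiry=$(echo | openssl s_client -connect "$host:443" -servername "$host" 2>/dev/null \
+        | openssl x509 -noout -enddate | cut -d= -f2)
+    if [ -z "$expiry" ]; then
+        echo "❌ $host: could not read certificate"
+        continue
+    fi
+    days_left=$(( ($(date -d "$expiry" +%s) - $(date +%s)) / 86400 ))
+    if [ "$days_left" -lt "$WARN_DAYS" ]; then
+        echo "⚠️ $host: certificate expires in $days_left days"
+    else
+        echo "βœ… $host: valid for $days_left days"
+    fi
+done
+```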
+ +## Architecture Patterns + +### Reverse Proxy and Load Balancing +**Pattern**: Centralized traffic management with SSL termination +```nginx +# Nginx reverse proxy pattern +upstream backend { + server 10.10.0.100:3000; + server 10.10.0.101:3000; + keepalive 32; +} + +server { + listen 443 ssl http2; + server_name myapp.homelab.local; + + ssl_certificate /etc/ssl/certs/homelab.crt; + ssl_certificate_key /etc/ssl/private/homelab.key; + + location / { + proxy_pass http://backend; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + } +} +``` + +### Network Segmentation Strategy +**Pattern**: VLAN-based isolation with controlled inter-VLAN routing +``` +Management VLAN: 10.10.0.x/24 # VM management, SSH access +Services VLAN: 10.10.1.x/24 # Application services +Storage VLAN: 10.10.2.x/24 # NAS, backup traffic +DMZ VLAN: 10.10.10.x/24 # External-facing services +``` + +## SSH Key Management + +### Centralized Key Distribution +**Pattern**: Automated SSH key deployment with emergency backup +```bash +# Primary access key +~/.ssh/homelab_rsa # Daily operations key + +# Emergency access key +~/.ssh/emergency_homelab_rsa # Backup recovery key + +# Automated deployment +for host in $(cat hosts.txt); do + ssh-copy-id -i ~/.ssh/homelab_rsa.pub user@$host + ssh-copy-id -i ~/.ssh/emergency_homelab_rsa.pub user@$host +done +``` + +### Key Lifecycle Management +**Pattern**: Regular rotation with zero-downtime deployment +1. **Generation**: Create new key pairs annually +2. **Distribution**: Deploy to all managed systems +3. **Verification**: Test connectivity with new keys +4. **Rotation**: Remove old keys after verification +5. **Backup**: Store keys in secure, recoverable location + +## Service Discovery and DNS + +### Local DNS Resolution +**Pattern**: Internal DNS for service discovery +```bind +# Home lab DNS zones +homelab.local. IN A 10.10.0.16 # DNS server +proxmox.homelab.local. IN A 10.10.0.10 # Hypervisor +nas.homelab.local. IN A 10.10.0.20 # Storage +tdarr.homelab.local. 
IN A 10.10.0.43   # Media server
+```
+
+### Container Service Discovery
+**Pattern**: Docker network-based service resolution
+```yaml
+# Docker Compose service discovery
+version: "3.8"
+services:
+  web:
+    networks:
+      - frontend
+      - backend
+  api:
+    networks:
+      - backend
+      - database
+  db:
+    networks:
+      - database
+
+networks:
+  frontend:
+    driver: bridge
+  backend:
+    driver: bridge
+  database:
+    driver: bridge
+    internal: true  # No external access
+```
+
+## Security Patterns
+
+### SSH Security Hardening
+**Configuration**: Secure SSH server setup
+```sshd_config
+# /etc/ssh/sshd_config.d/99-homelab-security.conf
+PasswordAuthentication no
+PubkeyAuthentication yes
+PermitRootLogin no
+AllowUsers cal
+Protocol 2
+ClientAliveInterval 300
+ClientAliveCountMax 2
+MaxAuthTries 3
+X11Forwarding no
+```
+
+### Network Access Control
+**Pattern**: Firewall-based service protection
+```bash
+# ufw firewall rules
+ufw default deny incoming
+ufw default allow outgoing
+ufw allow ssh
+ufw allow from 10.10.0.0/24 to any port 22
+ufw allow from 10.10.0.0/24 to any port 80
+ufw allow from 10.10.0.0/24 to any port 443
+```
+
+### SSL/TLS Certificate Management
+**Pattern**: Automated certificate lifecycle
+```bash
+# Let's Encrypt automation
+certbot certonly --nginx \
+  --email admin@homelab.local \
+  --agree-tos \
+  --domains homelab.local
+# Wildcard names (*.homelab.local) need the DNS-01 challenge via a
+# certbot DNS plugin; the nginx plugin only validates named hosts.
+
+# Certificate renewal automation
+0 2 * * * certbot renew --quiet && systemctl reload nginx
+```
+
+## Performance Optimization
+
+### Connection Management
+**Pattern**: Optimized connection handling
+```nginx
+# Nginx performance tuning
+worker_processes auto;
+worker_connections 1024;
+
+keepalive_timeout 65;
+keepalive_requests 1000;
+
+gzip on;
+gzip_vary on;
+gzip_types text/plain text/css application/json application/javascript;
+
+# Connection pooling
+upstream backend {
+    server 10.10.0.100:3000 max_fails=3 fail_timeout=30s;
+    keepalive 32;
+}
+```
+
+### Caching Strategies
+**Pattern**: Multi-level caching architecture
+```nginx
+# Static content caching
+location ~* \.(jpg|jpeg|png|gif|ico|css|js)$ {
+    expires 1y;
+    add_header Cache-Control "public, immutable";
+}
+
+# Proxy caching
+proxy_cache_path /var/cache/nginx levels=1:2 keys_zone=app_cache:10m;
+proxy_cache app_cache;
+proxy_cache_valid 200 302 10m;
+```
+
+## Network Storage Integration
+
+### CIFS/SMB Mount Resilience
+**Pattern**: Robust network filesystem mounting
+```fstab
+# Note: /etc/fstab does not support backslash line continuations - the entry must be a single line
+//nas.homelab.local/media /mnt/media cifs credentials=/etc/cifs/credentials,uid=1000,gid=1000,file_mode=0644,dir_mode=0755,iocharset=utf8,cache=strict,actimeo=30,_netdev,reconnect,soft,rsize=1048576,wsize=1048576 0 0
+```
+
+## Monitoring and Observability
+
+### Network Health Monitoring
+**Pattern**: Automated connectivity verification
+```bash
+#!/bin/bash
+# network-health-check.sh
+HOSTS="10.10.0.10 10.10.0.20 10.10.0.43"
+DNS_SERVERS="10.10.0.16 8.8.8.8"
+
+for host in $HOSTS; do
+    if ping -c1 -W5 $host >/dev/null 2>&1; then
+        echo "βœ… $host: Reachable"
+    else
+        echo "❌ $host: Unreachable"
+    fi
+done
+
+for dns in $DNS_SERVERS; do
+    if nslookup google.com $dns >/dev/null 2>&1; then
+        echo "βœ… DNS $dns: Working"
+    else
+        echo "❌ DNS $dns: Failed"
+    fi
+done
+```
+
+### Service Availability Monitoring
+**Pattern**: HTTP/HTTPS endpoint monitoring
+```bash
+# Service health check
+SERVICES="https://homelab.local http://proxmox.homelab.local:8006"
+
+for service in $SERVICES; do
+    if curl -sSf --max-time 10 "$service" >/dev/null 2>&1; then
+        echo "βœ… 
$service: Available" + else + echo "❌ $service: Unavailable" + fi +done +``` + +## Common Integration Patterns + +### Reverse Proxy with Docker +**Pattern**: Container service exposure +```nginx +# Dynamic service discovery with Docker +location /api/ { + proxy_pass http://api-container:3000/; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; +} + +location /web/ { + proxy_pass http://web-container:8080/; + proxy_set_header Upgrade $http_upgrade; + proxy_set_header Connection "upgrade"; # WebSocket support +} +``` + +### VPN Integration +**Pattern**: Secure remote access +```openvpn +# OpenVPN server configuration +port 1194 +proto udp +dev tun +ca ca.crt +cert server.crt +key server.key +dh dh.pem +server 10.8.0.0 255.255.255.0 +push "route 10.10.0.0 255.255.0.0" # Home lab networks +keepalive 10 120 +``` + +## Best Practices + +### Security Implementation +1. **SSH Keys Only**: Disable password authentication everywhere +2. **Network Segmentation**: Use VLANs for isolation +3. **Certificate Management**: Automate SSL/TLS certificate lifecycle +4. **Access Control**: Implement least-privilege networking +5. **Monitoring**: Continuous network and service monitoring + +### Performance Optimization +1. **Connection Pooling**: Reuse connections for efficiency +2. **Caching**: Implement multi-level caching strategies +3. **Compression**: Enable gzip for reduced bandwidth +4. **Keep-Alives**: Optimize connection persistence +5. **CDN Strategy**: Cache static content effectively + +### Operational Excellence +1. **Documentation**: Maintain network topology documentation +2. **Automation**: Script routine network operations +3. **Backup**: Regular configuration backups +4. **Testing**: Regular connectivity and performance testing +5. **Change Management**: Controlled network configuration changes + +This technology context provides comprehensive guidance for implementing robust networking infrastructure in home lab environments. 
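+
+### Scheduling the Health Checks
+The monitoring snippets above are written with cron in mind. A minimal sketch of wiring
+one up with overlap protection; the install path is an assumption, adjust to your layout:
+
+```bash
+#!/bin/bash
+# install-network-monitoring.sh - register the health check in the crontab
+SCRIPT="/usr/local/bin/network-health-check.sh"   # assumed install location
+LOCK="/var/lock/network-health-check.lock"
+
+chmod +x "$SCRIPT"
+
+# flock -n skips a run if the previous one is still hanging on a dead host
+CRON_LINE="*/15 * * * * /usr/bin/flock -n $LOCK $SCRIPT 2>&1 | logger -t network-health"
+
+# Drop any existing entry for the script, then append the new one
+( crontab -l 2>/dev/null | grep -vF "$SCRIPT"; echo "$CRON_LINE" ) | crontab -
+echo "βœ… Installed: $CRON_LINE"
+```
+
+Output lands in syslog via logger, so `grep network-health /var/log/syslog` shows both
+scheduled runs and failures without managing a separate log file.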
\ No newline at end of file diff --git a/reference/networking/cifs-mount-resilience-fixes.md b/networking/examples/cifs-mount-resilience-fixes.md similarity index 100% rename from reference/networking/cifs-mount-resilience-fixes.md rename to networking/examples/cifs-mount-resilience-fixes.md diff --git a/reference/networking/nas-mount-configuration.md b/networking/examples/nas-mount-configuration.md similarity index 100% rename from reference/networking/nas-mount-configuration.md rename to networking/examples/nas-mount-configuration.md diff --git a/reference/storage/network-filesystem-limitations.md b/networking/examples/network-filesystem-limitations.md similarity index 100% rename from reference/storage/network-filesystem-limitations.md rename to networking/examples/network-filesystem-limitations.md diff --git a/examples/networking/nginx-config.md b/networking/examples/nginx-config.md similarity index 100% rename from examples/networking/nginx-config.md rename to networking/examples/nginx-config.md diff --git a/networking/examples/security_improvements.md b/networking/examples/security_improvements.md new file mode 100644 index 0000000..9dbe5a0 --- /dev/null +++ b/networking/examples/security_improvements.md @@ -0,0 +1,99 @@ +# Home Lab Security Improvements + +## Current Security Issues + +### Critical Issues Found: +- **Password Authentication**: All servers using password-based SSH authentication +- **Credential Reuse**: Same password used across 7 home network servers +- **Insecure Storage**: Passwords stored in FileZilla (base64 encoded, not encrypted) +- **Root Access**: Cloud servers using root user accounts + +### Risk Assessment: +- **High**: Password-based authentication vulnerable to brute force attacks +- **High**: Shared passwords create single point of failure +- **Medium**: FileZilla credentials accessible to anyone with file system access +- **Medium**: Root access increases attack surface + +## Implemented Solutions + +### 1. SSH Key-Based Authentication +- **Generated separate key pairs** for home lab vs cloud servers +- **4096-bit RSA keys** for strong encryption +- **Descriptive key comments** for identification + +### 2. SSH Configuration Management +- **Centralized config** in `~/.ssh/config` +- **Host aliases** for easy server access +- **Port forwarding** pre-configured for common services +- **Security defaults** (ServerAliveInterval, StrictHostKeyChecking) + +### 3. Network Segmentation +- **Home network** (10.10.0.0/24) uses dedicated key +- **Cloud servers** use separate key pair +- **Service-specific aliases** for different server roles + +## Additional Security Recommendations + +### Immediate Actions: +1. **Deploy SSH keys** using the provided script +2. **Test key-based authentication** on all servers +3. **Disable password authentication** once keys work +4. 
**Remove FileZilla passwords** after migration + +### Server Hardening: +```bash +# On each server, edit /etc/ssh/sshd_config: +PasswordAuthentication no +PubkeyAuthentication yes +PermitRootLogin no # (create non-root user on cloud servers first) +Port 2222 # Change default SSH port +AllowUsers cal # Restrict SSH access +``` + +### Monitoring: +- **SSH login monitoring** with fail2ban +- **Key rotation schedule** (annually) +- **Access logging** review + +### Future Enhancements: +- **Certificate-based authentication** (SSH CA) +- **Multi-factor authentication** (TOTP) +- **VPN access** for home network +- **Bastion host** for cloud servers + +## Migration Plan + +### Phase 1: Key Deployment βœ… +- [x] Generate SSH key pairs +- [x] Create SSH configuration +- [x] Document server inventory + +### Phase 2: Authentication Migration +- [ ] Deploy public keys to all servers +- [ ] Test SSH connections with keys +- [ ] Verify all services accessible + +### Phase 3: Security Lockdown +- [ ] Disable password authentication +- [ ] Change default SSH ports +- [ ] Configure fail2ban +- [ ] Remove FileZilla credentials + +### Phase 4: Monitoring & Maintenance +- [ ] Set up access logging +- [ ] Schedule key rotation +- [ ] Document incident response + +## Connection Examples + +After setup, you'll connect using simple aliases: +```bash +# Instead of: ssh cal@10.10.0.42 +ssh database-apis + +# Instead of: ssh root@172.237.147.99 +ssh akamai + +# With automatic port forwarding: +ssh pihole # Forwards port 8080 β†’ localhost:80 +``` \ No newline at end of file diff --git a/networking/examples/server_inventory.yaml b/networking/examples/server_inventory.yaml new file mode 100644 index 0000000..8f0dd87 --- /dev/null +++ b/networking/examples/server_inventory.yaml @@ -0,0 +1,70 @@ +--- +# Home Lab Server Inventory +# Generated from FileZilla configuration + +home_network: + subnet: "10.10.0.0/24" + servers: + database_apis: + hostname: "10.10.0.42" + port: 22 + user: "cal" + services: ["database", "api"] + description: "Database and API services" + + discord_bots: + hostname: "10.10.0.33" + port: 22 + user: "cal" + services: ["discord", "bots"] + description: "Discord bot hosting" + + home_docker: + hostname: "10.10.0.124" + port: 22 + user: "cal" + services: ["docker", "containers"] + description: "Main Docker container host" + + pihole: + hostname: "10.10.0.16" + port: 22 + user: "cal" + services: ["dns", "adblock"] + description: "Pi-hole DNS and ad blocking" + + sba_pd_bots: + hostname: "10.10.0.88" + port: 22 + user: "cal" + services: ["bots", "automation"] + description: "SBa and PD bot services" + + tdarr: + hostname: "10.10.0.43" + port: 22 + user: "cal" + services: ["media", "transcoding"] + description: "Tdarr media transcoding" + + vpn_docker: + hostname: "10.10.0.121" + port: 22 + user: "cal" + services: ["vpn", "docker"] + description: "VPN and Docker services" + +remote_servers: + akamai_nano: + hostname: "172.237.147.99" + port: 22 + user: "root" + provider: "akamai" + description: "Akamai cloud nano instance" + + vultr_host: + hostname: "45.76.25.231" + port: 22 + user: "root" + provider: "vultr" + description: "Vultr cloud host" \ No newline at end of file diff --git a/examples/networking/ssh-homelab-setup.md b/networking/examples/ssh-homelab-setup.md similarity index 100% rename from examples/networking/ssh-homelab-setup.md rename to networking/examples/ssh-homelab-setup.md diff --git a/patterns/networking/ssh-key-management.md b/networking/examples/ssh-key-management.md similarity index 
100% rename from patterns/networking/ssh-key-management.md rename to networking/examples/ssh-key-management.md diff --git a/reference/networking/ssh-troubleshooting.md b/networking/examples/ssh-troubleshooting.md similarity index 100% rename from reference/networking/ssh-troubleshooting.md rename to networking/examples/ssh-troubleshooting.md diff --git a/reference/networking/troubleshooting.md b/networking/examples/troubleshooting.md similarity index 100% rename from reference/networking/troubleshooting.md rename to networking/examples/troubleshooting.md diff --git a/networking/scripts/ssh_key_maintenance.sh b/networking/scripts/ssh_key_maintenance.sh new file mode 100755 index 0000000..c7ccb61 --- /dev/null +++ b/networking/scripts/ssh_key_maintenance.sh @@ -0,0 +1,114 @@ +#!/bin/bash +# SSH Key Maintenance and Backup Script +# Run this periodically to maintain key security + +echo "πŸ”§ SSH Key Maintenance and Backup" + +# Check if NAS is mounted +if [ ! -d "/mnt/NV2" ]; then + echo "❌ ERROR: NAS not mounted at /mnt/NV2" + exit 1 +fi + +# Create timestamp +TIMESTAMP=$(date +%Y%m%d-%H%M%S) +BACKUP_ROOT="/mnt/NV2/ssh-keys" +BACKUP_DIR="$BACKUP_ROOT/maintenance-$TIMESTAMP" + +# Ensure backup directory structure +mkdir -p "$BACKUP_DIR" +chmod 700 "$BACKUP_DIR" + +echo "πŸ“ Creating maintenance backup in: $BACKUP_DIR" + +# Backup current keys and config +cp ~/.ssh/*_rsa* "$BACKUP_DIR/" 2>/dev/null || true +cp ~/.ssh/config "$BACKUP_DIR/" 2>/dev/null || true +cp ~/.ssh/known_hosts "$BACKUP_DIR/" 2>/dev/null || true + +# Check key ages and recommend rotation +echo "" +echo "πŸ” Key Age Analysis:" +for key in ~/.ssh/*_rsa; do + if [ -f "$key" ]; then + age_days=$(( ($(date +%s) - $(stat -c %Y "$key")) / 86400 )) + basename_key=$(basename "$key") + + if [ $age_days -gt 365 ]; then + echo "⚠️ $basename_key: $age_days days old - ROTATION RECOMMENDED" + elif [ $age_days -gt 180 ]; then + echo "⚑ $basename_key: $age_days days old - consider rotation" + else + echo "βœ… $basename_key: $age_days days old - OK" + fi + fi +done + +# Test key accessibility +echo "" +echo "πŸ” Testing Key Access:" +for key in ~/.ssh/*_rsa; do + if [ -f "$key" ]; then + basename_key=$(basename "$key") + if ssh-keygen -l -f "$key" >/dev/null 2>&1; then + echo "βœ… $basename_key: Valid and readable" + else + echo "❌ $basename_key: CORRUPTED or unreadable" + fi + fi +done + +# Clean up old backups (keep last 10) +echo "" +echo "🧹 Cleaning old backups (keeping last 10):" +cd "$BACKUP_ROOT" +ls -dt backup-* maintenance-* 2>/dev/null | tail -n +11 | while read old_backup; do + if [ -d "$old_backup" ]; then + echo "πŸ—‘οΈ Removing old backup: $old_backup" + rm -rf "$old_backup" + fi +done + +# Generate maintenance report +cat > "$BACKUP_DIR/MAINTENANCE_REPORT.md" << EOF +# SSH Key Maintenance Report +Generated: $(date) +Host: $(hostname) +User: $(whoami) + +## Backup Location +$BACKUP_DIR + +## Key Inventory +$(ls -la ~/.ssh/*_rsa* 2>/dev/null || echo "No SSH keys found") + +## SSH Config Status +$(if [ -f ~/.ssh/config ]; then echo "SSH config exists: ~/.ssh/config"; else echo "No SSH config found"; fi) + +## Server Connection Tests +Run these commands to verify connectivity: + +### Primary Keys: +ssh -o ConnectTimeout=5 database-apis 'echo "DB APIs: OK"' +ssh -o ConnectTimeout=5 pihole 'echo "PiHole: OK"' +ssh -o ConnectTimeout=5 akamai 'echo "Akamai: OK"' + +### Emergency Keys (if deployed): +ssh -i ~/.ssh/emergency_homelab_rsa -o ConnectTimeout=5 cal@10.10.0.16 'echo "Emergency Home: OK"' +ssh -i ~/.ssh/emergency_cloud_rsa 
-o ConnectTimeout=5 root@172.237.147.99 'echo "Emergency Cloud: OK"' + +## Next Maintenance Due +$(date -d '+3 months') + +## Key Rotation Schedule +- Home lab keys: Annual (generated $(date -r ~/.ssh/homelab_rsa 2>/dev/null || echo "Not found")) +- Cloud keys: Annual (generated $(date -r ~/.ssh/cloud_servers_rsa 2>/dev/null || echo "Not found")) +- Emergency keys: Bi-annual + +EOF + +echo "βœ… Maintenance backup completed" +echo "πŸ“„ Report saved: $BACKUP_DIR/MAINTENANCE_REPORT.md" +echo "" +echo "πŸ’‘ Schedule this script to run monthly via cron:" +echo " 0 2 1 * * /path/to/ssh_key_maintenance.sh" \ No newline at end of file diff --git a/networking/troubleshooting.md b/networking/troubleshooting.md new file mode 100644 index 0000000..35465ef --- /dev/null +++ b/networking/troubleshooting.md @@ -0,0 +1,496 @@ +# Networking Infrastructure Troubleshooting Guide + +## SSH Connection Issues + +### SSH Authentication Failures +**Symptoms**: Permission denied, connection refused, timeout +**Diagnosis**: +```bash +# Verbose SSH debugging +ssh -vvv user@host + +# Test different authentication methods +ssh -o PasswordAuthentication=no user@host +ssh -o PubkeyAuthentication=yes user@host + +# Check local key files +ls -la ~/.ssh/ +ssh-keygen -lf ~/.ssh/homelab_rsa.pub +``` + +**Solutions**: +```bash +# Re-deploy SSH keys +ssh-copy-id -i ~/.ssh/homelab_rsa.pub user@host +ssh-copy-id -i ~/.ssh/emergency_homelab_rsa.pub user@host + +# Fix key permissions +chmod 600 ~/.ssh/homelab_rsa +chmod 644 ~/.ssh/homelab_rsa.pub +chmod 700 ~/.ssh + +# Verify remote authorized_keys +ssh user@host 'chmod 700 ~/.ssh && chmod 600 ~/.ssh/authorized_keys' +``` + +### SSH Service Issues +**Symptoms**: Connection refused, service not running +**Diagnosis**: +```bash +# Check SSH service status +systemctl status sshd +ss -tlnp | grep :22 + +# Test port connectivity +nc -zv host 22 +nmap -p 22 host +``` + +**Solutions**: +```bash +# Restart SSH service +sudo systemctl restart sshd +sudo systemctl enable sshd + +# Check firewall +sudo ufw status +sudo ufw allow ssh + +# Verify SSH configuration +sudo sshd -T | grep -E "(passwordauth|pubkeyauth|permitroot)" +``` + +## Network Connectivity Problems + +### Basic Network Troubleshooting +**Symptoms**: Cannot reach hosts, timeouts, routing issues +**Diagnosis**: +```bash +# Basic connectivity tests +ping host +traceroute host +mtr host + +# Check local network configuration +ip addr show +ip route show +cat /etc/resolv.conf +``` + +**Solutions**: +```bash +# Restart networking +sudo systemctl restart networking +sudo netplan apply # Ubuntu + +# Reset network interface +sudo ip link set eth0 down +sudo ip link set eth0 up + +# Check default gateway +sudo ip route add default via 10.10.0.1 +``` + +### DNS Resolution Issues +**Symptoms**: Cannot resolve hostnames, slow resolution +**Diagnosis**: +```bash +# Test DNS resolution +nslookup google.com +dig google.com +host google.com + +# Check DNS servers +systemd-resolve --status +cat /etc/resolv.conf +``` + +**Solutions**: +```bash +# Temporary DNS fix +echo "nameserver 8.8.8.8" | sudo tee /etc/resolv.conf + +# Restart DNS services +sudo systemctl restart systemd-resolved + +# Flush DNS cache +sudo systemd-resolve --flush-caches +``` + +## Reverse Proxy and Load Balancer Issues + +### Nginx Configuration Problems +**Symptoms**: 502 Bad Gateway, 503 Service Unavailable, SSL errors +**Diagnosis**: +```bash +# Check Nginx status and logs +systemctl status nginx +sudo tail -f /var/log/nginx/error.log +sudo tail -f 
/var/log/nginx/access.log + +# Test Nginx configuration +sudo nginx -t +sudo nginx -T # Show full configuration +``` + +**Solutions**: +```bash +# Reload Nginx configuration +sudo nginx -s reload + +# Check upstream servers +curl -I http://backend-server:port +telnet backend-server port + +# Fix common configuration issues +sudo nano /etc/nginx/sites-available/default +# Check proxy_pass URLs, upstream definitions +``` + +### SSL/TLS Certificate Issues +**Symptoms**: Certificate warnings, expired certificates, connection errors +**Diagnosis**: +```bash +# Check certificate validity +openssl s_client -connect host:443 -servername host +openssl x509 -in /etc/ssl/certs/cert.pem -text -noout + +# Check certificate expiry +openssl x509 -in /etc/ssl/certs/cert.pem -noout -dates +``` + +**Solutions**: +```bash +# Renew Let's Encrypt certificates +sudo certbot renew --dry-run +sudo certbot renew --force-renewal + +# Generate self-signed certificate +sudo openssl req -x509 -nodes -days 365 -newkey rsa:2048 \ + -keyout /etc/ssl/private/selfsigned.key \ + -out /etc/ssl/certs/selfsigned.crt +``` + +## Network Storage Issues + +### CIFS/SMB Mount Problems +**Symptoms**: Mount failures, connection timeouts, permission errors +**Diagnosis**: +```bash +# Test SMB connectivity +smbclient -L //nas-server -U username +testparm # Test Samba configuration + +# Check mount status +mount | grep cifs +df -h | grep cifs +``` + +**Solutions**: +```bash +# Remount with verbose logging +sudo mount -t cifs //server/share /mnt/point -o username=user,password=pass,vers=3.0 + +# Fix mount options in /etc/fstab +//server/share /mnt/point cifs credentials=/etc/cifs/credentials,uid=1000,gid=1000,iocharset=utf8,file_mode=0644,dir_mode=0755,cache=strict,_netdev 0 0 + +# Test credentials +sudo cat /etc/cifs/credentials +# Should contain: username=, password=, domain= +``` + +### NFS Mount Issues +**Symptoms**: Stale file handles, mount hangs, permission denied +**Diagnosis**: +```bash +# Check NFS services +systemctl status nfs-client.target +showmount -e nfs-server + +# Test NFS connectivity +rpcinfo -p nfs-server +``` + +**Solutions**: +```bash +# Restart NFS services +sudo systemctl restart nfs-client.target + +# Remount NFS shares +sudo umount /mnt/nfs-share +sudo mount -t nfs server:/path /mnt/nfs-share + +# Fix stale file handles +sudo umount -f /mnt/nfs-share +sudo mount /mnt/nfs-share +``` + +## Firewall and Security Issues + +### Port Access Problems +**Symptoms**: Connection refused, filtered ports, blocked services +**Diagnosis**: +```bash +# Check firewall status +sudo ufw status verbose +sudo iptables -L -n -v + +# Test port accessibility +nc -zv host port +nmap -p port host +``` + +**Solutions**: +```bash +# Open required ports +sudo ufw allow ssh +sudo ufw allow 80/tcp +sudo ufw allow 443/tcp +sudo ufw allow from 10.10.0.0/24 + +# Reset firewall if needed +sudo ufw --force reset +sudo ufw enable +``` + +### Network Security Issues +**Symptoms**: Unauthorized access, suspicious traffic, security alerts +**Diagnosis**: +```bash +# Check active connections +ss -tuln +netstat -tuln + +# Review logs for security events +sudo tail -f /var/log/auth.log +sudo tail -f /var/log/syslog | grep -i security +``` + +**Solutions**: +```bash +# Block suspicious IPs +sudo ufw deny from suspicious-ip + +# Update SSH security +sudo nano /etc/ssh/sshd_config +# Set: PasswordAuthentication no, PermitRootLogin no +sudo systemctl restart sshd +``` + +## Service Discovery and DNS Issues + +### Local DNS Problems +**Symptoms**: Services 
unreachable by hostname, DNS timeouts +**Diagnosis**: +```bash +# Test local DNS resolution +nslookup service.homelab.local +dig @10.10.0.16 service.homelab.local + +# Check DNS server status +systemctl status bind9 # or named +``` + +**Solutions**: +```bash +# Add to /etc/hosts as temporary fix +echo "10.10.0.100 service.homelab.local" | sudo tee -a /etc/hosts + +# Restart DNS services +sudo systemctl restart bind9 +sudo systemctl restart systemd-resolved +``` + +### Container Networking Issues +**Symptoms**: Containers cannot communicate, service discovery fails +**Diagnosis**: +```bash +# Check Docker networks +docker network ls +docker network inspect bridge + +# Test container connectivity +docker exec container1 ping container2 +docker exec container1 nslookup container2 +``` + +**Solutions**: +```bash +# Create custom network +docker network create --driver bridge app-network +docker run --network app-network container + +# Fix DNS in containers +docker run --dns 8.8.8.8 container +``` + +## Performance Issues + +### Network Latency Problems +**Symptoms**: Slow response times, timeouts, poor performance +**Diagnosis**: +```bash +# Measure network latency +ping -c 100 host +mtr --report host + +# Check network interface stats +ip -s link show +cat /proc/net/dev +``` + +**Solutions**: +```bash +# Optimize network settings +echo 'net.core.rmem_max = 134217728' | sudo tee -a /etc/sysctl.conf +echo 'net.core.wmem_max = 134217728' | sudo tee -a /etc/sysctl.conf +sudo sysctl -p + +# Check for network congestion +iftop +nethogs +``` + +### Bandwidth Issues +**Symptoms**: Slow transfers, network congestion, dropped packets +**Diagnosis**: +```bash +# Test bandwidth +iperf3 -s # Server +iperf3 -c server-ip # Client + +# Check interface utilization +vnstat -i eth0 +``` + +**Solutions**: +```bash +# Implement QoS if needed +sudo tc qdisc add dev eth0 root fq_codel + +# Optimize buffer sizes +sudo ethtool -G eth0 rx 4096 tx 4096 +``` + +## Emergency Recovery Procedures + +### Network Emergency Recovery +**Complete network failure recovery**: +```bash +# Reset all network configuration +sudo systemctl stop networking +sudo ip addr flush eth0 +sudo ip route flush table main +sudo systemctl start networking + +# Manual network configuration +sudo ip addr add 10.10.0.100/24 dev eth0 +sudo ip route add default via 10.10.0.1 +echo "nameserver 8.8.8.8" | sudo tee /etc/resolv.conf +``` + +### SSH Emergency Access +**When locked out of systems**: +```bash +# Use emergency SSH key +ssh -i ~/.ssh/emergency_homelab_rsa user@host + +# Via console access (if available) +# Use hypervisor console or physical access + +# Reset SSH to allow password auth temporarily +sudo sed -i 's/PasswordAuthentication no/PasswordAuthentication yes/' /etc/ssh/sshd_config +sudo systemctl restart sshd +``` + +### Service Recovery +**Critical service restoration**: +```bash +# Restart all network services +sudo systemctl restart networking +sudo systemctl restart nginx +sudo systemctl restart sshd + +# Emergency firewall disable +sudo ufw disable # CAUTION: Only for troubleshooting + +# Service-specific recovery +sudo systemctl restart docker +sudo systemctl restart systemd-resolved +``` + +## Monitoring and Prevention + +### Network Health Monitoring +```bash +#!/bin/bash +# network-monitor.sh +CRITICAL_HOSTS="10.10.0.1 10.10.0.16 nas.homelab.local" +CRITICAL_SERVICES="https://homelab.local http://proxmox.homelab.local:8006" + +for host in $CRITICAL_HOSTS; do + if ! 
ping -c1 -W5 $host >/dev/null 2>&1; then + echo "ALERT: $host unreachable" | logger -t network-monitor + fi +done + +for service in $CRITICAL_SERVICES; do + if ! curl -sSf --max-time 10 "$service" >/dev/null 2>&1; then + echo "ALERT: $service unavailable" | logger -t network-monitor + fi +done +``` + +### Automated Recovery Scripts +```bash +#!/bin/bash +# network-recovery.sh +if ! ping -c1 8.8.8.8 >/dev/null 2>&1; then + echo "Network down, attempting recovery..." + sudo systemctl restart networking + sleep 10 + if ping -c1 8.8.8.8 >/dev/null 2>&1; then + echo "Network recovered" + else + echo "Manual intervention required" + fi +fi +``` + +## Quick Reference Commands + +### Network Diagnostics +```bash +# Connectivity tests +ping host +traceroute host +mtr host +nc -zv host port + +# Service checks +systemctl status networking +systemctl status nginx +systemctl status sshd + +# Network configuration +ip addr show +ip route show +ss -tuln +``` + +### Emergency Commands +```bash +# Network restart +sudo systemctl restart networking + +# SSH emergency access +ssh -i ~/.ssh/emergency_homelab_rsa user@host + +# Firewall quick disable (emergency only) +sudo ufw disable + +# DNS quick fix +echo "nameserver 8.8.8.8" | sudo tee /etc/resolv.conf +``` + +This troubleshooting guide provides comprehensive solutions for common networking issues in home lab environments. \ No newline at end of file diff --git a/patterns/docker/README.md b/patterns/docker/README.md deleted file mode 100644 index 4d4e474..0000000 --- a/patterns/docker/README.md +++ /dev/null @@ -1,26 +0,0 @@ -# Docker Patterns - -## Container Best Practices -- Use multi-stage builds for production images -- Minimize layer count and image size -- Run containers as non-root users -- Use specific version tags, avoid `latest` -- Implement health checks - -## Common Patterns -- **Multi-service applications**: Use docker-compose for local development -- **Production deployments**: Single-container per service with orchestration -- **Development environments**: Volume mounts for code changes -- **CI/CD integration**: Build, test, and push in pipeline stages - -## Security Considerations -- Scan images for vulnerabilities -- Use distroless or minimal base images -- Implement resource limits -- Network isolation between services - -## Related Documentation -- Examples: `/examples/docker/multi-stage-builds.md` -- Examples: `/examples/docker/compose-patterns.md` -- Reference: `/reference/docker/troubleshooting.md` -- Reference: `/reference/docker/security-checklist.md` \ No newline at end of file diff --git a/patterns/networking/README.md b/patterns/networking/README.md deleted file mode 100644 index 15af9a9..0000000 --- a/patterns/networking/README.md +++ /dev/null @@ -1,32 +0,0 @@ -# Networking Patterns - -## Infrastructure Setup -- **Reverse proxy** configuration (Nginx/Traefik) -- **Load balancing** strategies and health checks -- **SSL/TLS termination** and certificate management -- **Network segmentation** and VLANs - -## Service Discovery -- **DNS-based** service resolution -- **Container networking** with Docker networks -- **Service mesh** patterns for microservices -- **API gateway** implementation - -## Security Patterns -- **Firewall rules** and port management -- **VPN setup** for remote access -- **Zero-trust networking** principles -- **Network monitoring** and intrusion detection - -## Performance Optimization -- **CDN integration** for static assets -- **Connection pooling** and keep-alives -- **Bandwidth management** and QoS -- 
**Caching strategies** at network level - -## Related Documentation -- Examples: `/examples/networking/nginx-config.md` -- Examples: `/examples/networking/vpn-setup.md` -- Examples: `/examples/networking/load-balancing.md` -- Reference: `/reference/networking/troubleshooting.md` -- Reference: `/reference/networking/security.md` \ No newline at end of file diff --git a/patterns/vm-management/README.md b/patterns/vm-management/README.md deleted file mode 100644 index 884cd02..0000000 --- a/patterns/vm-management/README.md +++ /dev/null @@ -1,66 +0,0 @@ -# Virtual Machine Management Patterns - -## Automated Provisioning -- **Cloud-init deployment** - Fully automated VM provisioning from first boot -- **Post-install scripts** - Standardized configuration for existing VMs -- **SSH key management** - Automated key deployment with emergency backup -- **Security hardening** - Password auth disabled, firewall configured - -## VM Provisioning Strategies - -### Template-Based Deployment -- **Ubuntu Server templates** optimized for home lab environments -- **Resource allocation** sizing and planning -- **Network configuration** and VLAN assignment (10.10.0.x networks) -- **Storage provisioning** and disk management - -### Infrastructure as Code -- **Cloud-init templates** for repeatable VM creation -- **Bash provisioning scripts** for existing infrastructure -- **SSH key integration** with existing homelab key management -- **Docker environment** setup with user permissions - -## Lifecycle Management -- **Automated provisioning** with infrastructure as code -- **Configuration management** with standardized scripts -- **Snapshot management** and rollback strategies -- **Scaling policies** for resource optimization - -## Monitoring & Maintenance -- **Resource monitoring** (CPU, memory, disk, network) -- **Health checks** and alerting systems -- **Patch management** and update strategies -- **Performance tuning** and optimization - -## Backup & Recovery -- **VM-level backups** vs **application-level backups** -- **Disaster recovery** planning and testing -- **High availability** configurations -- **Migration strategies** between hosts - -## Implementation Workflows - -### New VM Creation (Recommended) -1. **Create VM in Proxmox** with cloud-init support -2. **Apply cloud-init template** (`scripts/vm-management/cloud-init-user-data.yaml`) -3. **Start VM** - fully automated provisioning -4. **Verify setup** via SSH key authentication - -### Existing VM Configuration -1. **Run post-install script** (`scripts/vm-management/vm-post-install.sh `) -2. **Automated provisioning** handles updates, SSH keys, Docker -3. **Security hardening** applied automatically -4. 
**Test connectivity** and verify Docker installation - -## Security Architecture -- **SSH key-based authentication** only (passwords disabled) -- **Emergency key backup** for failover access -- **User privilege separation** (sudo required, docker group) -- **Automatic security updates** configured -- **Network isolation** ready (10.10.0.x internal network) - -## Related Documentation -- **Implementation**: `scripts/vm-management/README.md` - Complete setup guides -- **SSH Keys**: `patterns/networking/ssh-key-management.md` - Key lifecycle management -- **Examples**: `examples/networking/ssh-homelab-setup.md` - SSH integration patterns -- **Reference**: `reference/vm-management/troubleshooting.md` - Common issues and solutions \ No newline at end of file diff --git a/scripts/monitoring/tdarr_monitor.py b/scripts/monitoring/tdarr_monitor.py deleted file mode 100755 index 2390976..0000000 --- a/scripts/monitoring/tdarr_monitor.py +++ /dev/null @@ -1,498 +0,0 @@ -#!/usr/bin/env python3 -""" -Tdarr API Monitoring Script - -Monitors Tdarr server via its web API endpoints: -- Server status and health -- Queue status and statistics -- Node status and performance -- Library scan progress -- Worker activity - -Usage: - python3 tdarr_monitor.py --server http://10.10.0.43:8265 --check all - python3 tdarr_monitor.py --server http://10.10.0.43:8265 --check queue - python3 tdarr_monitor.py --server http://10.10.0.43:8265 --check nodes -""" - -import argparse -import json -import logging -import sys -from dataclasses import dataclass, asdict -from datetime import datetime -from typing import Dict, List, Optional, Any -import requests -from urllib.parse import urljoin - - -@dataclass -class ServerStatus: - timestamp: str - server_url: str - status: str - error: Optional[str] = None - version: Optional[str] = None - server_id: Optional[str] = None - uptime: Optional[str] = None - system_info: Optional[Dict[str, Any]] = None - - -@dataclass -class QueueStats: - total_files: int - queued: int - processing: int - completed: int - queue_items: List[Dict[str, Any]] - - -@dataclass -class QueueStatus: - timestamp: str - queue_stats: Optional[QueueStats] = None - error: Optional[str] = None - - -@dataclass -class NodeInfo: - id: Optional[str] - nodeName: Optional[str] - status: str - lastSeen: Optional[int] - version: Optional[str] - platform: Optional[str] - workers: Dict[str, int] - processing: List[Dict[str, Any]] - - -@dataclass -class NodeSummary: - total_nodes: int - online_nodes: int - offline_nodes: int - online_details: List[NodeInfo] - offline_details: List[NodeInfo] - - -@dataclass -class NodeStatus: - timestamp: str - nodes: List[Dict[str, Any]] - node_summary: Optional[NodeSummary] = None - error: Optional[str] = None - - -@dataclass -class LibraryInfo: - name: Optional[str] - path: Optional[str] - file_count: int - scan_progress: int - last_scan: Optional[str] - is_scanning: bool - - -@dataclass -class ScanStatus: - total_libraries: int - total_files: int - scanning_libraries: int - - -@dataclass -class LibraryStatus: - timestamp: str - libraries: List[LibraryInfo] - scan_status: Optional[ScanStatus] = None - error: Optional[str] = None - - -@dataclass -class Statistics: - total_transcodes: int - space_saved: int - total_files_processed: int - failed_transcodes: int - processing_speed: int - eta: Optional[str] - - -@dataclass -class StatisticsStatus: - timestamp: str - statistics: Optional[Statistics] = None - error: Optional[str] = None - - -@dataclass -class HealthCheck: - status: str - healthy: 
bool - online_count: Optional[int] = None - total_count: Optional[int] = None - accessible: Optional[bool] = None - total_items: Optional[int] = None - - -@dataclass -class HealthStatus: - timestamp: str - overall_status: str - checks: Dict[str, HealthCheck] - - -class TdarrMonitor: - def __init__(self, server_url: str, timeout: int = 30): - """Initialize Tdarr monitor with server URL.""" - self.server_url = server_url.rstrip('/') - self.timeout = timeout - self.session = requests.Session() - - # Configure logging - logging.basicConfig( - level=logging.INFO, - format='%(asctime)s - %(levelname)s - %(message)s' - ) - self.logger = logging.getLogger(__name__) - - def _make_request(self, endpoint: str) -> Optional[Dict[str, Any]]: - """Make HTTP request to Tdarr API endpoint.""" - url = urljoin(self.server_url, endpoint) - - try: - response = self.session.get(url, timeout=self.timeout) - response.raise_for_status() - return response.json() - - except requests.exceptions.RequestException as e: - self.logger.error(f"Request failed for {url}: {e}") - return None - except json.JSONDecodeError as e: - self.logger.error(f"JSON decode failed for {url}: {e}") - return None - - def get_server_status(self) -> ServerStatus: - """Get overall server status and configuration.""" - timestamp = datetime.now().isoformat() - - # Try to get server info from API - data = self._make_request('/api/v2/get-server-info') - if data: - return ServerStatus( - timestamp=timestamp, - server_url=self.server_url, - status='online', - version=data.get('version'), - server_id=data.get('serverId'), - uptime=data.get('uptime'), - system_info=data.get('systemInfo', {}) - ) - else: - return ServerStatus( - timestamp=timestamp, - server_url=self.server_url, - status='offline', - error='Unable to connect to Tdarr server' - ) - - def get_queue_status(self) -> QueueStatus: - """Get transcoding queue status and statistics.""" - timestamp = datetime.now().isoformat() - - # Get queue information - data = self._make_request('/api/v2/get-queue') - if data: - queue_data = data.get('queue', []) - - # Calculate queue statistics - total_files = len(queue_data) - queued_files = len([f for f in queue_data if f.get('status') == 'Queued']) - processing_files = len([f for f in queue_data if f.get('status') == 'Processing']) - completed_files = len([f for f in queue_data if f.get('status') == 'Completed']) - - queue_stats = QueueStats( - total_files=total_files, - queued=queued_files, - processing=processing_files, - completed=completed_files, - queue_items=queue_data[:10] # First 10 items for details - ) - - return QueueStatus( - timestamp=timestamp, - queue_stats=queue_stats - ) - else: - return QueueStatus( - timestamp=timestamp, - error='Unable to fetch queue data' - ) - - def get_node_status(self) -> NodeStatus: - """Get status of all connected nodes.""" - timestamp = datetime.now().isoformat() - - # Get nodes information - data = self._make_request('/api/v2/get-nodes') - if data: - nodes = data.get('nodes', []) - - # Process node information - online_nodes = [] - offline_nodes = [] - - for node in nodes: - node_info = NodeInfo( - id=node.get('_id'), - nodeName=node.get('nodeName'), - status='online' if node.get('lastSeen', 0) > 0 else 'offline', - lastSeen=node.get('lastSeen'), - version=node.get('version'), - platform=node.get('platform'), - workers={ - 'cpu': node.get('workers', {}).get('CPU', 0), - 'gpu': node.get('workers', {}).get('GPU', 0) - }, - processing=node.get('currentJobs', []) - ) - - if node_info.status == 'online': - 
online_nodes.append(node_info) - else: - offline_nodes.append(node_info) - - node_summary = NodeSummary( - total_nodes=len(nodes), - online_nodes=len(online_nodes), - offline_nodes=len(offline_nodes), - online_details=online_nodes, - offline_details=offline_nodes - ) - - return NodeStatus( - timestamp=timestamp, - nodes=nodes, - node_summary=node_summary - ) - else: - return NodeStatus( - timestamp=timestamp, - nodes=[], - error='Unable to fetch node data' - ) - - def get_library_status(self) -> LibraryStatus: - """Get library scan status and file statistics.""" - timestamp = datetime.now().isoformat() - - # Get library information - data = self._make_request('/api/v2/get-libraries') - if data: - libraries = data.get('libraries', []) - - library_stats = [] - total_files = 0 - - for lib in libraries: - lib_info = LibraryInfo( - name=lib.get('name'), - path=lib.get('path'), - file_count=lib.get('totalFiles', 0), - scan_progress=lib.get('scanProgress', 0), - last_scan=lib.get('lastScan'), - is_scanning=lib.get('isScanning', False) - ) - library_stats.append(lib_info) - total_files += lib_info.file_count - - scan_status = ScanStatus( - total_libraries=len(libraries), - total_files=total_files, - scanning_libraries=len([l for l in library_stats if l.is_scanning]) - ) - - return LibraryStatus( - timestamp=timestamp, - libraries=library_stats, - scan_status=scan_status - ) - else: - return LibraryStatus( - timestamp=timestamp, - libraries=[], - error='Unable to fetch library data' - ) - - def get_statistics(self) -> StatisticsStatus: - """Get overall Tdarr statistics and health metrics.""" - timestamp = datetime.now().isoformat() - - # Get statistics - data = self._make_request('/api/v2/get-stats') - if data: - stats = data.get('stats', {}) - statistics = Statistics( - total_transcodes=stats.get('totalTranscodes', 0), - space_saved=stats.get('spaceSaved', 0), - total_files_processed=stats.get('totalFilesProcessed', 0), - failed_transcodes=stats.get('failedTranscodes', 0), - processing_speed=stats.get('processingSpeed', 0), - eta=stats.get('eta') - ) - - return StatisticsStatus( - timestamp=timestamp, - statistics=statistics - ) - else: - return StatisticsStatus( - timestamp=timestamp, - error='Unable to fetch statistics' - ) - - def health_check(self) -> HealthStatus: - """Perform comprehensive health check.""" - timestamp = datetime.now().isoformat() - - # Server connectivity - server_status = self.get_server_status() - server_check = HealthCheck( - status=server_status.status, - healthy=server_status.status == 'online' - ) - - # Node connectivity - node_status = self.get_node_status() - nodes_healthy = ( - node_status.node_summary.online_nodes > 0 if node_status.node_summary else False - ) and not node_status.error - - nodes_check = HealthCheck( - status='online' if nodes_healthy else 'offline', - healthy=nodes_healthy, - online_count=node_status.node_summary.online_nodes if node_status.node_summary else 0, - total_count=node_status.node_summary.total_nodes if node_status.node_summary else 0 - ) - - # Queue status - queue_status = self.get_queue_status() - queue_healthy = not queue_status.error - queue_check = HealthCheck( - status='accessible' if queue_healthy else 'error', - healthy=queue_healthy, - accessible=queue_healthy, - total_items=queue_status.queue_stats.total_files if queue_status.queue_stats else 0 - ) - - checks = { - 'server': server_check, - 'nodes': nodes_check, - 'queue': queue_check - } - - # Determine overall health - all_checks_healthy = all(check.healthy for check in 
checks.values()) - overall_status = 'healthy' if all_checks_healthy else 'unhealthy' - - return HealthStatus( - timestamp=timestamp, - overall_status=overall_status, - checks=checks - ) - - -def main(): - parser = argparse.ArgumentParser(description='Monitor Tdarr server via API') - parser.add_argument('--server', required=True, help='Tdarr server URL (e.g., http://10.10.0.43:8265)') - parser.add_argument('--check', choices=['all', 'status', 'queue', 'nodes', 'libraries', 'stats', 'health'], - default='health', help='Type of check to perform') - parser.add_argument('--timeout', type=int, default=30, help='Request timeout in seconds') - parser.add_argument('--output', choices=['json', 'pretty'], default='pretty', help='Output format') - parser.add_argument('--verbose', action='store_true', help='Enable verbose logging') - - args = parser.parse_args() - - if args.verbose: - logging.getLogger().setLevel(logging.DEBUG) - - # Initialize monitor - monitor = TdarrMonitor(args.server, args.timeout) - - # Perform requested check - result = None - if args.check == 'all': - result = { - 'server_status': monitor.get_server_status(), - 'queue_status': monitor.get_queue_status(), - 'node_status': monitor.get_node_status(), - 'library_status': monitor.get_library_status(), - 'statistics': monitor.get_statistics() - } - elif args.check == 'status': - result = monitor.get_server_status() - elif args.check == 'queue': - result = monitor.get_queue_status() - elif args.check == 'nodes': - result = monitor.get_node_status() - elif args.check == 'libraries': - result = monitor.get_library_status() - elif args.check == 'stats': - result = monitor.get_statistics() - elif args.check == 'health': - result = monitor.health_check() - - # Output results - if args.output == 'json': - # Convert dataclasses to dictionaries for JSON serialization - if args.check == 'all': - json_result = {} - for key, value in result.items(): - json_result[key] = asdict(value) - print(json.dumps(json_result, indent=2)) - else: - print(json.dumps(asdict(result), indent=2)) - else: - # Pretty print format - print(f"=== Tdarr Monitor Results - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} ===") - - if args.check == 'health' or (hasattr(result, 'overall_status') and result.overall_status): - health = result if hasattr(result, 'overall_status') else None - if health: - status = health.overall_status - print(f"Overall Status: {status.upper()}") - - if health.checks: - print("\nHealth Checks:") - for check_name, check_data in health.checks.items(): - status_icon = "βœ“" if check_data.healthy else "βœ—" - print(f" {status_icon} {check_name.title()}: {asdict(check_data)}") - - if args.check == 'all': - for section, data in result.items(): - print(f"\n=== {section.replace('_', ' ').title()} ===") - print(json.dumps(asdict(data), indent=2)) - elif args.check != 'health': - print(json.dumps(asdict(result), indent=2)) - - # Exit with appropriate code - if result: - # Check for unhealthy status in health check - if isinstance(result, HealthStatus) and result.overall_status == 'unhealthy': - sys.exit(1) - # Check for errors in individual status objects (all status classes except HealthStatus have error attribute) - elif (isinstance(result, (ServerStatus, QueueStatus, NodeStatus, LibraryStatus, StatisticsStatus)) - and result.error): - sys.exit(1) - # Check for errors in 'all' results - elif isinstance(result, dict): - for status_obj in result.values(): - if (isinstance(status_obj, (ServerStatus, QueueStatus, NodeStatus, LibraryStatus, 
StatisticsStatus)) - and status_obj.error): - sys.exit(1) - - sys.exit(0) - - -if __name__ == '__main__': - main() \ No newline at end of file diff --git a/scripts/tdarr-manager b/scripts/tdarr-manager deleted file mode 100755 index 6495d7b..0000000 --- a/scripts/tdarr-manager +++ /dev/null @@ -1,6 +0,0 @@ -#!/bin/bash -# Tdarr Manager - Quick access to Tdarr scheduler controls -# This is a convenience script that forwards to the main manager - -SCRIPT_DIR="$(dirname "$(readlink -f "$0")")" -exec "${SCRIPT_DIR}/tdarr/tdarr-schedule-manager.sh" "$@" \ No newline at end of file diff --git a/tdarr/CONTEXT.md b/tdarr/CONTEXT.md new file mode 100644 index 0000000..0f6b399 --- /dev/null +++ b/tdarr/CONTEXT.md @@ -0,0 +1,152 @@ +# Tdarr Transcoding System - Technology Context + +## Overview +Tdarr is a distributed transcoding system that converts media files to optimized formats. This implementation uses an intelligent gaming-aware scheduler with unmapped node architecture for optimal performance and system stability. + +## Architecture Patterns + +### Distributed Unmapped Node Architecture (Recommended) +**Pattern**: Server-Node separation with local high-speed cache +- **Server**: Tdarr Server manages queue, web interface, and coordination +- **Node**: Unmapped nodes with local NVMe cache for processing +- **Benefits**: 3-5x performance improvement, network I/O reduction, linear scaling + +**When to Use**: +- Multiple transcoding nodes across network +- High-performance requirements (10GB+ files) +- Network bandwidth limitations +- Gaming systems requiring GPU priority management + +### Configuration Principles +1. **Cache Optimization**: Use local NVMe storage for work directories +2. **Gaming Detection**: Automatic pause during GPU-intensive activities +3. **Resource Isolation**: Container limits prevent kernel-level crashes +4. **Monitoring Integration**: Automated cleanup and Discord notifications + +## Core Components + +### Gaming-Aware Scheduler +**Purpose**: Automatically manages Tdarr node to avoid conflicts with gaming +**Location**: `scripts/tdarr-schedule-manager.sh` + +**Key Features**: +- Detects gaming processes (Steam, Lutris, Wine, etc.) +- GPU usage monitoring (>15% threshold) +- Configurable time windows +- Automated temporary directory cleanup + +**Schedule Format**: `"HOUR_START-HOUR_END:DAYS"` +- `"22-07:daily"` - Overnight transcoding +- `"09-17:1-5"` - Business hours weekdays only +- `"14-16:6,7"` - Weekend afternoon window + +### Monitoring System +**Purpose**: Prevents staging section timeouts and system instability +**Location**: `scripts/monitoring/tdarr-timeout-monitor.sh` + +**Capabilities**: +- Staging timeout detection (300-second hardcoded limit) +- Automatic work directory cleanup +- Discord notifications with user pings +- Log rotation and retention management + +### Container Architecture +**Server Configuration**: +```yaml +# Hybrid storage with resource limits +services: + tdarr: + image: ghcr.io/haveagitgat/tdarr:latest + ports: ["8265:8266"] + volumes: + - "./tdarr-data:/app/configs" + - "/mnt/media:/media" +``` + +**Node Configuration**: +```bash +# Unmapped node with local cache +podman run -d \ + --name tdarr-node-gpu \ + -e nodeType=unmapped \ + -v "/mnt/NV2/tdarr-cache:/cache" \ + --device nvidia.com/gpu=all \ + ghcr.io/haveagitgat/tdarr_node:latest +``` + +## Implementation Patterns + +### Performance Optimization +1. **Local Cache Strategy**: Download β†’ Process β†’ Upload (vs. streaming) +2. 
**Resource Limits**: Prevent memory exhaustion and kernel crashes +3. **Network Resilience**: CIFS mount options for stability +4. **Automated Cleanup**: Prevent accumulation of stuck directories + +### Error Prevention +1. **Plugin Safety**: Null-safe forEach operations `(streams || []).forEach()` +2. **Clean Installation**: Avoid custom plugin mounts causing version conflicts +3. **Container Isolation**: Resource limits prevent system-level crashes +4. **Network Stability**: Unmapped architecture reduces CIFS dependency + +### Gaming Integration +1. **Process Detection**: Monitor for gaming applications and utilities +2. **GPU Threshold**: Stop transcoding when GPU usage >15% +3. **Time Windows**: Respect user-defined allowed transcoding hours +4. **Manual Override**: Direct start/stop commands bypass scheduler + +## Common Workflows + +### Initial Setup +1. Start server with "Allow unmapped Nodes" enabled +2. Configure node as unmapped with local cache +3. Install gaming-aware scheduler via cron +4. Set up monitoring system for automated cleanup + +### Troubleshooting Patterns +1. **forEach Errors**: Clean plugin installation, avoid custom mounts +2. **Staging Timeouts**: Monitor system handles automatic cleanup +3. **System Crashes**: Convert to unmapped node architecture +4. **Network Issues**: Implement CIFS resilience options + +### Performance Tuning +1. **Cache Size**: 100-500GB NVMe for concurrent jobs +2. **Bandwidth**: Unmapped nodes reduce streaming requirements +3. **Scaling**: Linear scaling with additional unmapped nodes +4. **GPU Priority**: Gaming detection ensures responsive system + +## Best Practices + +### Production Deployment +- Use unmapped node architecture for stability +- Implement comprehensive monitoring +- Configure gaming-aware scheduling for desktop systems +- Set appropriate container resource limits + +### Development Guidelines +- Test with internal Tdarr test files first +- Implement null-safety checks in custom plugins +- Use structured logging for troubleshooting +- Separate concerns: scheduling, monitoring, processing + +### Security Considerations +- Container isolation prevents system-level failures +- Resource limits protect against memory exhaustion +- Network mount resilience prevents kernel crashes +- Automated cleanup prevents disk space issues + +## Migration Patterns + +### From Mapped to Unmapped Nodes +1. Enable "Allow unmapped Nodes" in server options +2. Update node configuration (add nodeType=unmapped) +3. Change cache volume to local storage +4. Remove media volume mapping +5. Test workflow and monitor performance + +### Plugin System Cleanup +1. Remove all custom plugin mounts +2. Force server restart to regenerate plugin ZIP +3. Restart nodes to download fresh plugins +4. Verify forEach fixes in downloaded plugins + +This technology context provides the foundation for implementing, troubleshooting, and optimizing Tdarr transcoding systems in home lab environments. \ No newline at end of file diff --git a/tdarr/examples/tdarr-cifs-troubleshooting-2025-08-11.md b/tdarr/examples/tdarr-cifs-troubleshooting-2025-08-11.md new file mode 100644 index 0000000..ecb9146 --- /dev/null +++ b/tdarr/examples/tdarr-cifs-troubleshooting-2025-08-11.md @@ -0,0 +1,143 @@ +# Tdarr CIFS Troubleshooting Session - 2025-08-11 + +## Problem Statement +Tdarr unmapped node experiencing persistent download timeouts at 9:08 PM with large files (31GB+ remux), causing "Cancelling" messages and stuck downloads. 
Downloads would hang for 33+ minutes before timing out, despite container remaining running. + +## Initial Hypothesis: Mapped vs Unmapped Node Issue +**Status**: ❌ **DISPROVEN** +- Suspected unmapped node timeout configuration differences +- Windows PC running mapped Tdarr node works fine (slow but stable) +- Both mapped and unmapped Linux nodes exhibited identical timeout issues +- **Conclusion**: Architecture type was not the root cause + +## Key Insight: Windows vs Linux Performance Difference +**Observation**: Windows Tdarr node (mapped mode) works without timeouts, Linux nodes (both mapped/unmapped) fail +**Implication**: Platform-specific issue, likely network stack or CIFS implementation + +## Root Cause Discovery Process + +### Phase 1: Linux Client CIFS Analysis +**Method**: Direct CIFS mount testing on Tdarr node machine (nobara-pc) + +**Initial CIFS Mount Configuration** (problematic): +```bash +//10.10.0.35/media on /mnt/media type cifs (rw,relatime,vers=3.1.1,cache=strict,upcall_target=app,username=root,uid=1000,forceuid,gid=1000,forcegid,addr=10.10.0.35,file_mode=0755,dir_mode=0755,soft,nounix,serverino,mapposix,noperm,reparse=nfs,nativesocket,symlink=native,rsize=4194304,wsize=4194304,bsize=1048576,retrans=1,echo_interval=60,actimeo=30,closetimeo=1,_netdev,x-systemd.automount,x-systemd.device-timeout=10,x-systemd.mount-timeout=30) +``` + +**Critical Issues Identified**: +- `soft` - Mount fails on timeout instead of retrying indefinitely +- `retrans=1` - Only 1 retry attempt (NFS option, invalid for CIFS) +- `closetimeo=1` - Very short close timeout (1 second) +- `cache=strict` - No local caching, poor performance for large files +- `x-systemd.mount-timeout=30` - 30-second mount timeout + +**Optimization Applied**: +```bash +//10.10.0.35/media /mnt/media cifs credentials=/home/cal/.samba_credentials,uid=1000,gid=1000,vers=3.1.1,hard,rsize=16777216,wsize=16777216,cache=loose,actimeo=60,echo_interval=30,_netdev,x-systemd.automount,x-systemd.device-timeout=60,x-systemd.mount-timeout=120,noperm 0 0 +``` + +**Performance Testing Results**: +- **Local SSD**: `dd` 800MB in 0.217s (4.0 GB/s) - baseline +- **CIFS 1MB blocks**: 42.7 MB/s - fast, no issues +- **CIFS 4MB blocks**: 205 MB/s - fast, no issues +- **CIFS 8MB blocks**: 83.1 MB/s - **3-minute terminal freeze** + +**Critical Discovery**: Block size dependency causing I/O blocking with large transfers + +### Phase 2: Tdarr Server-Side Analysis +**Method**: Test Tdarr API download path directly + +**API Test Command**: +```bash +curl -X POST "http://10.10.0.43:8265/api/v2/file/download" \ + -H "Content-Type: application/json" \ + -d '{"filePath": "/media/Movies/Jumanji (1995)/Jumanji (1995) Remux-1080p Proper.mkv"}' \ + -o /tmp/tdarr-api-test.mkv +``` + +**Results**: +- **Performance**: 55.7-58.6 MB/s sustained +- **Progress**: Downloaded 15.3GB of 23GB (66%) +- **Failure**: **Download hung at 66% completion** +- **Timing**: Hung after ~5 minutes (consistent with previous timeout patterns) + +### Phase 3: Tdarr Server CIFS Configuration Analysis +**Method**: Examine server-side storage mount + +**Server CIFS Mount** (problematic): +```bash +//10.10.0.35/media /mnt/truenas-share cifs credentials=/root/.truenascreds,vers=3.1.1,rsize=4194304,wsize=4194304,cache=strict,actimeo=30,echo_interval=60,noperm 0 0 +``` + +**Server Issues Identified**: +- **Missing `hard`** - Defaults to `soft` mount behavior +- `cache=strict` - No local caching (same issue as client) +- **No retry/timeout extensions** - Uses unreliable kernel defaults +- 
**No systemd timeout protection** + +## Root Cause Confirmed +**Primary Issue**: Tdarr server's CIFS mount to TrueNAS using suboptimal configuration +**Impact**: Large file streaming via Tdarr API hangs when server's CIFS mount hits I/O blocking +**Evidence**: API download hung at exact same pattern as node timeouts (66% through large file) + +## Solution Strategy +**Fix Tdarr Server CIFS Mount Configuration**: +```bash +//10.10.0.35/media /mnt/truenas-share cifs credentials=/root/.truenascreds,vers=3.1.1,hard,rsize=4194304,wsize=4194304,cache=loose,actimeo=60,echo_interval=30,_netdev,x-systemd.device-timeout=60,x-systemd.mount-timeout=120,noperm 0 0 +``` + +**Key Optimizations**: +- `hard` - Retry indefinitely instead of timing out +- `cache=loose` - Enable local caching for large file performance +- `actimeo=60` - Longer attribute caching +- `echo_interval=30` - More frequent keep-alives +- Extended systemd timeouts for reliability + +## Implementation Steps +1. **Update server `/etc/fstab`** with optimized CIFS configuration +2. **Remount server storage**: + ```bash + ssh tdarr "sudo umount /mnt/truenas-share" + ssh tdarr "sudo systemctl daemon-reload" + ssh tdarr "sudo mount /mnt/truenas-share" + ``` +3. **Test large file API download** to verify fix +4. **Resume Tdarr transcoding** with confidence in large file handling + +## Technical Insights + +### CIFS vs SMB Protocol Differences +- **Windows nodes**: Use native SMB implementation (stable) +- **Linux nodes**: Use kernel CIFS module (prone to I/O blocking with poor configuration) +- **Block size sensitivity**: Large block transfers require careful timeout/retry configuration + +### Tdarr Architecture Impact +- **Unmapped nodes**: Download entire files via API before processing (high bandwidth, vulnerable to server CIFS issues) +- **Mapped nodes**: Stream files during processing (lower bandwidth, still vulnerable to server CIFS issues) +- **Root cause affects both architectures** since server-side storage access is the bottleneck + +### Performance Expectations Post-Fix +- **Consistent 50-100 MB/s** for large file downloads +- **No timeout failures** with properly configured hard mounts +- **Reliable processing** of 31GB+ remux files + +## Files Modified +- **Client**: `/etc/fstab` on nobara-pc (CIFS optimization applied) +- **Server**: `/etc/fstab` on tdarr server (pending optimization) + +## Monitoring and Validation +- **Success criteria**: Tdarr API download of 23GB+ file completes without hanging +- **Performance target**: Sustained 50+ MB/s throughout entire transfer +- **Reliability target**: No timeouts during large file processing + +## Session Outcome +**Status**: βœ… **ROOT CAUSE IDENTIFIED AND SOLUTION READY** +- Eliminated client-side variables through systematic testing +- Confirmed server-side CIFS configuration as bottleneck +- Validated fix strategy through client-side optimization success +- Ready to implement server-side solution + +--- +*Session Date: 2025-08-11* +*Duration: ~3 hours* +*Methods: Direct testing, API analysis, mount configuration review* \ No newline at end of file diff --git a/tdarr/examples/tdarr-node-configurations.md b/tdarr/examples/tdarr-node-configurations.md new file mode 100644 index 0000000..c5d94b1 --- /dev/null +++ b/tdarr/examples/tdarr-node-configurations.md @@ -0,0 +1,183 @@ +# Tdarr Node Container Configurations + +## Overview +Complete examples for running Tdarr transcoding nodes in containers, covering both CPU-only and GPU-accelerated setups. 
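+
+Before choosing a configuration, it can help to confirm that the node host can actually reach the Tdarr server. A minimal pre-flight check (assuming the 8266 server port and 8265 webUI port used throughout these examples):
+
+```bash
+# Confirm the node host can reach the Tdarr server port
+nc -zv YOUR_TDARR_SERVER_IP 8266
+
+# Optionally query the server API via the webUI port
+curl -s http://YOUR_TDARR_SERVER_IP:8265/api/v2/status
+```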
+ +## CPU-Only Configuration (Docker Compose) + +For systems without GPU or when GPU isn't needed: + +```yaml +version: "3.4" +services: + tdarr-node: + container_name: tdarr-node-cpu + image: ghcr.io/haveagitgat/tdarr_node:latest + restart: unless-stopped + environment: + - TZ=America/Chicago + - UMASK_SET=002 + - nodeName=local-workstation-cpu + - serverIP=YOUR_TDARR_SERVER_IP # Replace with your tdarr server IP + - serverPort=8266 + - inContainer=true + - ffmpegVersion=6 + volumes: + # Mount your media from the same NAS share as the server + - /path/to/your/media:/media # Replace with your local media mount + # Temp directory for transcoding cache + - ./temp:/temp +``` + +**Use case**: +- CPU-only transcoding +- Testing Tdarr functionality +- Systems without dedicated GPU +- When GPU drivers aren't available + +## GPU-Accelerated Configuration (Podman) + +**Recommended for Fedora/RHEL/CentOS/Nobara systems:** + +### Mapped Node (Direct Media Access) +```bash +podman run -d --name tdarr-node-gpu-mapped \ + --gpus all \ + --restart unless-stopped \ + -e TZ=America/Chicago \ + -e UMASK_SET=002 \ + -e nodeName=local-workstation-gpu-mapped \ + -e serverIP=10.10.0.43 \ + -e serverPort=8266 \ + -e inContainer=true \ + -e ffmpegVersion=6 \ + -e NVIDIA_DRIVER_CAPABILITIES=all \ + -e NVIDIA_VISIBLE_DEVICES=all \ + -v /mnt/NV2/tdarr-cache:/cache \ + -v /mnt/media/TV:/media/TV \ + -v /mnt/media/Movies:/media/Movies \ + -v /mnt/media/tdarr/tdarr-cache-clean:/temp \ + ghcr.io/haveagitgat/tdarr_node:latest +``` + +### Unmapped Node (Downloads Files) +```bash +podman run -d --name tdarr-node-gpu-unmapped \ + --gpus all \ + --restart unless-stopped \ + -e TZ=America/Chicago \ + -e UMASK_SET=002 \ + -e nodeName=local-workstation-gpu-unmapped \ + -e serverIP=10.10.0.43 \ + -e serverPort=8266 \ + -e inContainer=true \ + -e ffmpegVersion=6 \ + -e NVIDIA_DRIVER_CAPABILITIES=all \ + -e NVIDIA_VISIBLE_DEVICES=all \ + -v /mnt/NV2/tdarr-cache:/cache \ + -v /mnt/media:/media \ + -v /mnt/media/tdarr/tdarr-cache-clean:/temp \ + ghcr.io/haveagitgat/tdarr_node:latest +``` + +**Use cases**: +- **Mapped**: Direct media access, faster processing, no file downloads +- **Unmapped**: Works when network shares aren't available locally +- Hardware video encoding/decoding (NVENC/NVDEC) +- High-performance transcoding with NVMe cache +- Multiple concurrent streams +- Fedora-based systems where Podman works better than Docker + +## GPU-Accelerated Configuration (Docker) + +**For Ubuntu/Debian systems where Docker GPU support works:** + +```yaml +version: "3.4" +services: + tdarr-node: + container_name: tdarr-node-gpu + image: ghcr.io/haveagitgat/tdarr_node:latest + restart: unless-stopped + environment: + - TZ=America/Chicago + - UMASK_SET=002 + - nodeName=local-workstation-gpu + - serverIP=YOUR_TDARR_SERVER_IP + - serverPort=8266 + - inContainer=true + - ffmpegVersion=6 + - NVIDIA_DRIVER_CAPABILITIES=all + - NVIDIA_VISIBLE_DEVICES=all + volumes: + - /path/to/your/media:/media + - ./temp:/temp + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [gpu] +``` + +## Configuration Parameters + +### Required Environment Variables +- `TZ`: Timezone (e.g., `America/Chicago`) +- `nodeName`: Unique identifier for this node +- `serverIP`: IP address of Tdarr server +- `serverPort`: Tdarr server port (typically 8266) +- `inContainer`: Set to `true` for containerized deployments +- `ffmpegVersion`: FFmpeg version to use (6 recommended) + +### GPU-Specific Variables +- 
`NVIDIA_DRIVER_CAPABILITIES`: Set to `all` for full GPU access
+- `NVIDIA_VISIBLE_DEVICES`: `all` for all GPUs, or specific GPU IDs
+
+### Volume Mounts
+- `/media`: Mount point for media files (must match server configuration)
+- `/temp`: Temporary directory for transcoding cache
+
+## Platform-Specific Recommendations
+
+### Fedora/RHEL/CentOS/Nobara
+- **GPU**: Use Podman (Docker Desktop has GPU issues)
+- **CPU**: Docker or Podman both work fine
+
+### Ubuntu/Debian
+- **GPU**: Use Docker with nvidia-container-toolkit
+- **CPU**: Docker recommended
+
+### Testing GPU Functionality
+
+Verify GPU access inside container:
+```bash
+# For Podman
+podman exec tdarr-node-gpu nvidia-smi
+
+# For Docker
+docker exec tdarr-node-gpu nvidia-smi
+```
+
+Test NVENC encoding:
+```bash
+# For Podman
+podman exec tdarr-node-gpu /usr/local/bin/tdarr-ffmpeg -f lavfi -i testsrc2=duration=5:size=1920x1080:rate=30 -c:v h264_nvenc -t 5 /tmp/test.mp4
+
+# For Docker
+docker exec tdarr-node-gpu /usr/local/bin/tdarr-ffmpeg -f lavfi -i testsrc2=duration=5:size=1920x1080:rate=30 -c:v h264_nvenc -t 5 /tmp/test.mp4
+```
+
+## Troubleshooting
+
+- **GPU not detected**: See `docker/examples/nvidia-gpu-troubleshooting.md`
+- **Permission issues**: Ensure proper UMASK_SET and volume permissions
+- **Connection issues**: Verify serverIP and firewall settings
+- **Performance issues**: Monitor CPU/GPU utilization during transcoding
+
+## Related Documentation
+
+- `docker/examples/gpu-acceleration.md` - GPU acceleration patterns
+- `docker/examples/nvidia-gpu-troubleshooting.md` - Detailed GPU troubleshooting
+- `tdarr/scripts/start-tdarr-gpu-podman-clean.sh` - Ready-to-use Podman startup script
\ No newline at end of file
diff --git a/tdarr/examples/tdarr-node-local/docker-compose-cpu.yml b/tdarr/examples/tdarr-node-local/docker-compose-cpu.yml
new file mode 100644
index 0000000..3c4f574
--- /dev/null
+++ b/tdarr/examples/tdarr-node-local/docker-compose-cpu.yml
@@ -0,0 +1,28 @@
+version: "3.4"
+services:
+  tdarr-node:
+    container_name: tdarr-node-local-cpu
+    image: ghcr.io/haveagitgat/tdarr_node:latest
+    restart: unless-stopped
+    environment:
+      - TZ=America/Chicago
+      - UMASK_SET=002
+      - nodeName=local-workstation-cpu
+      - serverIP=192.168.1.100  # Replace with your Tdarr server IP
+      - serverPort=8266
+      - inContainer=true
+      - ffmpegVersion=6
+    volumes:
+      # Media access (same as server)
+      - /mnt/media:/media  # Replace with your media path
+      # Local transcoding cache
+      - ./temp:/temp
+    # Resource limits for CPU transcoding
+    deploy:
+      resources:
+        limits:
+          cpus: '14'   # Leave some cores for system (16-core = use 14)
+          memory: 32G  # Generous for 4K transcoding
+        reservations:
+          cpus: '8'    # Minimum guaranteed cores
+          memory: 16G
\ No newline at end of file
diff --git a/tdarr/examples/tdarr-node-local/docker-compose-gpu.yml b/tdarr/examples/tdarr-node-local/docker-compose-gpu.yml
new file mode 100644
index 0000000..592e194
--- /dev/null
+++ b/tdarr/examples/tdarr-node-local/docker-compose-gpu.yml
@@ -0,0 +1,45 @@
+version: "3.4"
+services:
+  tdarr-node:
+    container_name: tdarr-node-local-gpu
+    image: ghcr.io/haveagitgat/tdarr_node:latest
+    restart: unless-stopped
+    environment:
+      - TZ=America/Chicago
+      - UMASK_SET=002
+      - nodeName=local-workstation-gpu
+      - serverIP=192.168.1.100  # Replace with your Tdarr server IP
+      - serverPort=8266
+      - inContainer=true
+      - ffmpegVersion=6
+      # NVIDIA environment variables
+      - NVIDIA_DRIVER_CAPABILITIES=all
+      - NVIDIA_VISIBLE_DEVICES=all
+    volumes:
+      # Media access (same as server)
+      - /mnt/media:/media  # 
Replace with your media path + # Local transcoding cache + - ./temp:/temp + devices: + - /dev/dri:/dev/dri # Intel/AMD GPU fallback + + # GPU configuration - choose ONE method: + + # Method 1: Deploy syntax (recommended) + deploy: + resources: + limits: + memory: 16G # GPU transcoding uses less RAM + reservations: + memory: 8G + devices: + - driver: nvidia + count: all + capabilities: [gpu] + + # Method 2: Runtime (alternative) + # runtime: nvidia + + # Method 3: CDI (future) + # devices: + # - nvidia.com/gpu=all \ No newline at end of file diff --git a/tdarr/examples/tdarr-node-local/start-tdarr-mapped-node.sh b/tdarr/examples/tdarr-node-local/start-tdarr-mapped-node.sh new file mode 100755 index 0000000..3b09a8f --- /dev/null +++ b/tdarr/examples/tdarr-node-local/start-tdarr-mapped-node.sh @@ -0,0 +1,83 @@ +#!/bin/bash +# Tdarr Mapped Node with GPU Support - Example Script +# This script starts a MAPPED Tdarr node container with NVIDIA GPU acceleration using Podman +# +# MAPPED NODES: Direct access to media files via volume mounts +# Use this approach when you want the node to directly access your media library +# for local processing without server coordination for file transfers +# +# Configure these variables for your setup: + +set -e + +CONTAINER_NAME="tdarr-node-gpu-mapped" +SERVER_IP="YOUR_SERVER_IP" # e.g., "10.10.0.43" or "192.168.1.100" +SERVER_PORT="8266" # Default Tdarr server port +NODE_NAME="YOUR_NODE_NAME" # e.g., "workstation-gpu" or "local-gpu-node" +MEDIA_PATH="/path/to/your/media" # e.g., "/mnt/media" or "/home/user/Videos" +CACHE_PATH="/path/to/cache" # e.g., "/mnt/ssd/tdarr-cache" + +echo "πŸš€ Starting MAPPED Tdarr Node with GPU support using Podman..." +echo " Media Path: ${MEDIA_PATH}" +echo " Cache Path: ${CACHE_PATH}" + +# Stop and remove existing container if it exists +if podman ps -a --format "{{.Names}}" | grep -q "^${CONTAINER_NAME}$"; then + echo "πŸ›‘ Stopping existing container: ${CONTAINER_NAME}" + podman stop "${CONTAINER_NAME}" 2>/dev/null || true + podman rm "${CONTAINER_NAME}" 2>/dev/null || true +fi + +# Start Tdarr node with GPU support +echo "🎬 Starting Tdarr Node container..." +podman run -d --name "${CONTAINER_NAME}" \ + --gpus all \ + --restart unless-stopped \ + -e TZ=America/Chicago \ + -e UMASK_SET=002 \ + -e nodeName="${NODE_NAME}" \ + -e serverIP="${SERVER_IP}" \ + -e serverPort="${SERVER_PORT}" \ + -e inContainer=true \ + -e ffmpegVersion=6 \ + -e logLevel=DEBUG \ + -e NVIDIA_DRIVER_CAPABILITIES=all \ + -e NVIDIA_VISIBLE_DEVICES=all \ + -v "${MEDIA_PATH}:/media" \ + -v "${CACHE_PATH}:/temp" \ + ghcr.io/haveagitgat/tdarr_node:latest + +echo "⏳ Waiting for container to initialize..." +sleep 5 + +# Check container status +if podman ps --format "{{.Names}}" | grep -q "^${CONTAINER_NAME}$"; then + echo "βœ… Mapped Tdarr Node is running successfully!" + echo "" + echo "πŸ“Š Container Status:" + podman ps --filter "name=${CONTAINER_NAME}" --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}" + echo "" + echo "πŸ” Testing GPU Access:" + if podman exec "${CONTAINER_NAME}" nvidia-smi --query-gpu=name --format=csv,noheader,nounits 2>/dev/null; then + echo "πŸŽ‰ GPU is accessible in container!" 
+ else + echo "⚠️ GPU test failed, but container is running" + fi + echo "" + echo "🌐 Connection Details:" + echo " Server: ${SERVER_IP}:${SERVER_PORT}" + echo " Node Name: ${NODE_NAME}" + echo "" + echo "πŸ§ͺ Test NVENC encoding:" + echo " podman exec ${CONTAINER_NAME} /usr/local/bin/tdarr-ffmpeg -f lavfi -i testsrc2=duration=5:size=1920x1080:rate=30 -c:v h264_nvenc -preset fast -t 5 /tmp/test.mp4" + echo "" + echo "πŸ“‹ Container Management:" + echo " View logs: podman logs ${CONTAINER_NAME}" + echo " Stop: podman stop ${CONTAINER_NAME}" + echo " Remove: podman rm ${CONTAINER_NAME}" +else + echo "❌ Failed to start container" + echo "πŸ“‹ Checking logs..." + podman logs "${CONTAINER_NAME}" --tail 10 + exit 1 +fi \ No newline at end of file diff --git a/tdarr/examples/tdarr-server-setup/README.md b/tdarr/examples/tdarr-server-setup/README.md new file mode 100644 index 0000000..d7f4a4d --- /dev/null +++ b/tdarr/examples/tdarr-server-setup/README.md @@ -0,0 +1,69 @@ +# Tdarr Server Setup Example + +## Directory Structure +``` +~/container-data/tdarr/ +β”œβ”€β”€ docker-compose.yml +β”œβ”€β”€ stonefish-tdarr-plugins/ # Custom plugins +β”œβ”€β”€ tdarr/ +β”‚ β”œβ”€β”€ server/ # Local storage +β”‚ β”œβ”€β”€ configs/ +β”‚ └── logs/ +└── temp/ # Local temp if needed +``` + +## Storage Strategy + +### Local Storage (Fast Access) +- **Database**: SQLite requires local filesystem for WAL mode +- **Configs**: Frequently accessed during startup +- **Logs**: Regular writes during operation + +### Network Storage (Capacity) +- **Backups**: Infrequent access, large files +- **Media**: Read-only during transcoding +- **Cache**: Temporary transcoding files + +## Upgrade Process + +### Major Version Upgrades +1. **Backup current state** + ```bash + docker-compose down + cp docker-compose.yml docker-compose.yml.backup + ``` + +2. **For clean start** (recommended for major versions): + ```bash + # Remove old database + sudo rm -rf ./tdarr/server + mkdir -p ./tdarr/server + + # Pull latest image + docker-compose pull + + # Start fresh + docker-compose up -d + ``` + +3. 
**Monitor initialization** + ```bash + docker-compose logs -f + ``` + +## Common Issues + +### Disk Space +- Monitor local database growth +- Regular cleanup of old backups +- Use network storage for large static data + +### Permissions +- Container runs as PUID/PGID (usually 0/0) +- Ensure proper ownership of mounted directories +- Use `sudo rm -rf` for root-owned container files + +### Network Filesystem Issues +- SQLite incompatible with NFS/SMB for database +- Keep database local, only backups on network +- Monitor transcoding cache disk usage \ No newline at end of file diff --git a/tdarr/examples/tdarr-server-setup/docker-compose.yml b/tdarr/examples/tdarr-server-setup/docker-compose.yml new file mode 100644 index 0000000..4291d43 --- /dev/null +++ b/tdarr/examples/tdarr-server-setup/docker-compose.yml @@ -0,0 +1,37 @@ +version: "3.4" +services: + tdarr: + container_name: tdarr + image: ghcr.io/haveagitgat/tdarr:latest + restart: unless-stopped + network_mode: bridge + ports: + - 8265:8265 # webUI port + - 8266:8266 # server port + environment: + - TZ=America/Chicago + - PUID=0 + - PGID=0 + - UMASK_SET=002 + - serverIP=0.0.0.0 + - serverPort=8266 + - webUIPort=8265 + - internalNode=false # Disable for distributed setup + - inContainer=true + - ffmpegVersion=6 + - nodeName=docker-server + volumes: + # Plugin mounts (stonefish example) + - ./stonefish-tdarr-plugins/FlowPlugins/:/app/server/Tdarr/Plugins/FlowPlugins/ + - ./stonefish-tdarr-plugins/FlowPluginsTs/:/app/server/Tdarr/Plugins/FlowPluginsTs/ + - ./stonefish-tdarr-plugins/Community/:/app/server/Tdarr/Plugins/Community/ + + # Hybrid storage strategy + - ./tdarr/server:/app/server # Local: Database, configs, logs + - ./tdarr/configs:/app/configs + - ./tdarr/logs:/app/logs + - /mnt/truenas-share/tdarr/tdarr-server/Backups:/app/server/Tdarr/Backups # Network: Backups + + # Media and cache + - /mnt/truenas-share:/media + - /mnt/truenas-share/tdarr/tdarr-cache:/temp \ No newline at end of file diff --git a/tdarr/scripts/CONTEXT.md b/tdarr/scripts/CONTEXT.md new file mode 100644 index 0000000..864128b --- /dev/null +++ b/tdarr/scripts/CONTEXT.md @@ -0,0 +1,212 @@ +# Tdarr Scripts - Operational Context + +## Script Overview +This directory contains active operational scripts for Tdarr transcoding automation, gaming-aware scheduling, and system management. 
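+
+For a feel of the time-window logic these scripts implement, here is a minimal sketch (illustrative only; the production logic lives in `tdarr-cron-check-configurable.sh`) of testing whether the current hour falls inside an overnight block such as `22-07`:
+
+```bash
+# Illustrative sketch -- not the production script
+HOUR=$((10#$(date +%H)))   # force base-10 so "08"/"09" parse correctly
+START=22; END=07; IN_WINDOW=0
+if [ "$START" -le "$END" ]; then
+  # Same-day window, e.g. 09-17
+  [ "$HOUR" -ge "$START" ] && [ "$HOUR" -lt "$END" ] && IN_WINDOW=1
+else
+  # Overnight window wraps past midnight, e.g. 22-07
+  { [ "$HOUR" -ge "$START" ] || [ "$HOUR" -lt "$END" ]; } && IN_WINDOW=1
+fi
+echo "in window: $IN_WINDOW"
+```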
+ +## Core Scripts + +### Gaming-Aware Scheduler +**Primary Script**: `tdarr-schedule-manager.sh` +**Purpose**: Comprehensive management interface for gaming-aware Tdarr scheduling + +**Key Functions**: +- **Preset Management**: Quick schedule templates (night-only, work-safe, weekend-heavy, gaming-only) +- **Installation**: Automated cron job setup and configuration +- **Status Monitoring**: Real-time status and logging +- **Configuration**: Interactive schedule editing and validation + +**Usage Patterns**: +```bash +# Quick setup +./tdarr-schedule-manager.sh preset work-safe +./tdarr-schedule-manager.sh install + +# Monitoring +./tdarr-schedule-manager.sh status +./tdarr-schedule-manager.sh logs + +# Testing +./tdarr-schedule-manager.sh test +``` + +### Container Management +**Start Script**: `start-tdarr-gpu-podman-clean.sh` +**Purpose**: Launch unmapped Tdarr node with optimized configuration + +**Key Features**: +- **Unmapped Node Configuration**: Local cache for optimal performance +- **GPU Support**: Full NVIDIA device passthrough +- **Resource Optimization**: Direct NVMe cache mapping +- **Clean Architecture**: No media volume dependencies + +**Stop Script**: `stop-tdarr-gpu-podman.sh` +**Purpose**: Graceful container shutdown with cleanup + +### Scheduling Engine +**Core Engine**: `tdarr-cron-check-configurable.sh` +**Purpose**: Minute-by-minute decision engine for Tdarr state management + +**Decision Logic**: +1. **Gaming Detection**: Check for active gaming processes +2. **GPU Monitoring**: Verify GPU usage below threshold (15%) +3. **Time Window Validation**: Ensure current time within allowed schedule +4. **State Management**: Start/stop Tdarr based on conditions + +**Gaming Process Detection**: +- Steam, Lutris, Heroic Games Launcher +- Wine, Bottles (Windows compatibility layers) +- GameMode, MangoHUD (gaming utilities) +- GPU usage monitoring via nvidia-smi + +### Configuration Management +**Config File**: `tdarr-schedule.conf` +**Purpose**: Centralized configuration for scheduler behavior + +**Configuration Structure**: +```bash +# Time blocks: "HOUR_START-HOUR_END:DAYS" +SCHEDULE_BLOCKS="22-07:daily 09-17:1-5" + +# Gaming detection settings +GPU_THRESHOLD=15 +GAMING_PROCESSES="steam lutris heroic wine bottles gamemode mangohud" + +# Operational settings +LOG_FILE="/tmp/tdarr-scheduler.log" +CONTAINER_NAME="tdarr-node-gpu" +``` + +## Operational Patterns + +### Automated Maintenance +**Cron Integration**: Two automated systems running simultaneously +1. **Scheduler** (every minute): `tdarr-cron-check-configurable.sh` +2. 
**Cleanup** (every 6 hours): Temporary directory maintenance + +**Cleanup Automation**: +```bash +# Removes abandoned transcoding directories +0 */6 * * * find /tmp -name "tdarr-workDir2-*" -type d -mmin +360 -exec rm -rf {} \; 2>/dev/null || true +``` + +### Logging Strategy +**Log Location**: `/tmp/tdarr-scheduler.log` +**Log Format**: Timestamped entries with decision reasoning +**Log Rotation**: Manual cleanup, focused on recent activity + +**Log Examples**: +``` +[2025-08-13 14:30:01] Gaming detected (steam), stopping Tdarr +[2025-08-13 14:35:01] Gaming ended, but outside allowed hours (14:35 not in 22-07:daily) +[2025-08-13 22:00:01] Starting Tdarr (no gaming, within schedule) +``` + +### System Integration +**Gaming Detection**: Real-time process monitoring +**GPU Monitoring**: nvidia-smi integration for usage thresholds +**Container Management**: Podman-based lifecycle management +**Cron Integration**: Standard system scheduler for automation + +## Configuration Presets + +### Preset Profiles +**night-only**: `"22-07:daily"` - Overnight transcoding only +**work-safe**: `"22-07:daily 09-17:1-5"` - Nights + work hours +**weekend-heavy**: `"22-07:daily 09-17:1-5 08-20:6-7"` - Maximum time +**gaming-only**: No time limits, gaming detection only + +### Schedule Format Specification +**Format**: `"HOUR_START-HOUR_END:DAYS"` +**Examples**: +- `"22-07:daily"` - 10PM to 7AM every day (overnight) +- `"09-17:1-5"` - 9AM to 5PM Monday-Friday +- `"14-16:6,7"` - 2PM to 4PM Saturday and Sunday +- `"08-20:6-7"` - 8AM to 8PM weekends only + +## Container Architecture + +### Unmapped Node Configuration +**Architecture Choice**: Local cache with API-based transfers +**Benefits**: 3-5x performance improvement, reduced network dependency + +**Container Environment**: +```bash +-e nodeType=unmapped +-e unmappedNodeCache=/cache +-e enableGpu=true +-e TZ=America/New_York +``` + +**Volume Configuration**: +```bash +# Local high-speed cache (NVMe) +-v "/mnt/NV2/tdarr-cache:/cache" + +# Configuration persistence +-v "/mnt/NV2/tdarr-cache-clean:/temp" + +# No media volumes (unmapped mode uses API) +``` + +### Resource Management +**GPU Access**: Full NVIDIA device passthrough +**Memory**: Controlled by container limits +**CPU**: Shared with host system +**Storage**: Local NVMe for optimal I/O performance + +## Troubleshooting Context + +### Common Issues +1. **Gaming Not Detected**: Check process names in configuration +2. **Time Window Issues**: Verify schedule block format +3. **Container Start Failures**: Check GPU device access +4. 
**Log File Growth**: Manual cleanup of scheduler logs + +### Diagnostic Commands +```bash +# Test current conditions +./tdarr-schedule-manager.sh test + +# View real-time logs +./tdarr-schedule-manager.sh logs + +# Check container status +podman ps | grep tdarr + +# Verify GPU access +podman exec tdarr-node-gpu nvidia-smi +``` + +### Recovery Procedures +```bash +# Reset to defaults +./tdarr-schedule-manager.sh preset work-safe + +# Reinstall scheduler +./tdarr-schedule-manager.sh install + +# Manual container restart +./stop-tdarr-gpu-podman.sh +./start-tdarr-gpu-podman-clean.sh +``` + +## Integration Points + +### External Dependencies +- **Podman**: Container runtime for node management +- **nvidia-smi**: GPU monitoring and device access +- **cron**: System scheduler for automation +- **SSH**: Remote server access (monitoring scripts) + +### File System Dependencies +- **Cache Directory**: `/mnt/NV2/tdarr-cache` (local NVMe) +- **Temp Directory**: `/mnt/NV2/tdarr-cache-clean` (processing space) +- **Log Files**: `/tmp/tdarr-scheduler.log` (operational logs) +- **Configuration**: Local `tdarr-schedule.conf` file + +### Network Dependencies +- **Tdarr Server**: API communication for unmapped node operation +- **Discord Webhooks**: Optional notification integration (via monitoring) +- **NAS Access**: For final file storage (post-processing only) + +This operational context provides comprehensive guidance for managing active Tdarr automation scripts in production environments. \ No newline at end of file diff --git a/scripts/tdarr/README.md b/tdarr/scripts/README.md similarity index 100% rename from scripts/tdarr/README.md rename to tdarr/scripts/README.md diff --git a/scripts/tdarr/start-tdarr-gpu-podman-clean.sh b/tdarr/scripts/start-tdarr-gpu-podman-clean.sh similarity index 100% rename from scripts/tdarr/start-tdarr-gpu-podman-clean.sh rename to tdarr/scripts/start-tdarr-gpu-podman-clean.sh diff --git a/scripts/tdarr/stop-tdarr-gpu-podman.sh b/tdarr/scripts/stop-tdarr-gpu-podman.sh similarity index 100% rename from scripts/tdarr/stop-tdarr-gpu-podman.sh rename to tdarr/scripts/stop-tdarr-gpu-podman.sh diff --git a/scripts/tdarr/tdarr-cron-check-configurable.sh b/tdarr/scripts/tdarr-cron-check-configurable.sh similarity index 100% rename from scripts/tdarr/tdarr-cron-check-configurable.sh rename to tdarr/scripts/tdarr-cron-check-configurable.sh diff --git a/scripts/tdarr/tdarr-schedule-manager.sh b/tdarr/scripts/tdarr-schedule-manager.sh similarity index 100% rename from scripts/tdarr/tdarr-schedule-manager.sh rename to tdarr/scripts/tdarr-schedule-manager.sh diff --git a/scripts/tdarr/tdarr-schedule.conf b/tdarr/scripts/tdarr-schedule.conf similarity index 100% rename from scripts/tdarr/tdarr-schedule.conf rename to tdarr/scripts/tdarr-schedule.conf diff --git a/tdarr/troubleshooting.md b/tdarr/troubleshooting.md new file mode 100644 index 0000000..5b08c1e --- /dev/null +++ b/tdarr/troubleshooting.md @@ -0,0 +1,272 @@ +# Tdarr Troubleshooting Guide + +## forEach Error Resolution + +### Problem: TypeError: Cannot read properties of undefined (reading 'forEach') +**Symptoms**: Scanning phase fails at "Tagging video res" step, preventing all transcodes +**Root Cause**: Custom plugin mounts override community plugins with incompatible versions + +### Solution: Clean Plugin Installation +1. **Remove custom plugin mounts** from docker-compose.yml +2. **Force plugin regeneration**: + ```bash + ssh tdarr "docker restart tdarr" + podman restart tdarr-node-gpu + ``` +3. 
**Verify clean plugins**: Check for null-safety fixes `(streams || []).forEach()` + +### Plugin Safety Patterns +```javascript +// ❌ Unsafe - causes forEach errors +args.variables.ffmpegCommand.streams.forEach() + +// βœ… Safe - null-safe forEach +(args.variables.ffmpegCommand.streams || []).forEach() +``` + +## Staging Section Timeout Issues + +### Problem: Files removed from staging after 300 seconds +**Symptoms**: +- `.tmp` files stuck in work directories +- ENOTEMPTY errors during cleanup +- Subsequent jobs blocked + +### Solution: Automated Monitoring System +**Monitor Script**: `/mnt/NV2/Development/claude-home/scripts/monitoring/tdarr-timeout-monitor.sh` + +**Automatic Actions**: +- Detects staging timeouts every 20 minutes +- Removes stuck work directories +- Sends Discord notifications +- Logs all cleanup activities + +### Manual Cleanup Commands +```bash +# Check staging section +ssh tdarr "docker logs tdarr | tail -50" + +# Find stuck work directories +find /mnt/NV2/tdarr-cache -name "tdarr-workDir*" -type d + +# Force cleanup stuck directory +rm -rf /mnt/NV2/tdarr-cache/tdarr-workDir-[ID] +``` + +## System Stability Issues + +### Problem: Kernel crashes during intensive transcoding +**Root Cause**: CIFS network issues during large file streaming (mapped nodes) + +### Solution: Convert to Unmapped Node Architecture +1. **Enable unmapped nodes** in server Options +2. **Update node configuration**: + ```bash + # Add to container environment + -e nodeType=unmapped + -e unmappedNodeCache=/cache + + # Use local cache volume + -v "/mnt/NV2/tdarr-cache:/cache" + + # Remove media volume (no longer needed) + ``` +3. **Benefits**: Eliminates CIFS streaming, prevents kernel crashes + +### Container Resource Limits +```yaml +# Prevent memory exhaustion +deploy: + resources: + limits: + memory: 8G + cpus: '6' +``` + +## Gaming Detection Issues + +### Problem: Tdarr doesn't stop during gaming +**Check gaming detection**: +```bash +# Test current gaming detection +./tdarr-schedule-manager.sh test + +# View scheduler logs +tail -f /tmp/tdarr-scheduler.log + +# Verify GPU usage detection +nvidia-smi +``` + +### Gaming Process Detection +**Monitored Processes**: +- Steam, Lutris, Heroic Games Launcher +- Wine, Bottles (Windows compatibility) +- GameMode, MangoHUD (utilities) +- **GPU usage >15%** (configurable threshold) + +### Configuration Adjustments +```bash +# Edit gaming detection threshold +./tdarr-schedule-manager.sh edit + +# Apply preset configurations +./tdarr-schedule-manager.sh preset gaming-only # No time limits +./tdarr-schedule-manager.sh preset night-only # 10PM-7AM only +``` + +## Network and Access Issues + +### Server Connection Problems +**Server Access Commands**: +```bash +# SSH to Tdarr server +ssh tdarr + +# Check server status +ssh tdarr "docker ps | grep tdarr" + +# View server logs +ssh tdarr "docker logs tdarr" + +# Access server container +ssh tdarr "docker exec -it tdarr /bin/bash" +``` + +### Node Registration Issues +```bash +# Check node logs +podman logs tdarr-node-gpu + +# Verify node registration +# Look for "Node registered" in server logs +ssh tdarr "docker logs tdarr | grep -i node" + +# Test node connectivity +curl http://10.10.0.43:8265/api/v2/status +``` + +## Performance Issues + +### Slow Transcoding Performance +**Diagnosis**: +1. **Check cache location**: Should be local NVMe, not network +2. **Verify unmapped mode**: `nodeType=unmapped` in container +3. 
+ +**Expected Performance**: +- **Mapped nodes**: Constant SMB streaming (~100MB/s) +- **Unmapped nodes**: Download once β†’ Process locally β†’ Upload once + +### GPU Utilization Problems +```bash +# Monitor GPU usage during transcoding +watch nvidia-smi + +# Check GPU device access in container +podman exec tdarr-node-gpu nvidia-smi + +# Verify NVENC encoder availability +podman exec tdarr-node-gpu ffmpeg -encoders | grep nvenc +``` + +## Plugin System Issues + +### Plugin Loading Failures +**Troubleshooting Steps**: +1. **Check plugin directory**: Ensure no custom mounts override community plugins +2. **Verify dependencies**: FlowHelper files (`metadataUtils.js`, `letterboxUtils.js`) +3. **Test plugin syntax**: + ```bash + # Test plugin in Node.js + node -e "require('./path/to/plugin.js')" + ``` + +### Custom Plugin Integration +**Safe Integration Pattern**: +1. **Selective mounting**: Mount only specific required plugins +2. **Dependency verification**: Include all FlowHelper dependencies +3. **Version compatibility**: Ensure plugins match Tdarr version +4. **Null-safety checks**: Add `|| []` to forEach operations + +## Monitoring and Logging + +### Log Locations +```bash +# Scheduler logs +tail -f /tmp/tdarr-scheduler.log + +# Monitor logs +tail -f /tmp/tdarr-monitor/monitor.log + +# Server logs +ssh tdarr "docker logs tdarr" + +# Node logs +podman logs tdarr-node-gpu +``` + +### Discord Notification Issues +**Check webhook configuration**: +```bash +# Test Discord webhook +curl -X POST [WEBHOOK_URL] \ + -H "Content-Type: application/json" \ + -d '{"content": "Test message"}' +``` + +**Common Issues**: +- JSON escaping in message content +- Markdown formatting in Discord +- User ping placement (outside code blocks) + +## Emergency Recovery + +### Complete System Reset +```bash +# Stop all containers +podman stop tdarr-node-gpu +ssh tdarr "docker stop tdarr" + +# Clean cache directories +rm -rf /mnt/NV2/tdarr-cache/tdarr-workDir* + +# Remove scheduler +crontab -e  # Delete tdarr lines + +# Restart with clean configuration +./start-tdarr-gpu-podman-clean.sh +./tdarr-schedule-manager.sh preset work-safe +./tdarr-schedule-manager.sh install +``` + +### Data Recovery +**Important**: Tdarr processes files in place; original files remain untouched +- **Queue data**: Stored in server configuration (`/app/configs`) +- **Progress data**: Lost on container restart (unmapped nodes) +- **Cache files**: Safe to delete, will re-download + +## Common Error Patterns + +### "Copy failed" in Staging Section +**Cause**: Network timeout during file transfer to unmapped node +**Solution**: The monitoring system retries the transfer automatically + +### "ENOTEMPTY" Directory Cleanup Errors +**Cause**: Partial downloads leave files in work directories +**Solution**: Force-remove the directories; the monitoring system handles this automatically + +### Node Disconnection During Processing +**Cause**: Gaming detection or manual stop during active job +**Result**: File returns to queue automatically, safe to restart + +## Prevention Best Practices + +1. **Use unmapped node architecture** for stability +2. **Implement monitoring system** for automatic cleanup +3. **Configure gaming-aware scheduling** for desktop systems (see the sketch after this list) +4. **Set container resource limits** to prevent crashes +5. **Use clean plugin installation** to avoid forEach errors +6. **Monitor system resources** during intensive operations
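+ +A minimal sketch of the GPU-usage side of practice 3, checking utilization against the configurable 15% threshold (illustrative only, not the schedule manager's actual implementation): +```bash +# Treat the desktop as "gaming" when GPU utilization exceeds the threshold +util=$(nvidia-smi --query-gpu=utilization.gpu --format=csv,noheader,nounits | head -n1) +if [ "${util:-0}" -gt 15 ]; then +  echo "GPU at ${util}% - pausing Tdarr node" +fi +```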
+ +This troubleshooting guide covers the most common issues and their resolutions for production Tdarr deployments. \ No newline at end of file diff --git a/vm-management/CONTEXT.md b/vm-management/CONTEXT.md new file mode 100644 index 0000000..5be2f1e --- /dev/null +++ b/vm-management/CONTEXT.md @@ -0,0 +1,296 @@ +# Virtual Machine Management - Technology Context + +## Overview +Virtual machine management for home lab environments with a focus on automated provisioning, infrastructure as code, and security-first configuration. This context covers VM lifecycle management, Proxmox integration, and standardized deployment patterns. + +## Architecture Patterns + +### Infrastructure as Code (IaC) Approach +**Pattern**: Declarative VM configuration with repeatable deployments +```yaml +#cloud-config +# Cloud-init template pattern (the #cloud-config header must stay on the first line) +users: + - name: cal + groups: [sudo, docker] + ssh_authorized_keys: + - ssh-rsa AAAAB3... primary-key + - ssh-rsa AAAAB3... emergency-key +packages: + - docker.io + - docker-compose +runcmd: + - systemctl enable docker + - usermod -aG docker cal +``` + +### Template-Based Deployment Strategy +**Pattern**: Standardized VM templates with cloud-init automation +- **Base Templates**: Ubuntu Server with cloud-init support +- **Resource Allocation**: Standardized sizing (2CPU/4GB/20GB baseline) +- **Network Configuration**: Predefined VLAN assignments (10.10.0.x internal) +- **Security Hardening**: SSH keys only, password auth disabled + +## Provisioning Strategies + +### Cloud-Init Deployment (Recommended for New VMs) +**Purpose**: Fully automated VM provisioning from first boot +**Implementation**: +1. Create VM in Proxmox with cloud-init support +2. Apply standardized cloud-init template +3. VM configures itself automatically on first boot +4. No manual intervention required
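+ +On Proxmox, steps 1-2 can be scripted with `qm` (a sketch; the VMID, storage name, and cloud-image filename are assumptions, while `cloud-init-user-data.yaml` is this repo's template placed in the storage's snippets directory): +```bash +# Create the VM shell and import an Ubuntu cloud image as its disk +qm create 9001 --name ubuntu-ci --memory 4096 --cores 2 --net0 virtio,bridge=vmbr0 +qm importdisk 9001 jammy-server-cloudimg-amd64.img local-lvm +qm set 9001 --scsihw virtio-scsi-pci --scsi0 local-lvm:vm-9001-disk-0 + +# Attach the cloud-init drive and point it at the standardized template +qm set 9001 --ide2 local-lvm:cloudinit --boot order=scsi0 +qm set 9001 --cicustom "user=local:snippets/cloud-init-user-data.yaml" +qm start 9001 +```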
+ +**Benefits**: +- Zero-touch deployment +- Consistent configuration +- Security hardening from first boot +- Immediate productivity + +### Post-Install Scripting (Existing VMs) +**Purpose**: Standardize existing VM configurations +**Implementation**: +```bash +./vm-post-install.sh <vm-ip> [username] +# Automated: updates, SSH keys, Docker, hardening +``` + +**Use Cases**: +- Legacy VM standardization +- Imported VM configuration +- Recovery and remediation +- Incremental improvements + +## Security Architecture + +### SSH Key-Based Authentication +**Pattern**: Dual-key deployment for security and redundancy + +```bash +# Primary access key +~/.ssh/homelab_rsa           # Daily operations + +# Emergency access key +~/.ssh/emergency_homelab_rsa # Backup/recovery access +``` + +**Security Controls**: +- Password authentication completely disabled +- Root login prohibited +- SSH keys managed centrally +- Automatic key deployment + +### User Privilege Management +**Pattern**: Least privilege with sudo elevation +```yaml +# User configuration +username: cal +groups: [sudo, docker]        # Minimal required groups +shell: /bin/bash +sudo: ALL=(ALL) NOPASSWD:ALL  # Operational convenience +``` + +**Access Controls**: +- Non-root user accounts only +- Sudo required for administrative tasks +- Docker group for container management +- SSH key authentication mandatory + +### Network Security +**Pattern**: Network segmentation and access control +- **Internal Network**: 10.10.0.x/24 for VM communication +- **Management Access**: SSH (port 22) only +- **Service Isolation**: Application-specific port exposure +- **Firewall Ready**: iptables/ufw configuration prepared + +## Lifecycle Management Patterns + +### VM Creation Workflow +1. **Template Selection**: Choose appropriate base image +2. **Resource Allocation**: Size based on workload requirements +3. **Network Assignment**: VLAN and IP address planning +4. **Cloud-Init Configuration**: Apply standardized template +5. **Automated Provisioning**: Zero-touch deployment +6. **Verification**: Automated connectivity and configuration tests
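+ +Step 6 can be a one-liner once SSH keys are in place (a sketch; `<vm-ip>` is a placeholder): +```bash +# Wait for cloud-init to finish, then confirm Docker and passwordless sudo +ssh -o BatchMode=yes cal@<vm-ip> 'cloud-init status --wait && docker --version && sudo -n true && echo VERIFIED' +```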
+ +### Configuration Management +**Pattern**: Standardized system configuration +```yaml +# Essential packages +packages: [ +  "curl", "wget", "git", "vim", "htop", "unzip", +  "docker.io", "docker-compose-plugin" +] + +# System services +runcmd: + - systemctl enable docker + - systemctl enable ssh + - systemctl enable unattended-upgrades +``` + +### Maintenance Automation +**Pattern**: Automated updates and maintenance +- **Security Updates**: Automatic installation enabled +- **Package Management**: Standardized package selection +- **Service Management**: Consistent service configuration +- **Log Management**: Centralized logging ready + +## Resource Management + +### Sizing Standards +**Pattern**: Standardized VM resource allocation + +```yaml +# Basic workload (web services, small databases) +vcpus: 2 +memory: 4096   # 4GB +disk: 20       # 20GB + +# Medium workload (application servers, medium databases) +vcpus: 4 +memory: 8192   # 8GB +disk: 40       # 40GB + +# Heavy workload (transcoding, large databases) +vcpus: 6 +memory: 16384  # 16GB +disk: 100      # 100GB +``` + +### Storage Strategy +**Pattern**: Application-appropriate storage allocation +- **System Disk**: OS and applications (20-40GB) +- **Data Volumes**: Application data (variable) +- **Backup Storage**: Network-attached for persistence +- **Cache Storage**: Local fast storage for performance + +### Network Planning +**Pattern**: Structured network addressing +```yaml +# Network segments +management: 10.10.0.x/24   # VM management and SSH access +services: 10.10.1.x/24     # Application services +storage: 10.10.2.x/24      # Storage and backup traffic +dmz: 10.10.10.x/24         # External-facing services +``` + +## Monitoring and Operations + +### Health Monitoring +**Pattern**: Automated system health checks +```yaml +# Resource monitoring +cpu_usage: <80% +memory_usage: <90% +disk_usage: <85% +network_connectivity: verified + +# Service monitoring +ssh_service: active +docker_service: active +unattended_upgrades: active +``` + +### Backup Strategies +**Pattern**: Multi-tier backup approach +- **VM Snapshots**: Point-in-time recovery (Proxmox) +- **Application Data**: Specific application backup procedures +- **Configuration Backup**: Cloud-init templates and scripts +- **SSH Keys**: Centralized key management backup + +### Performance Tuning +**Pattern**: Workload-optimized configuration +```yaml +# CPU optimization +cpu_type: host          # Performance over compatibility +numa: enabled           # NUMA awareness for multi-socket + +# Memory optimization +ballooning: enabled     # Dynamic memory allocation +hugepages: disabled     # Unless specifically needed + +# Storage optimization +cache: writethrough     # Balance performance and safety +io_thread: enabled      # Improve I/O performance +``` + +## Integration Patterns + +### Container Platform Integration +**Pattern**: Docker-ready VM deployment +```yaml +# Automated Docker setup +- docker.io installation +- docker-compose plugin +- User added to docker group +- Service auto-start enabled +- Container runtime verified +``` + +### SSH Infrastructure Integration +**Pattern**: Centralized SSH key management (key push sketch below) +```yaml +# Key deployment automation +primary_key: ~/.ssh/homelab_rsa.pub +emergency_key: ~/.ssh/emergency_homelab_rsa.pub +backup_system: automated +rotation_policy: annual +``` + +### Network Services Integration +**Pattern**: Ready for service deployment +- **Reverse Proxy**: Nginx/Traefik ready configuration +- **DNS**: Local DNS registration prepared +- **Certificates**: Let's Encrypt integration ready +- **Monitoring**: Prometheus/Grafana agent ready
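+ +A minimal way to push both homelab keys to an existing VM (a sketch; `<vm-ip>` is a placeholder): +```bash +# Deploy primary and emergency public keys in one pass +ssh-copy-id -i ~/.ssh/homelab_rsa.pub cal@<vm-ip> +ssh-copy-id -i ~/.ssh/emergency_homelab_rsa.pub cal@<vm-ip> +```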
+ +## Common Implementation Workflows + +### New VM Deployment +1. **Create VM** in Proxmox with cloud-init support +2. **Configure resources** based on workload requirements +3. **Apply cloud-init template** with standardized configuration +4. **Start VM** and wait for automated provisioning +5. **Verify deployment** via SSH key authentication +6. **Deploy applications** using container or package management + +### Existing VM Standardization +1. **Assess current configuration** and identify gaps +2. **Run post-install script** for automated updates +3. **Verify SSH key deployment** and confirm password authentication is disabled +4. **Test Docker installation** and user permissions +5. **Update documentation** with new configuration +6. **Schedule regular maintenance** and monitoring + +### VM Migration and Recovery +1. **Create VM snapshot** before changes +2. **Export VM configuration** and cloud-init template +3. **Test recovery procedure** in staging environment +4. **Document recovery steps** and verification procedures +5. **Implement backup automation** for critical VMs + +## Best Practices + +### Security Hardening +1. **SSH Keys Only**: Disable password authentication completely +2. **Emergency Access**: Deploy backup SSH keys for recovery +3. **User Separation**: Non-root users with sudo privileges +4. **Automatic Updates**: Enable security update automation +5. **Network Isolation**: Use VLANs and firewall rules + +### Operational Excellence +1. **Infrastructure as Code**: Use cloud-init for reproducible deployments +2. **Standardization**: Consistent VM sizing and configuration +3. **Automation**: Minimize manual configuration steps +4. **Documentation**: Maintain deployment templates and procedures +5. **Testing**: Verify deployments before production use + +### Performance Optimization +1. **Resource Right-Sizing**: Match resources to workload requirements +2. **Storage Strategy**: Use appropriate storage tiers +3. **Network Optimization**: Plan network topology for performance +4. **Monitoring**: Implement resource usage monitoring +5. **Capacity Planning**: Plan for growth and scaling + +This technology context provides comprehensive guidance for implementing virtual machine management in home lab and production environments using modern IaC principles and security best practices.
\ No newline at end of file diff --git a/examples/vm-management/proxmox-automation.md b/vm-management/examples/proxmox-automation.md similarity index 100% rename from examples/vm-management/proxmox-automation.md rename to vm-management/examples/proxmox-automation.md diff --git a/reference/vm-management/troubleshooting.md b/vm-management/examples/troubleshooting.md similarity index 100% rename from reference/vm-management/troubleshooting.md rename to vm-management/examples/troubleshooting.md diff --git a/scripts/vm-management/README.md b/vm-management/scripts/README.md similarity index 100% rename from scripts/vm-management/README.md rename to vm-management/scripts/README.md diff --git a/scripts/vm-management/cloud-init-user-data.yaml b/vm-management/scripts/cloud-init-user-data.yaml similarity index 100% rename from scripts/vm-management/cloud-init-user-data.yaml rename to vm-management/scripts/cloud-init-user-data.yaml diff --git a/scripts/vm-management/vm-post-install.sh b/vm-management/scripts/vm-post-install.sh similarity index 100% rename from scripts/vm-management/vm-post-install.sh rename to vm-management/scripts/vm-post-install.sh diff --git a/vm-management/troubleshooting.md b/vm-management/troubleshooting.md new file mode 100644 index 0000000..8fa173c --- /dev/null +++ b/vm-management/troubleshooting.md @@ -0,0 +1,652 @@ +# Virtual Machine Management Troubleshooting Guide + +## VM Provisioning Issues + +### Cloud-Init Configuration Problems + +#### Cloud-Init Not Executing +**Symptoms**: +- VM starts but user accounts not created +- SSH keys not deployed +- Packages not installed +- Configuration not applied + +**Diagnosis**: +```bash +# Check cloud-init status and logs +ssh root@<vm-ip> 'cloud-init status --long' +ssh root@<vm-ip> 'cat /var/log/cloud-init.log' +ssh root@<vm-ip> 'cat /var/log/cloud-init-output.log' + +# Verify cloud-init configuration +ssh root@<vm-ip> 'cloud-init query userdata' + +# Check for YAML syntax errors +ssh root@<vm-ip> 'cloud-init devel schema --config-file /var/lib/cloud/instance/user-data.txt' +``` + +**Solutions**: +```bash +# Re-run cloud-init (CAUTION: may overwrite changes) +ssh root@<vm-ip> 'cloud-init clean --logs' +ssh root@<vm-ip> 'cloud-init init --local' +ssh root@<vm-ip> 'cloud-init init' +ssh root@<vm-ip> 'cloud-init modules --mode=config' +ssh root@<vm-ip> 'cloud-init modules --mode=final' + +# Manual user creation if cloud-init fails +ssh root@<vm-ip> 'useradd -m -s /bin/bash -G sudo,docker cal' +ssh root@<vm-ip> 'mkdir -p /home/cal/.ssh' +ssh root@<vm-ip> 'chown cal:cal /home/cal/.ssh' +ssh root@<vm-ip> 'chmod 700 /home/cal/.ssh' +``` + +#### Invalid Cloud-Init YAML +**Symptoms**: +- Cloud-init fails with syntax errors +- Parser errors in cloud-init logs +- Partial configuration application + +**Common YAML Issues**: +```yaml +# ❌ Incorrect indentation +users: +- name: cal +groups: [sudo, docker]  # Wrong indentation + +# βœ… Correct indentation +users: + - name: cal +   groups: [sudo, docker]  # Proper indentation + +# ❌ Missing quotes for special characters +ssh_authorized_keys: + - ssh-rsa AAAAB3NzaC1... user@host  # May fail with special chars + +# βœ… Quoted strings +ssh_authorized_keys: + - "ssh-rsa AAAAB3NzaC1... user@host" +```
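+ +Linting the user-data file before attaching it to a VM catches these errors early (a sketch; the subcommand location depends on the cloud-init version): +```bash +# Validate user-data against the cloud-init schema +cloud-init schema --config-file cloud-init-user-data.yaml        # newer releases +cloud-init devel schema --config-file cloud-init-user-data.yaml  # older releases +```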
user@host" +``` + +### VM Boot and Startup Issues + +#### VM Won't Start +**Symptoms**: +- VM fails to boot from Proxmox +- Kernel panic messages +- Boot loop or hanging + +**Diagnosis**: +```bash +# Check VM configuration +pvesh get /nodes/pve/qemu//config + +# Check resource allocation +pvesh get /nodes/pve/qemu//status/current + +# Review VM logs via Proxmox console +# Use Proxmox web interface -> VM -> Console + +# Check Proxmox host resources +pvesh get /nodes/pve/status +``` + +**Solutions**: +```bash +# Increase memory allocation +pvesh set /nodes/pve/qemu//config -memory 4096 + +# Reset CPU configuration +pvesh set /nodes/pve/qemu//config -cpu host -cores 2 + +# Check and repair disk +# Stop VM, then: +pvesh get /nodes/pve/qemu//config | grep scsi0 +# Use fsck on the disk image if needed +``` + +#### Resource Constraints +**Symptoms**: +- VM extremely slow performance +- Out-of-memory kills +- Disk I/O bottlenecks + +**Diagnosis**: +```bash +# Inside VM resource check +free -h +df -h +iostat 1 5 +vmstat 1 5 + +# Proxmox host resource check +pvesh get /nodes/pve/status +cat /proc/meminfo +df -h /var/lib/vz +``` + +**Solutions**: +```bash +# Increase VM resources via Proxmox +pvesh set /nodes/pve/qemu//config -memory 8192 +pvesh set /nodes/pve/qemu//config -cores 4 + +# Resize VM disk +# Proxmox GUI: Hardware -> Hard Disk -> Resize +# Then extend filesystem: +sudo growpart /dev/sda 1 +sudo resize2fs /dev/sda1 +``` + +## SSH Access Issues + +### SSH Connection Failures + +#### Cannot Connect to VM +**Symptoms**: +- Connection timeout +- Connection refused +- Host unreachable + +**Diagnosis**: +```bash +# Network connectivity tests +ping +traceroute + +# SSH service tests +nc -zv 22 +nmap -p 22 + +# From Proxmox console, check SSH service +systemctl status sshd +ss -tlnp | grep :22 +``` + +**Solutions**: +```bash +# Via Proxmox console - restart SSH +systemctl start sshd +systemctl enable sshd + +# Check and configure firewall +ufw status +# If blocking SSH: +ufw allow ssh +ufw allow 22/tcp + +# Network configuration reset +ip addr show +dhclient # For DHCP +systemctl restart networking +``` + +#### SSH Key Authentication Failures +**Symptoms**: +- Password prompts despite key installation +- "Permission denied (publickey)" +- "No more authentication methods" + +**Diagnosis**: +```bash +# Verbose SSH debugging +ssh -vvv cal@ + +# Check key files locally +ls -la ~/.ssh/homelab_rsa* +ls -la ~/.ssh/emergency_homelab_rsa* + +# Via console or password auth, check VM +ls -la ~/.ssh/ +cat ~/.ssh/authorized_keys +``` + +**Solutions**: +```bash +# Fix SSH directory permissions +chmod 700 ~/.ssh +chmod 600 ~/.ssh/authorized_keys +chown -R cal:cal ~/.ssh + +# Re-deploy SSH keys +cat > ~/.ssh/authorized_keys << 'EOF' +ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABgQC... # primary key +ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABgQD... 
# emergency key +EOF + +# Verify SSH server configuration +sudo sshd -T | grep -E "(passwordauth|pubkeyauth|permitroot)" +``` + +#### SSH Security Configuration Issues +**Symptoms**: +- Password authentication still enabled +- Root login allowed +- Insecure SSH settings + +**Diagnosis**: +```bash +# Check effective SSH configuration +sudo sshd -T | grep -E "(passwordauth|pubkeyauth|permitroot|allowusers)" + +# Review SSH config files +cat /etc/ssh/sshd_config +ls /etc/ssh/sshd_config.d/ +``` + +**Solutions**: +```bash +# Apply security hardening +sudo tee /etc/ssh/sshd_config.d/99-homelab-security.conf << 'EOF' +PasswordAuthentication no +PubkeyAuthentication yes +PermitRootLogin no +AllowUsers cal +Protocol 2 +ClientAliveInterval 300 +ClientAliveCountMax 2 +MaxAuthTries 3 +X11Forwarding no +EOF + +sudo systemctl restart sshd +``` + +## Docker Installation and Configuration Issues + +### Docker Installation Failures + +#### Package Installation Fails +**Symptoms**: +- Docker packages not found +- GPG key verification errors +- Repository access failures + +**Diagnosis**: +```bash +# Test internet connectivity +ping google.com +curl -I https://download.docker.com + +# Check repository configuration +cat /etc/apt/sources.list.d/docker.list +apt-cache policy docker-ce + +# Check for package conflicts +dpkg -l | grep docker +``` + +**Solutions**: +```bash +# Remove conflicting packages +sudo apt remove -y docker docker-engine docker.io containerd runc + +# Reinstall Docker repository +sudo mkdir -p /etc/apt/keyrings +curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o /etc/apt/keyrings/docker.gpg + +echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" | sudo tee /etc/apt/sources.list.d/docker.list + +# Install Docker +sudo apt update +sudo apt install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin +``` + +#### Docker Service Issues +**Symptoms**: +- Docker daemon won't start +- Socket connection errors +- Service failure on boot + +**Diagnosis**: +```bash +# Check service status +systemctl status docker +journalctl -u docker.service -f + +# Check system resources +df -h +free -h + +# Test daemon manually +sudo dockerd --debug +``` + +**Solutions**: +```bash +# Restart Docker service +sudo systemctl stop docker +sudo systemctl start docker +sudo systemctl enable docker + +# Clear corrupted Docker data +sudo systemctl stop docker +sudo rm -rf /var/lib/docker/tmp/* +sudo systemctl start docker + +# Reset Docker configuration +sudo mv /etc/docker/daemon.json /etc/docker/daemon.json.bak 2>/dev/null || true +sudo systemctl restart docker +``` + +### Docker Permission and Access Issues + +#### Permission Denied Errors +**Symptoms**: +- Must use sudo for Docker commands +- "Permission denied" when accessing Docker socket +- User not in docker group + +**Diagnosis**: +```bash +# Check user groups +groups +groups cal +getent group docker + +# Check Docker socket permissions +ls -la /var/run/docker.sock + +# Verify Docker service is running +systemctl status docker +``` + +**Solutions**: +```bash +# Add user to docker group +sudo usermod -aG docker cal + +# Create docker group if missing +sudo groupadd docker 2>/dev/null || true +sudo usermod -aG docker cal + +# Apply group membership (requires logout/login or): +newgrp docker + +# Fix socket permissions +sudo chown root:docker /var/run/docker.sock +sudo chmod 664 /var/run/docker.sock +``` + 
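+ +After the group fix takes effect (fresh login shell or `newgrp docker`), a quick verification (a sketch): +```bash +# Confirm membership and socket access without sudo +id -nG | grep -qw docker && echo "docker group: OK" +docker run --rm hello-world +```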
+## Network Configuration Problems + +### IP Address and Connectivity Issues + +#### Incorrect IP Configuration +**Symptoms**: +- VM has wrong IP address +- No network connectivity +- Cannot reach default gateway + +**Diagnosis**: +```bash +# Check network configuration +ip addr show +ip route show +cat /etc/netplan/*.yaml + +# Test connectivity +ping $(ip route | grep default | awk '{print $3}')  # Gateway +ping 8.8.8.8  # External connectivity +``` + +**Solutions**: +```bash +# Fix netplan configuration +sudo tee /etc/netplan/00-installer-config.yaml << 'EOF' +network: +  version: 2 +  ethernets: +    ens18: +      dhcp4: false +      addresses: [10.10.0.200/24] +      gateway4: 10.10.0.1 +      nameservers: +        addresses: [10.10.0.16, 8.8.8.8] +EOF + +# Apply network configuration +sudo netplan apply +``` + +#### DNS Resolution Problems +**Symptoms**: +- Cannot resolve domain names +- Package downloads fail +- Host lookup failures + +**Diagnosis**: +```bash +# Check DNS configuration +cat /etc/resolv.conf +systemd-resolve --status + +# Test DNS resolution +nslookup google.com +dig google.com @8.8.8.8 +``` + +**Solutions**: +```bash +# Fix DNS in netplan (see above example) +sudo netplan apply + +# Temporary DNS fix +echo "nameserver 8.8.8.8" | sudo tee /etc/resolv.conf + +# Restart DNS services +sudo systemctl restart systemd-resolved +sudo systemctl restart networking +``` + +## System Maintenance Issues + +### Package Management Problems + +#### Update Failures +**Symptoms**: +- apt update fails +- Repository signature errors +- Dependency conflicts + +**Diagnosis**: +```bash +# Check repository status +sudo apt update +apt-cache policy + +# Check disk space +df -h / +df -h /var + +# Check for held packages +apt-mark showhold +``` + +**Solutions**: +```bash +# Fix broken packages +sudo apt --fix-broken install +sudo dpkg --configure -a + +# Clean package cache +sudo apt clean +sudo apt autoclean +sudo apt autoremove + +# Reset problematic repositories +sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-keys <key-id> +sudo apt update +``` + +### Storage and Disk Space Issues + +#### Disk Space Exhaustion +**Symptoms**: +- Cannot install packages +- Docker operations fail +- System becomes unresponsive + +**Diagnosis**: +```bash +# Check disk usage +df -h +du -sh /home/* /var/* /opt/* 2>/dev/null + +# Find large files +find / -size +100M 2>/dev/null | head -20 +``` + +**Solutions**: +```bash +# Clean system files +sudo apt clean +sudo apt autoremove +sudo journalctl --vacuum-time=7d + +# Clean Docker data +docker system prune -a -f +docker volume prune -f + +# Extend disk (Proxmox GUI: Hardware -> Resize) +# Then extend filesystem: +sudo growpart /dev/sda 1 +sudo resize2fs /dev/sda1 +``` + +## Emergency Recovery Procedures + +### SSH Access Recovery + +#### Complete SSH Lockout +**Recovery Steps**: +1. **Use Proxmox console** for direct VM access +2. **Reset SSH configuration**: +   ```bash +   # Via console +   sudo cp /etc/ssh/sshd_config.backup /etc/ssh/sshd_config 2>/dev/null || true +   sudo systemctl restart sshd +   ``` +3. **Re-enable emergency access**: +   ```bash +   # Temporary password access for recovery +   sudo passwd cal +   sudo sed -i 's/PasswordAuthentication no/PasswordAuthentication yes/' /etc/ssh/sshd_config +   sudo systemctl restart sshd +   ```
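+ +Once key-based access works again, the temporary change above should be reverted (a sketch mirroring the hardening drop-in earlier in this guide): +```bash +# Re-disable password authentication and validate before restarting +sudo sed -i 's/PasswordAuthentication yes/PasswordAuthentication no/' /etc/ssh/sshd_config +sudo sshd -t && sudo systemctl restart sshd +```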
+ +#### Emergency SSH Key Deployment +**If primary keys fail**: +```bash +# Use emergency key +ssh -i ~/.ssh/emergency_homelab_rsa cal@<vm-ip> + +# Or deploy keys via console +mkdir -p ~/.ssh +chmod 700 ~/.ssh +cat > ~/.ssh/authorized_keys << 'EOF' +ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABgQC... # primary key +ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABgQD... # emergency key +EOF +chmod 600 ~/.ssh/authorized_keys +``` + +### VM Recovery and Rebuild + +#### Corrupt VM Recovery +**Steps**: +1. **Create VM snapshot** before attempting recovery +2. **Export VM data**: +   ```bash +   # Backup important data +   rsync -av cal@<vm-ip>:/home/cal/ ./vm-backup/ +   ``` +3. **Restore from template**: +   ```bash +   # Delete corrupt VM +   pvesh delete /nodes/pve/qemu/<vmid> + +   # Clone from template +   pvesh create /nodes/pve/qemu/<template-id>/clone -newid <new-vmid> -name <vm-name> +   ``` + +#### Post-Install Script Recovery +**If automation fails**: +```bash +# Run in debug mode +bash -x ./scripts/vm-management/vm-post-install.sh <vm-ip> + +# Manual step execution +ssh cal@<vm-ip> 'sudo apt update && sudo apt upgrade -y' +ssh cal@<vm-ip> 'curl -fsSL https://get.docker.com | sh' +ssh cal@<vm-ip> 'sudo usermod -aG docker cal' +``` + +## Prevention and Monitoring + +### Pre-Deployment Validation +```bash +# Verify prerequisites +ls -la ~/.ssh/homelab_rsa* +ls -la ~/.ssh/emergency_homelab_rsa* +ping 10.10.0.1 + +# Test cloud-init YAML +python3 -c "import yaml; yaml.safe_load(open('cloud-init-user-data.yaml'))" +``` + +### Health Monitoring Script +```bash +#!/bin/bash +# vm-health-check.sh +VM_IPS="10.10.0.200 10.10.0.201 10.10.0.202" + +for ip in $VM_IPS; do +  if ssh -o ConnectTimeout=5 -o BatchMode=yes cal@$ip 'uptime' >/dev/null 2>&1; then +    echo "βœ… $ip: SSH OK" +    # Check Docker +    if ssh cal@$ip 'docker info >/dev/null 2>&1'; then +      echo "βœ… $ip: Docker OK" +    else +      echo "❌ $ip: Docker FAILED" +    fi +  else +    echo "❌ $ip: SSH FAILED" +  fi +done +``` + +### Automated Backup +```bash +# Schedule in crontab: 0 2 * * * /path/to/vm-backup.sh +#!/bin/bash +for vm_ip in 10.10.0.{200..210}; do +  if ping -c1 $vm_ip >/dev/null 2>&1; then +    rsync -av --exclude='.cache' cal@$vm_ip:/home/cal/ ./backups/$vm_ip/ +  fi +done +``` + +## Quick Reference Commands + +### Essential VM Management +```bash +# VM control via Proxmox +pvesh get /nodes/pve/qemu/<vmid>/status/current +pvesh create /nodes/pve/qemu/<vmid>/status/start +pvesh create /nodes/pve/qemu/<vmid>/status/stop + +# SSH with alternative keys +ssh -i ~/.ssh/emergency_homelab_rsa cal@<vm-ip> + +# System health checks +free -h && df -h && systemctl status docker +docker system info && docker system df +``` + +### Recovery Resources +- **SSH Keys Backup**: `/mnt/NV2/ssh-keys/backup-*/` +- **Proxmox Console**: Direct VM access when SSH fails +- **Emergency Contact**: Use Discord notifications for critical issues + +This troubleshooting guide covers comprehensive recovery procedures for VM management issues in home lab environments. \ No newline at end of file