From 10c9e0d854a41748ce64f6eb1c0ac04fe2347233 Mon Sep 17 00:00:00 2001 From: Cal Corum Date: Tue, 12 Aug 2025 23:20:15 -0500 Subject: [PATCH] CLAUDE: Migrate to technology-first documentation architecture MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Complete restructure from patterns/examples/reference to technology-focused directories: β€’ Created technology-specific directories with comprehensive documentation: - /tdarr/ - Transcoding automation with gaming-aware scheduling - /docker/ - Container management with GPU acceleration patterns - /vm-management/ - Virtual machine automation and cloud-init - /networking/ - SSH infrastructure, reverse proxy, and security - /monitoring/ - System health checks and Discord notifications - /databases/ - Database patterns and troubleshooting - /development/ - Programming language patterns (bash, nodejs, python, vuejs) β€’ Enhanced CLAUDE.md with intelligent context loading: - Technology-first loading rules for automatic context provision - Troubleshooting keyword triggers for emergency scenarios - Documentation maintenance protocols with automated reminders - Context window management for optimal documentation updates β€’ Preserved valuable content from .claude/tmp/: - SSH security improvements and server inventory - Tdarr CIFS troubleshooting and Docker iptables solutions - Operational scripts with proper technology classification β€’ Benefits achieved: - Self-contained technology directories with complete context - Automatic loading of relevant documentation based on keywords - Emergency-ready troubleshooting with comprehensive guides - Scalable structure for future technology additions - Eliminated context bloat through targeted loading πŸ€– Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude --- .claude/settings.json | 7 + .gitignore | 3 +- CLAUDE.md | 315 +++-- .../README.md => databases/CONTEXT.md | 0 databases/troubleshooting.md | 316 +++++ .../README.md => development/bash-CONTEXT.md | 0 .../bash-troubleshooting.md | 0 .../python => development}/debugging.md | 0 .../nodejs-CONTEXT.md | 0 .../python-CONTEXT.md | 0 .../service-management.md | 0 .../README.md => development/vuejs-CONTEXT.md | 0 .../python => development}/web-frameworks.md | 0 docker/CONTEXT.md | 331 +++++ .../examples}/crash-analysis-summary.md | 0 .../examples}/distributed-transcoding.md | 0 ...docker-iptables-troubleshooting-session.md | 262 ++++ .../examples}/gpu-acceleration.md | 0 .../examples}/multi-stage-builds.md | 0 .../examples}/nvidia-gpu-troubleshooting.md | 0 .../examples}/nvidia-troubleshooting.md | 0 .../examples}/tdarr-container-fixes.md | 0 .../tdarr-monitoring-configuration.md | 0 .../examples}/tdarr-node-configurations.md | 0 .../tdarr-node-local/docker-compose-cpu.yml | 0 .../tdarr-node-local/docker-compose-gpu.yml | 0 .../start-tdarr-mapped-node.sh | 0 .../examples}/tdarr-server-setup/README.md | 0 .../tdarr-server-setup/docker-compose.yml | 0 .../examples}/tdarr-troubleshooting.md | 0 .../examples}/troubleshooting.md | 0 docker/troubleshooting.md | 466 +++++++ legacy/old-scripts-README.md | 172 +++ monitoring/CONTEXT.md | 142 ++ monitoring/examples/cron-job-management.md | 326 +++++ .../scripts}/README.md | 0 .../scripts}/setup-discord-monitoring.md | 0 .../scripts}/tdarr-timeout-monitor.sh | 0 monitoring/scripts/tdarr_monitor.py | 1234 +++++++++++++++++ .../scripts}/windows-desktop/README.md | 0 .../windows-reboot-monitor.ps1 | 0 .../windows-reboot-task-shutdown.xml | 0 
.../windows-reboot-task-startup.xml | 0 .../windows-setup-instructions.md | 0 monitoring/troubleshooting.md | 414 ++++++ networking/CONTEXT.md | 309 +++++ .../examples}/cifs-mount-resilience-fixes.md | 0 .../examples}/nas-mount-configuration.md | 0 .../network-filesystem-limitations.md | 0 .../examples}/nginx-config.md | 0 networking/examples/security_improvements.md | 99 ++ networking/examples/server_inventory.yaml | 70 + .../examples}/ssh-homelab-setup.md | 0 .../examples}/ssh-key-management.md | 0 .../examples}/ssh-troubleshooting.md | 0 .../examples}/troubleshooting.md | 0 networking/scripts/ssh_key_maintenance.sh | 114 ++ networking/troubleshooting.md | 496 +++++++ patterns/docker/README.md | 26 - patterns/networking/README.md | 32 - patterns/vm-management/README.md | 66 - scripts/monitoring/tdarr_monitor.py | 498 ------- scripts/tdarr-manager | 6 - tdarr/CONTEXT.md | 152 ++ .../tdarr-cifs-troubleshooting-2025-08-11.md | 143 ++ tdarr/examples/tdarr-node-configurations.md | 183 +++ .../tdarr-node-local/docker-compose-cpu.yml | 28 + .../tdarr-node-local/docker-compose-gpu.yml | 45 + .../start-tdarr-mapped-node.sh | 83 ++ tdarr/examples/tdarr-server-setup/README.md | 69 + .../tdarr-server-setup/docker-compose.yml | 37 + tdarr/scripts/CONTEXT.md | 212 +++ {scripts/tdarr => tdarr/scripts}/README.md | 0 .../scripts}/start-tdarr-gpu-podman-clean.sh | 0 .../scripts}/stop-tdarr-gpu-podman.sh | 0 .../scripts}/tdarr-cron-check-configurable.sh | 0 .../scripts}/tdarr-schedule-manager.sh | 0 .../scripts}/tdarr-schedule.conf | 0 tdarr/troubleshooting.md | 272 ++++ vm-management/CONTEXT.md | 296 ++++ .../examples}/proxmox-automation.md | 0 .../examples}/troubleshooting.md | 0 .../scripts}/README.md | 0 .../scripts}/cloud-init-user-data.yaml | 0 .../scripts}/vm-post-install.sh | 0 vm-management/troubleshooting.md | 652 +++++++++ 86 files changed, 7123 insertions(+), 753 deletions(-) create mode 100644 .claude/settings.json rename patterns/databases/README.md => databases/CONTEXT.md (100%) create mode 100644 databases/troubleshooting.md rename patterns/bash/README.md => development/bash-CONTEXT.md (100%) rename reference/bash/troubleshooting.md => development/bash-troubleshooting.md (100%) rename {reference/python => development}/debugging.md (100%) rename patterns/nodejs/README.md => development/nodejs-CONTEXT.md (100%) rename patterns/python/README.md => development/python-CONTEXT.md (100%) rename {examples/bash => development}/service-management.md (100%) rename patterns/vuejs/README.md => development/vuejs-CONTEXT.md (100%) rename {examples/python => development}/web-frameworks.md (100%) create mode 100644 docker/CONTEXT.md rename {reference/docker => docker/examples}/crash-analysis-summary.md (100%) rename {patterns/docker => docker/examples}/distributed-transcoding.md (100%) create mode 100644 docker/examples/docker-iptables-troubleshooting-session.md rename {patterns/docker => docker/examples}/gpu-acceleration.md (100%) rename {examples/docker => docker/examples}/multi-stage-builds.md (100%) rename {reference/docker => docker/examples}/nvidia-gpu-troubleshooting.md (100%) rename {reference/docker => docker/examples}/nvidia-troubleshooting.md (100%) rename {reference/docker => docker/examples}/tdarr-container-fixes.md (100%) rename {reference/docker => docker/examples}/tdarr-monitoring-configuration.md (100%) rename {examples/docker => docker/examples}/tdarr-node-configurations.md (100%) rename {examples/docker => docker/examples}/tdarr-node-local/docker-compose-cpu.yml (100%) rename 
{examples/docker => docker/examples}/tdarr-node-local/docker-compose-gpu.yml (100%) rename {examples/docker => docker/examples}/tdarr-node-local/start-tdarr-mapped-node.sh (100%) rename {examples/docker => docker/examples}/tdarr-server-setup/README.md (100%) rename {examples/docker => docker/examples}/tdarr-server-setup/docker-compose.yml (100%) rename {reference/docker => docker/examples}/tdarr-troubleshooting.md (100%) rename {reference/docker => docker/examples}/troubleshooting.md (100%) create mode 100644 docker/troubleshooting.md create mode 100644 legacy/old-scripts-README.md create mode 100644 monitoring/CONTEXT.md create mode 100644 monitoring/examples/cron-job-management.md rename {scripts/monitoring => monitoring/scripts}/README.md (100%) rename {scripts/monitoring => monitoring/scripts}/setup-discord-monitoring.md (100%) rename {scripts/monitoring => monitoring/scripts}/tdarr-timeout-monitor.sh (100%) create mode 100755 monitoring/scripts/tdarr_monitor.py rename {scripts/monitoring => monitoring/scripts}/windows-desktop/README.md (100%) rename {scripts/monitoring => monitoring/scripts}/windows-desktop/windows-reboot-monitor.ps1 (100%) rename {scripts/monitoring => monitoring/scripts}/windows-desktop/windows-reboot-task-shutdown.xml (100%) rename {scripts/monitoring => monitoring/scripts}/windows-desktop/windows-reboot-task-startup.xml (100%) rename {scripts/monitoring => monitoring/scripts}/windows-desktop/windows-setup-instructions.md (100%) create mode 100644 monitoring/troubleshooting.md create mode 100644 networking/CONTEXT.md rename {reference/networking => networking/examples}/cifs-mount-resilience-fixes.md (100%) rename {reference/networking => networking/examples}/nas-mount-configuration.md (100%) rename {reference/storage => networking/examples}/network-filesystem-limitations.md (100%) rename {examples/networking => networking/examples}/nginx-config.md (100%) create mode 100644 networking/examples/security_improvements.md create mode 100644 networking/examples/server_inventory.yaml rename {examples/networking => networking/examples}/ssh-homelab-setup.md (100%) rename {patterns/networking => networking/examples}/ssh-key-management.md (100%) rename {reference/networking => networking/examples}/ssh-troubleshooting.md (100%) rename {reference/networking => networking/examples}/troubleshooting.md (100%) create mode 100755 networking/scripts/ssh_key_maintenance.sh create mode 100644 networking/troubleshooting.md delete mode 100644 patterns/docker/README.md delete mode 100644 patterns/networking/README.md delete mode 100644 patterns/vm-management/README.md delete mode 100755 scripts/monitoring/tdarr_monitor.py delete mode 100755 scripts/tdarr-manager create mode 100644 tdarr/CONTEXT.md create mode 100644 tdarr/examples/tdarr-cifs-troubleshooting-2025-08-11.md create mode 100644 tdarr/examples/tdarr-node-configurations.md create mode 100644 tdarr/examples/tdarr-node-local/docker-compose-cpu.yml create mode 100644 tdarr/examples/tdarr-node-local/docker-compose-gpu.yml create mode 100755 tdarr/examples/tdarr-node-local/start-tdarr-mapped-node.sh create mode 100644 tdarr/examples/tdarr-server-setup/README.md create mode 100644 tdarr/examples/tdarr-server-setup/docker-compose.yml create mode 100644 tdarr/scripts/CONTEXT.md rename {scripts/tdarr => tdarr/scripts}/README.md (100%) rename {scripts/tdarr => tdarr/scripts}/start-tdarr-gpu-podman-clean.sh (100%) rename {scripts/tdarr => tdarr/scripts}/stop-tdarr-gpu-podman.sh (100%) rename {scripts/tdarr => 
tdarr/scripts}/tdarr-cron-check-configurable.sh (100%) rename {scripts/tdarr => tdarr/scripts}/tdarr-schedule-manager.sh (100%) rename {scripts/tdarr => tdarr/scripts}/tdarr-schedule.conf (100%) create mode 100644 tdarr/troubleshooting.md create mode 100644 vm-management/CONTEXT.md rename {examples/vm-management => vm-management/examples}/proxmox-automation.md (100%) rename {reference/vm-management => vm-management/examples}/troubleshooting.md (100%) rename {scripts/vm-management => vm-management/scripts}/README.md (100%) rename {scripts/vm-management => vm-management/scripts}/cloud-init-user-data.yaml (100%) rename {scripts/vm-management => vm-management/scripts}/vm-post-install.sh (100%) create mode 100644 vm-management/troubleshooting.md diff --git a/.claude/settings.json b/.claude/settings.json new file mode 100644 index 0000000..5701dc9 --- /dev/null +++ b/.claude/settings.json @@ -0,0 +1,7 @@ +{ + "notifications_disabled": true, + "allowed_working_directories": [ + "/mnt/NV2/Development/claude-home", + "/mnt/media" + ] +} \ No newline at end of file diff --git a/.gitignore b/.gitignore index dd64d9e..b5b53c6 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ .claude/tmp/ -tmp/ \ No newline at end of file +tmp/ +__pycache__ \ No newline at end of file diff --git a/CLAUDE.md b/CLAUDE.md index a9f52a2..e71912b 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -8,130 +8,98 @@ - If creating a temporary file will help achieve your goal, please create the file in the .claude/tmp/ directory and clean up when you're done. - Prefer editing an existing file to creating a new one. - Following a complex task or series of tasks, prompt the user to save any key learnings from the session. +- **Documentation Maintenance Reminder**: At the end of coding sessions, proactively ask: "Should I update our documentation to reflect the changes we made today?" Focus on CONTEXT.md files, troubleshooting guides, and any new patterns discovered. +- **Context Window Management**: When approaching 25% context window remaining, prioritize documentation updates before auto-summarization occurs. Ask: "We're approaching context limits - should I update our documentation now to capture today's work before we lose context?" 
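+
+A minimal self-check sketch for the structure described below (directory names are assumptions based on this repository's current layout):
+
+```bash
+#!/bin/bash
+# Verify each technology directory carries the documentation the loading rules expect
+for tech in tdarr docker vm-management networking monitoring databases; do
+    for doc in CONTEXT.md troubleshooting.md; do
+        [ -f "$tech/$doc" ] || echo "MISSING: $tech/$doc"
+    done
+    # Script directories should document themselves as well
+    if [ -d "$tech/scripts" ] && [ ! -f "$tech/scripts/CONTEXT.md" ]; then
+        echo "MISSING: $tech/scripts/CONTEXT.md"
+    fi
+done
+```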
## Automatic Context Loading Rules -### File Extension Triggers -When working with files, automatically load relevant documentation: +### Technology-First Loading Rules +When working with specific technologies, automatically load their dedicated context: -**Python (.py, .pyx, .pyi)** -- Load: `patterns/python/` -- Load: `reference/python/` -- If Django/Flask detected: Load `examples/python/web-frameworks.md` -- If requests/httpx detected: Load `examples/python/api-clients.md` +**Tdarr Keywords** +- "tdarr", "transcode", "ffmpeg", "gpu transcoding", "nvenc", "scheduler", "api" + - Load: `tdarr/CONTEXT.md` (technology overview and patterns) + - Load: `tdarr/troubleshooting.md` (error handling and debugging) + - If working in `/tdarr/scripts/`: Load `tdarr/scripts/CONTEXT.md` (script-specific documentation) + - Note: Gaming-aware scheduling system with configurable time windows available + - Note: Comprehensive API monitoring available via `tdarr_monitor.py` with dataclass-based status tracking -**JavaScript/Node.js (.js, .mjs, .ts)** -- Load: `patterns/nodejs/` -- Load: `reference/nodejs/` -- If package.json exists: Load `examples/nodejs/package-management.md` +**Docker Keywords** +- "docker", "container", "image", "compose", "kubernetes", "k8s", "dockerfile", "podman" + - Load: `docker/CONTEXT.md` (technology overview and patterns) + - Load: `docker/troubleshooting.md` (error handling and debugging) + - If working in `/docker/scripts/`: Load `docker/scripts/CONTEXT.md` (script-specific documentation) -**Vue.js (.vue, vite.config.*, nuxt.config.*)** -- Load: `patterns/vuejs/` -- Load: `reference/vuejs/` -- Load: `examples/vuejs/component-patterns.md` +**VM Management Keywords** +- "virtual machine", "vm", "proxmox", "kvm", "hypervisor", "guest", "virtualization" + - Load: `vm-management/CONTEXT.md` (technology overview and patterns) + - Load: `vm-management/troubleshooting.md` (error handling and debugging) + - If working in `/vm-management/scripts/`: Load `vm-management/scripts/CONTEXT.md` (script-specific documentation) -**Shell Scripts (.sh, .bash, .zsh)** -- Load: `patterns/bash/` -- Load: `reference/bash/` -- If systemd mentioned: Load `examples/bash/service-management.md` +**Networking Keywords** +- "network", "nginx", "proxy", "load balancer", "dns", "port", "firewall", "ssh", "ssl", "tls" + - Load: `networking/CONTEXT.md` (technology overview and patterns) + - Load: `networking/troubleshooting.md` (error handling and debugging) + - If working in `/networking/scripts/`: Load `networking/scripts/CONTEXT.md` (script-specific documentation) -**Docker (Dockerfile, docker-compose.yml, .dockerignore)** -- Load: `patterns/docker/` -- Load: `reference/docker/` -- Load: `examples/docker/multi-stage-builds.md` +**Monitoring Keywords** +- "monitoring", "alert", "notification", "discord", "health check", "status", "uptime", "windows reboot", "system monitor" + - Load: `monitoring/CONTEXT.md` (technology overview and patterns) + - Load: `monitoring/troubleshooting.md` (error handling and debugging) + - If working in `/monitoring/scripts/`: Load `monitoring/scripts/CONTEXT.md` (script-specific documentation) + - Note: Windows desktop monitoring with Discord notifications available + - Note: Comprehensive Tdarr API monitoring with dataclass-based status tracking ### Directory Context Triggers When working in specific directories: -**Docker-related directories (/docker/, /containers/, /compose/)** -- Load: `patterns/docker/` -- Load: `examples/docker/` -- Load: `reference/docker/troubleshooting.md` 
+**Technology directories (/tdarr/, /docker/, /vm-management/, /networking/, /monitoring/)** +- Load: `{technology}/CONTEXT.md` (technology overview) +- Load: `{technology}/troubleshooting.md` (debugging info) -**Database directories (/db/, /database/, /mysql/, /postgres/, /mongo/)** -- Load: `patterns/databases/` -- Load: `examples/databases/` -- Load: `reference/databases/` - -**Network directories (/network/, /networking/, /nginx/, /traefik/)** -- Load: `patterns/networking/` -- Load: `examples/networking/` -- Load: `reference/networking/troubleshooting.md` - -**VM directories (/vm/, /virtual/, /proxmox/, /kvm/)** -- Load: `patterns/vm-management/` -- Load: `examples/vm-management/` -- Load: `reference/vm-management/` - -**Scripts directory (/scripts/, /scripts/*/)** -- Load: `patterns/` (relevant to script type) -- Load: `reference/` (relevant troubleshooting guides) -- Load: `scripts/*/README.md` (subsystem-specific documentation) +**Script subdirectories (/tdarr/scripts/, /docker/scripts/, etc.)** +- Load: `{technology}/CONTEXT.md` (parent technology context) +- Load: `{technology}/scripts/CONTEXT.md` (script-specific context) +- Load: `{technology}/troubleshooting.md` (debugging info) - Context: Active operational scripts - treat as production code -- Note: Windows desktop monitoring system available in `scripts/monitoring/windows-desktop/` -### Keyword Triggers -When user mentions specific terms, automatically load relevant docs: +**Legacy directories (for backward compatibility)** +- `/scripts/tdarr/` β†’ Load Tdarr context files +- `/scripts/monitoring/` β†’ Load Monitoring context files +- `/patterns/`, `/examples/`, `/reference/` β†’ Load as before until migration complete -**Troubleshooting Keywords** -- "debug", "error", "fail", "broken", "not working", "issue" - - Load: `reference/{relevant-tech}/troubleshooting.md` - - Load: `examples/{relevant-tech}/debugging.md` +### File Extension Triggers +For programming languages, load general development context: -**Configuration Keywords** -- "config", "configure", "setup", "install", "deploy" - - Load: `patterns/{relevant-tech}/` - - Load: `examples/{relevant-tech}/configuration.md` +**Python (.py, .pyx, .pyi)** +- Load: `development/python-CONTEXT.md` (Python patterns and best practices) +- If Django/Flask detected: Load `development/web-frameworks-CONTEXT.md` +- If requests/httpx detected: Load `development/api-clients-CONTEXT.md` -**Performance Keywords** -- "slow", "performance", "optimize", "memory", "cpu" - - Load: `reference/{relevant-tech}/performance.md` - - Load: `examples/{relevant-tech}/optimization.md` +**JavaScript/Node.js (.js, .mjs, .ts)** +- Load: `development/nodejs-CONTEXT.md` (Node.js patterns and best practices) +- If package.json exists: Load `development/package-management-CONTEXT.md` -**Security Keywords** -- "secure", "ssl", "tls", "certificate", "auth", "firewall" - - Load: `patterns/networking/security.md` - - Load: `reference/networking/security.md` +**Shell Scripts (.sh, .bash, .zsh)** +- Load: `development/bash-CONTEXT.md` (Bash scripting patterns) +- If systemd mentioned: Load `development/service-management-CONTEXT.md` -**Database Keywords** -- "database", "db", "sql", "mysql", "postgres", "mongo", "redis" - - Load: `patterns/databases/` - - Load: `examples/databases/` +### Troubleshooting Keywords +For troubleshooting scenarios, always load both context and troubleshooting files: -**Container Keywords** -- "docker", "container", "image", "compose", "kubernetes", "k8s" - - Load: `patterns/docker/` - 
- Load: `examples/docker/` +**General Troubleshooting Keywords** +- "shutdown", "stop", "emergency", "reset", "recovery", "crash", "broken", "not working", "error", "issue", "problem", "debug", "troubleshoot", "fix" + - If Tdarr context detected: Load `tdarr/CONTEXT.md` AND `tdarr/troubleshooting.md` + - If Docker context detected: Load `docker/CONTEXT.md` AND `docker/troubleshooting.md` + - If VM context detected: Load `vm-management/CONTEXT.md` AND `vm-management/troubleshooting.md` + - If Network context detected: Load `networking/CONTEXT.md` AND `networking/troubleshooting.md` + - If Monitoring context detected: Load `monitoring/CONTEXT.md` AND `monitoring/troubleshooting.md` -**Network Keywords** -- "network", "nginx", "proxy", "load balancer", "dns", "port", "firewall" - - Load: `patterns/networking/` - - Load: `examples/networking/` - -**SSH Keywords** -- "ssh", "key", "authentication", "authorized_keys", "ssh-copy-id" - - Load: `patterns/networking/ssh-key-management.md` - - Load: `examples/networking/ssh-homelab-setup.md` - - Load: `reference/networking/ssh-troubleshooting.md` - -**VM Keywords** -- "virtual machine", "vm", "proxmox", "kvm", "hypervisor", "guest" - - Load: `patterns/vm-management/` - - Load: `examples/vm-management/` - -**Tdarr Keywords** -- "tdarr", "transcode", "ffmpeg", "gpu transcoding", "nvenc", "forEach error", "gaming detection", "scheduler", "monitoring", "api" - - Load: `reference/docker/tdarr-troubleshooting.md` - - Load: `patterns/docker/distributed-transcoding.md` - - Load: `scripts/tdarr/README.md` (for automation and scheduling) - - Load: `scripts/monitoring/README.md` (for monitoring and health checks) - - Note: Gaming-aware scheduling system with configurable time windows available - - Note: Comprehensive API monitoring available via `tdarr_monitor.py` with dataclass-based status tracking - -**Windows Monitoring Keywords** -- "windows reboot", "discord notification", "system monitor", "windows desktop", "power outage", "windows update" - - Load: `scripts/monitoring/windows-desktop/README.md` - - Note: Complete Windows desktop monitoring with Discord notifications for reboots and system events +**Specific Tdarr Troubleshooting Keywords** +- "forEach error", "staging timeout", "gaming detection", "plugin error", "container stop", "node disconnect", "cache cleanup", "shutdown tdarr", "stop tdarr", "emergency tdarr", "reset tdarr" + - Load: `tdarr/CONTEXT.md` (technology overview) + - Load: `tdarr/troubleshooting.md` (specific solutions including Emergency Recovery section) + - If working in `/tdarr/scripts/`: Load `tdarr/scripts/CONTEXT.md` ### Priority Rules 1. **File extension triggers** take highest priority @@ -141,33 +109,132 @@ When user mentions specific terms, automatically load relevant docs: 5. 
Always prefer specific over general (e.g., `vuejs/` over `nodejs/`) ### Context Loading Behavior -- Load pattern files first for overview -- Load relevant examples for implementation details -- Load reference files for troubleshooting and edge cases -- Maximum of 3 documentation files per trigger to maintain efficiency -- If context becomes too large, prioritize most recent/specific files +- **Technology context first**: Load CONTEXT.md for overview and patterns +- **Troubleshooting context**: ALWAYS load troubleshooting.md for error scenarios and emergency procedures +- **Script-specific context**: Load scripts/CONTEXT.md when working in script directories +- **Examples last**: Load examples for implementation details +- **Critical rule**: For any troubleshooting scenario, load BOTH context and troubleshooting files to ensure complete information +- Maximum of 3-4 documentation files per trigger to maintain efficiency while ensuring comprehensive coverage ## Documentation Structure ``` -/patterns/ # Technology overviews and best practices -/examples/ # Complete working implementations -/reference/ # Troubleshooting, cheat sheets, fallback info -/scripts/ # Active scripts and utilities for home lab operations - β”œβ”€β”€ tdarr/ # Tdarr automation with gaming-aware scheduling - β”œβ”€β”€ monitoring/ # System monitoring and alerting - β”‚ β”œβ”€β”€ tdarr_monitor.py # Comprehensive Tdarr API monitoring with dataclasses - β”‚ └── windows-desktop/ # Windows reboot monitoring with Discord notifications - └── / # Other organized automation subsystems -``` +/tdarr/ # Tdarr transcoding automation + β”œβ”€β”€ CONTEXT.md # Technology overview, patterns, best practices + β”œβ”€β”€ troubleshooting.md # Error handling and debugging + β”œβ”€β”€ examples/ # Working configurations and templates + └── scripts/ # Active automation scripts + β”œβ”€β”€ CONTEXT.md # Script-specific documentation + β”œβ”€β”€ monitoring.py # Comprehensive API monitoring with dataclasses + └── scheduler.py # Gaming-aware scheduling system -Each pattern file should reference relevant examples and reference materials. 
+/docker/ # Container orchestration and management + β”œβ”€β”€ CONTEXT.md # Technology overview, patterns, best practices + β”œβ”€β”€ troubleshooting.md # Error handling and debugging + β”œβ”€β”€ examples/ # Working configurations and templates + └── scripts/ # Active automation scripts + └── CONTEXT.md # Script-specific documentation + +/vm-management/ # Virtual machine operations + β”œβ”€β”€ CONTEXT.md # Technology overview, patterns, best practices + β”œβ”€β”€ troubleshooting.md # Error handling and debugging + β”œβ”€β”€ examples/ # Working configurations and templates + └── scripts/ # Active automation scripts + └── CONTEXT.md # Script-specific documentation + +/networking/ # Network configuration and SSH management + β”œβ”€β”€ CONTEXT.md # Technology overview, patterns, best practices + β”œβ”€β”€ troubleshooting.md # Error handling and debugging + β”œβ”€β”€ examples/ # Working configurations and templates + └── scripts/ # Active automation scripts + └── CONTEXT.md # Script-specific documentation + +/monitoring/ # System monitoring and alerting + β”œβ”€β”€ CONTEXT.md # Technology overview, patterns, best practices + β”œβ”€β”€ troubleshooting.md # Error handling and debugging + β”œβ”€β”€ examples/ # Working configurations and templates + └── scripts/ # Active automation scripts + β”œβ”€β”€ CONTEXT.md # Script-specific documentation + └── windows-desktop/ # Windows reboot monitoring with Discord notifications + +/development/ # Programming language patterns and tools + β”œβ”€β”€ python-CONTEXT.md # Python development patterns + β”œβ”€β”€ nodejs-CONTEXT.md # Node.js development patterns + └── bash-CONTEXT.md # Shell scripting patterns + +/legacy/ # Backward compatibility during migration + β”œβ”€β”€ patterns/ # Old patterns structure (temporary) + β”œβ”€β”€ examples/ # Old examples structure (temporary) + └── reference/ # Old reference structure (temporary) +``` ### Directory Usage Guidelines -- `/scripts/` - Contains actively used scripts for home lab management and operations - - Organized by subsystem (e.g., `tdarr/`, `networking/`, `vm-management/`) - - Each subsystem includes its own README.md with complete documentation -- `/examples/` - Contains example configurations and template scripts for reference -- `/patterns/` - Best practices and architectural guidance -- `/reference/` - Troubleshooting guides and technical references +- Each technology directory is self-contained with its own context, troubleshooting, examples, and scripts +- `CONTEXT.md` files provide technology overview, patterns, and best practices for Claude +- `troubleshooting.md` files contain error handling and debugging information +- `/scripts/` subdirectories contain active operational code with their own `CONTEXT.md` +- `/examples/` subdirectories contain template configurations and reference implementations +- `/development/` contains general programming language patterns that apply across technologies +- `/legacy/` provides backward compatibility during the migration from the old structure + +## Documentation Maintenance Protocol + +### Automated Maintenance Triggers +Claude Code should automatically prompt for documentation updates when: + +1. **New Technology Integration**: When working with a technology that doesn't have a dedicated directory + - Prompt: "I notice we're working with [technology] but don't have a dedicated `/[technology]/` directory. Should I create the technology-first structure with CONTEXT.md and troubleshooting.md files?" + +2. 
**New Error Patterns Discovered**: When encountering and solving new issues + - Prompt: "We just resolved a [technology] issue that isn't documented. Should I add this solution to `[technology]/troubleshooting.md`?" + +3. **New Scripts or Operational Procedures**: When creating new automation or workflows + - Prompt: "I created new scripts/procedures for [technology]. Should I update `[technology]/scripts/CONTEXT.md` and add any new operational patterns?" + +4. **Session End with Significant Changes**: When completing complex tasks + - Prompt: "We made significant changes to [technology] systems. Should I update our documentation to reflect the new patterns, configurations, or troubleshooting procedures we discovered?" + +### Documentation Update Checklist +When "update our documentation" is requested, systematically check: + +**Technology-Specific Updates**: +- [ ] Update `[technology]/CONTEXT.md` with new patterns or architectural changes +- [ ] Add new troubleshooting scenarios to `[technology]/troubleshooting.md` +- [ ] Update `[technology]/scripts/CONTEXT.md` for new operational procedures +- [ ] Add working examples to `[technology]/examples/` if new configurations were created + +**Cross-Technology Updates**: +- [ ] Update main CLAUDE.md loading rules if new keywords or triggers are needed +- [ ] Add new technology directories to the Documentation Structure section +- [ ] Update Directory Usage Guidelines if new organizational patterns emerge + +**Legacy Cleanup**: +- [ ] Check if any old patterns/examples/reference files can be migrated to technology directories +- [ ] Update or remove outdated information that conflicts with new approaches + +### Self-Maintenance Features + +**Loading Rule Validation**: Periodically verify that: +- All technology directories have corresponding keyword triggers +- Troubleshooting keywords include all common error scenarios +- File paths in loading rules match actual directory structure + +**Documentation Completeness Check**: Each technology directory should have: +- `CONTEXT.md` (overview, patterns, best practices) +- `troubleshooting.md` (error scenarios, emergency procedures) +- `examples/` (working configurations) +- `scripts/CONTEXT.md` (if operational scripts exist) + +**Keyword Coverage Analysis**: Ensure loading rules cover: +- Technology names and common aliases +- Error types and troubleshooting scenarios +- Operational keywords (start, stop, configure, monitor) +- Emergency keywords (shutdown, reset, recovery) + +### Warning Triggers +Claude Code should warn when: +- Working extensively with a technology that lacks dedicated documentation structure +- Solving problems that aren't covered in existing troubleshooting guides +- Creating scripts or procedures without corresponding CONTEXT.md documentation +- Encountering loading rules that reference non-existent files diff --git a/patterns/databases/README.md b/databases/CONTEXT.md similarity index 100% rename from patterns/databases/README.md rename to databases/CONTEXT.md diff --git a/databases/troubleshooting.md b/databases/troubleshooting.md new file mode 100644 index 0000000..dd5734f --- /dev/null +++ b/databases/troubleshooting.md @@ -0,0 +1,316 @@ +# Database Troubleshooting Guide + +## Connection Issues + +### Cannot Connect to Database +**Symptoms**: Connection refused, timeout errors, authentication failures +**Diagnosis**: +```bash +# Test basic connectivity +telnet db-server 3306 # MySQL +telnet db-server 5432 # PostgreSQL +nc -zv db-server 6379 # Redis + +# Check database 
service status
+systemctl status mysql
+systemctl status postgresql
+systemctl status redis-server
+```
+
+**Solutions**:
+```bash
+# Restart database services
+sudo systemctl restart mysql
+sudo systemctl restart postgresql
+
+# Check configuration files
+sudo nano /etc/mysql/mysql.conf.d/mysqld.cnf
+sudo nano /etc/postgresql/*/main/postgresql.conf
+
+# Verify port bindings
+ss -tlnp | grep :3306  # MySQL
+ss -tlnp | grep :5432  # PostgreSQL
+```
+
+## Performance Issues
+
+### Slow Query Performance
+**Symptoms**: Long-running queries, high CPU usage, timeouts
+**Diagnosis**:
+```sql
+-- MySQL
+SHOW PROCESSLIST;
+SHOW ENGINE INNODB STATUS;
+EXPLAIN SELECT * FROM table WHERE condition;
+
+-- PostgreSQL
+SELECT * FROM pg_stat_activity;
+EXPLAIN ANALYZE SELECT * FROM table WHERE condition;
+```
+
+**Solutions**:
+```sql
+-- Add missing indexes
+CREATE INDEX idx_column ON table(column);
+
+-- Analyze table statistics
+ANALYZE TABLE table_name;  -- MySQL
+ANALYZE table_name;        -- PostgreSQL
+
+-- Optimize queries
+-- Use LIMIT for large result sets
+-- Add WHERE clauses to filter results
+-- Use appropriate JOIN types
+```
+
+### Memory and Resource Issues
+**Symptoms**: Out of memory errors, swap usage, slow performance
+**Diagnosis**:
+```bash
+# Check memory usage
+free -h
+ps aux | grep mysql
+ps aux | grep postgres
+
+# Database-specific memory usage
+mysqladmin -u root -p status
+sudo -u postgres psql -c "SELECT * FROM pg_stat_database;"
+```
+
+**Solutions**:
+```bash
+# Adjust database memory settings
+# MySQL - /etc/mysql/mysql.conf.d/mysqld.cnf
+innodb_buffer_pool_size = 2G
+key_buffer_size = 256M
+
+# PostgreSQL - /etc/postgresql/*/main/postgresql.conf
+shared_buffers = 256MB
+effective_cache_size = 2GB
+work_mem = 4MB
+```
+
+## Data Integrity Issues
+
+### Corruption Detection and Recovery
+**Symptoms**: Table corruption errors, data inconsistencies
+**Diagnosis**:
+```bash
+# MySQL
+mysql -u root -p -e "CHECK TABLE table_name;"
+mysqlcheck -u root -p --all-databases
+
+# PostgreSQL - check for corruption in logs
+tail -f /var/log/postgresql/postgresql-*.log
+```
+
+**Solutions**:
+```bash
+# MySQL table repair
+mysql -u root -p -e "REPAIR TABLE table_name;"
+mysqlcheck -u root -p --auto-repair database_name
+
+# PostgreSQL consistency check - run VACUUM and REINDEX
+psql -U postgres -d database_name -c "VACUUM FULL table_name;"
+psql -U postgres -d database_name -c "REINDEX TABLE table_name;"
+```
+
+## Backup and Recovery Issues
+
+### Backup Failures
+**Symptoms**: Backup scripts failing, incomplete backups
+**Diagnosis**:
+```bash
+# Check backup script logs
+tail -f /var/log/backup.log
+
+# Test backup commands manually
+mysqldump -u root -p database_name > test_backup.sql
+pg_dump -U postgres database_name > test_backup.sql
+
+# Check disk space
+df -h /backup/location/
+```
+
+**Solutions**:
+```bash
+# Fix backup script permissions
+chmod +x /path/to/backup-script.sh
+chown backup-user:backup-group /backup/location/
+
+# Automated backup script example
+#!/bin/bash
+BACKUP_DIR="/backups/mysql"
+DATE=$(date +%Y%m%d_%H%M%S)
+
+mysqldump -u root -p"$MYSQL_PASSWORD" --all-databases > \
+  "$BACKUP_DIR/full_backup_$DATE.sql"
+
+# Compress and rotate backups
+gzip "$BACKUP_DIR/full_backup_$DATE.sql"
+find "$BACKUP_DIR" -name "*.gz" -mtime +7 -delete
+```
+
+## Authentication and Security Issues
+
+### Access Denied Errors
+**Symptoms**: Authentication failures, permission errors
+**Diagnosis**:
+```sql
+-- MySQL
+SELECT user, host FROM mysql.user;
+SHOW GRANTS FOR 'username'@'host';
+
+-- PostgreSQL (psql meta-commands)
+\du  -- List users
+\l   -- List databases
+```
+
+**Solutions**:
+```sql
+-- MySQL user management
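+-- Tip: where possible, prefer least-privilege grants (e.g. SELECT, INSERT,
+-- UPDATE on a specific schema) over ALL PRIVILEGES; the examples below use
+-- ALL for brevity.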
+CREATE USER 'newuser'@'localhost' IDENTIFIED BY 'password';
+GRANT ALL PRIVILEGES ON database.* TO 'newuser'@'localhost';
+FLUSH PRIVILEGES;
+
+-- PostgreSQL user management
+CREATE USER newuser WITH PASSWORD 'password';
+GRANT ALL PRIVILEGES ON DATABASE database_name TO newuser;
+```
+
+## Replication Issues
+
+### Master-Slave Replication Problems
+**Symptoms**: Replication lag, sync errors, slave disconnection
+**Diagnosis**:
+```sql
+-- MySQL Master
+SHOW MASTER STATUS;
+
+-- MySQL Slave
+SHOW SLAVE STATUS\G
+
+-- Check replication lag via the Seconds_Behind_Master field
+-- in the SHOW SLAVE STATUS\G output
+```
+
+**Solutions**:
+```sql
+-- Reset replication
+STOP SLAVE;
+RESET SLAVE;
+CHANGE MASTER TO MASTER_LOG_FILE='mysql-bin.000001', MASTER_LOG_POS=4;
+START SLAVE;
+
+-- Fix replication errors
+SET GLOBAL sql_slave_skip_counter = 1;
+START SLAVE;
+```
+
+## Storage and Disk Issues
+
+### Disk Space Problems
+**Symptoms**: Out of disk space errors, database growth
+**Diagnosis**:
+```bash
+# Check database sizes
+du -sh /var/lib/mysql/*
+du -sh /var/lib/postgresql/*/main/*
+
+# Find large tables (run via the mysql client)
+mysql -u root -p -e "SELECT table_schema, table_name,
+    ROUND((data_length + index_length) / 1024 / 1024, 2) AS 'Size (MB)'
+  FROM information_schema.tables
+  ORDER BY (data_length + index_length) DESC;"
+```
+
+**Solutions**:
+```sql
+-- Clean up large tables
+DELETE FROM log_table WHERE created_date < DATE_SUB(NOW(), INTERVAL 30 DAY);
+OPTIMIZE TABLE log_table;
+
+-- Enable log rotation for MySQL binary logs
+SET GLOBAL expire_logs_days = 7;
+PURGE BINARY LOGS BEFORE DATE(NOW() - INTERVAL 7 DAY);
+```
+
+## Emergency Recovery
+
+### Database Won't Start
+**Recovery Steps**:
+```bash
+# Check error logs
+tail -f /var/log/mysql/error.log
+tail -f /var/log/postgresql/postgresql-*.log
+
+# Try safe mode start
+sudo mysqld_safe --skip-grant-tables &
+
+# Recovery from backup
+mysql -u root -p < backup_file.sql
+psql -U postgres database_name < backup_file.sql
+```
+
+### Complete Data Loss Recovery
+**Recovery Procedure**:
+```bash
+# Stop database service
+sudo systemctl stop mysql
+
+# Restore from backup
+cd /var/lib/mysql
+sudo rm -rf *
+sudo tar -xzf /backups/mysql_full_backup.tar.gz
+
+# Fix permissions
+sudo chown -R mysql:mysql /var/lib/mysql
+sudo chmod 755 /var/lib/mysql
+
+# Start database
+sudo systemctl start mysql
+```
+
+## Monitoring and Prevention
+
+### Database Health Monitoring
+```bash
+#!/bin/bash
+# db-health-check.sh
+
+# Check if database is responding
+if ! mysqladmin -u root -p"$MYSQL_PASSWORD" ping >/dev/null 2>&1; then
+    echo "ALERT: MySQL not responding" | send_alert
+fi
+
+# Check disk space
+DISK_USAGE=$(df /var/lib/mysql | awk 'NR==2 {print $5}' | sed 's/%//')
+if [ "$DISK_USAGE" -gt 80 ]; then
+    echo "ALERT: Database disk usage at ${DISK_USAGE}%" | send_alert
+fi
+
+# Check for long-running queries
+LONG_QUERIES=$(mysql -u root -p"$MYSQL_PASSWORD" -e "SHOW PROCESSLIST" | grep -c "Query.*[0-9][0-9][0-9]")
+if [ "$LONG_QUERIES" -gt 5 ]; then
+    echo "ALERT: $LONG_QUERIES long-running queries detected" | send_alert
+fi
+```
+
+### Automated Maintenance
+```bash
+#!/bin/bash
+# Daily maintenance script
+
+# Optimize tables
+mysqlcheck -u root -p"$MYSQL_PASSWORD" --auto-repair --optimize --all-databases
+
+# Update table statistics
+mysql -u root -p"$MYSQL_PASSWORD" -e "FLUSH TABLES; ANALYZE TABLE table_name;"
+
+# Backup rotation
+find /backups -name "*.sql.gz" -mtime +30 -delete
+```
+
+This troubleshooting guide provides systematic approaches to resolving common database issues in home lab environments.
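+
+A sketch of how these scripts might be scheduled (the paths and times here are assumptions, not part of the scripts above):
+
+```bash
+# /etc/cron.d/db-maintenance - hourly health check, nightly maintenance
+0 * * * *   root  /usr/local/bin/db-health-check.sh
+30 3 * * *  root  /usr/local/bin/db-daily-maintenance.sh
+```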
\ No newline at end of file diff --git a/patterns/bash/README.md b/development/bash-CONTEXT.md similarity index 100% rename from patterns/bash/README.md rename to development/bash-CONTEXT.md diff --git a/reference/bash/troubleshooting.md b/development/bash-troubleshooting.md similarity index 100% rename from reference/bash/troubleshooting.md rename to development/bash-troubleshooting.md diff --git a/reference/python/debugging.md b/development/debugging.md similarity index 100% rename from reference/python/debugging.md rename to development/debugging.md diff --git a/patterns/nodejs/README.md b/development/nodejs-CONTEXT.md similarity index 100% rename from patterns/nodejs/README.md rename to development/nodejs-CONTEXT.md diff --git a/patterns/python/README.md b/development/python-CONTEXT.md similarity index 100% rename from patterns/python/README.md rename to development/python-CONTEXT.md diff --git a/examples/bash/service-management.md b/development/service-management.md similarity index 100% rename from examples/bash/service-management.md rename to development/service-management.md diff --git a/patterns/vuejs/README.md b/development/vuejs-CONTEXT.md similarity index 100% rename from patterns/vuejs/README.md rename to development/vuejs-CONTEXT.md diff --git a/examples/python/web-frameworks.md b/development/web-frameworks.md similarity index 100% rename from examples/python/web-frameworks.md rename to development/web-frameworks.md diff --git a/docker/CONTEXT.md b/docker/CONTEXT.md new file mode 100644 index 0000000..3c44222 --- /dev/null +++ b/docker/CONTEXT.md @@ -0,0 +1,331 @@ +# Docker Container Technology - Technology Context + +## Overview +Docker containerization for home lab environments with focus on performance optimization, GPU acceleration, and distributed workloads. This context covers container architecture patterns, security practices, and production deployment strategies. + +## Architecture Patterns + +### Container Design Principles +1. **Single Responsibility**: One service per container +2. **Immutable Infrastructure**: Treat containers as replaceable units +3. **Resource Isolation**: Use container limits and cgroups +4. **Security First**: Run as non-root, minimal attack surface +5. **Configuration Management**: Environment variables and external configs + +### Multi-Stage Build Pattern +**Purpose**: Minimize production image size and attack surface +```dockerfile +# Build stage +FROM node:18 AS builder +WORKDIR /app +COPY package*.json ./ +RUN npm ci --only=production + +# Production stage +FROM node:18-alpine AS production +WORKDIR /app +COPY --from=builder /app/node_modules ./node_modules +COPY . . +USER 1000 +EXPOSE 3000 +CMD ["node", "server.js"] +``` + +### Distributed Application Architecture +**Pattern**: Server-Node separation with specialized workloads + +``` +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ Control Plane β”‚ β”‚ Worker Nodes β”‚ +β”‚ β”‚ β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ +β”‚ - Web Interface│◄──►│ β”‚ Node 1 β”‚ β”‚ Node 2 β”‚ ... 
β”‚ +β”‚ - Job Queue β”‚ β”‚ β”‚ GPU+CPU β”‚ β”‚ GPU+CPU β”‚ β”‚ +β”‚ - Coordination β”‚ β”‚ β”‚Local SSDβ”‚ β”‚Local SSDβ”‚ β”‚ +β”‚ β”‚ β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ β”‚ + └──────── Shared Storage β”€β”€β”€β”€β”€β”€β”˜ + (NAS/SAN for persistence) +``` + +## Container Runtime Platforms + +### Docker vs Podman Comparison +**Docker**: Traditional daemon-based approach +- Requires Docker daemon running as root +- Centralized container management +- Established ecosystem and tooling + +**Podman** (Recommended for GPU workloads): +- Daemonless architecture +- Better GPU integration with NVIDIA +- Rootless containers for enhanced security +- Direct systemd integration + +### GPU Acceleration Support +**NVIDIA Container Toolkit Integration**: +```bash +# Podman GPU configuration (recommended) +podman run -d --name gpu-workload \ + --device nvidia.com/gpu=all \ + -e NVIDIA_DRIVER_CAPABILITIES=all \ + -e NVIDIA_VISIBLE_DEVICES=all \ + myapp:latest + +# Docker GPU configuration +docker run -d --name gpu-workload \ + --gpus all \ + -e NVIDIA_DRIVER_CAPABILITIES=all \ + myapp:latest +``` + +## Performance Optimization Patterns + +### Hybrid Storage Strategy +**Pattern**: Balance performance and persistence for different data types + +```yaml +volumes: + # Local storage (SSD/NVMe) - High Performance + - ./app/data:/app/data # Database - frequent I/O + - ./app/configs:/app/configs # Config - startup performance + - ./app/logs:/app/logs # Logs - continuous writing + - ./cache:/cache # Work directories - temp processing + + # Network storage (NAS) - Persistence & Backup + - /mnt/nas/backups:/app/backups # Backups - infrequent access + - /mnt/nas/media:/media:ro # Source data - read-only +``` + +**Benefits**: +- **Local Operations**: 100x faster database performance vs network +- **Network Reliability**: Critical data protected on redundant storage +- **Cost Optimization**: Expensive fast storage only where needed + +### Cache Optimization Hierarchy +```bash +# Performance tiers for different workload types +/dev/shm/cache/ # RAM disk - fastest, volatile, limited size +/mnt/nvme/cache/ # NVMe SSD - 3-7GB/s, persistent, recommended +/mnt/ssd/cache/ # SATA SSD - 500MB/s, good balance +/mnt/nas/cache/ # Network - 100MB/s, legacy compatibility +``` + +### Resource Management +**Container Limits** (prevent resource exhaustion): +```yaml +deploy: + resources: + limits: + memory: 8G + cpus: '6' + reservations: + memory: 4G + cpus: '2' +``` + +**Networking Optimization**: +```yaml +# Host networking for performance-critical applications +network_mode: host + +# Bridge networking with port mapping (default) +network_mode: bridge +ports: + - "8080:8080" +``` + +## Security Patterns + +### Container Hardening +```dockerfile +# Use minimal base images +FROM alpine:3.18 + +# Run as non-root user +RUN addgroup -g 1000 appuser && \ + adduser -u 1000 -G appuser -s /bin/sh -D appuser +USER 1000 + +# Set secure permissions +COPY --chown=appuser:appuser . /app +``` + +### Environment Security +```bash +# Secrets management (avoid environment variables for secrets) +podman secret create db_password password.txt +podman run --secret db_password myapp:latest + +# Network isolation +podman network create --driver bridge isolated-net +podman run --network isolated-net myapp:latest +``` + +### Image Security +1. 
**Vulnerability Scanning**: Regular image scans with tools like Trivy +2. **Version Pinning**: Use specific tags, avoid `latest` +3. **Minimal Images**: Distroless or Alpine base images +4. **Layer Optimization**: Minimize layers, combine RUN commands + +## Development Workflows + +### Local Development Pattern +```yaml +# docker-compose.dev.yml +version: "3.8" +services: + app: + build: . + volumes: + - .:/app # Code hot-reload + - /app/node_modules # Preserve dependencies + environment: + - NODE_ENV=development + ports: + - "3000:3000" +``` + +### Production Deployment Pattern +```bash +# Production container with health checks +podman run -d --name production-app \ + --restart unless-stopped \ + --health-cmd="curl -f http://localhost:3000/health || exit 1" \ + --health-interval=30s \ + --health-timeout=10s \ + --health-retries=3 \ + -p 3000:3000 \ + myapp:v1.2.3 +``` + +## Monitoring and Observability + +### Health Check Implementation +```dockerfile +# Application health check +HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \ + CMD curl -f http://localhost:3000/health || exit 1 +``` + +### Log Management +```bash +# Structured logging with log rotation +podman run -d --name app \ + --log-driver journald \ + --log-opt max-size=10m \ + --log-opt max-file=3 \ + myapp:latest + +# Centralized logging +podman logs -f app | logger -t myapp +``` + +### Resource Monitoring +```bash +# Real-time container metrics +podman stats --no-stream app + +# Historical resource usage +podman exec app cat /sys/fs/cgroup/memory/memory.usage_in_bytes +``` + +## Common Implementation Patterns + +### Database Containers +```yaml +# Persistent database with backup strategy +services: + postgres: + image: postgres:15-alpine + environment: + POSTGRES_DB: myapp + POSTGRES_USER: appuser + POSTGRES_PASSWORD_FILE: /run/secrets/db_password + volumes: + - postgres_data:/var/lib/postgresql/data # Persistent data + - ./backups:/backups # Backup mount + secrets: + - db_password +``` + +### Web Application Containers +```yaml +# Multi-tier web application +services: + frontend: + image: nginx:alpine + volumes: + - ./nginx.conf:/etc/nginx/nginx.conf:ro + ports: + - "80:80" + - "443:443" + depends_on: + - backend + + backend: + build: ./api + environment: + - DATABASE_URL=postgresql://appuser@postgres/myapp + depends_on: + - postgres +``` + +### GPU-Accelerated Workloads +```bash +# GPU transcoding/processing container +podman run -d --name gpu-processor \ + --device nvidia.com/gpu=all \ + -e NVIDIA_DRIVER_CAPABILITIES=compute,video \ + -v "/fast-storage:/cache" \ + -v "/media:/input:ro" \ + -v "/output:/output" \ + gpu-app:latest +``` + +## Best Practices + +### Production Deployment +1. **Use specific image tags**: Never use `latest` in production +2. **Implement health checks**: Application and infrastructure monitoring +3. **Resource limits**: Prevent resource exhaustion +4. **Backup strategy**: Regular backups of persistent data +5. **Security scanning**: Regular vulnerability assessments + +### Development Guidelines +1. **Multi-stage builds**: Separate build and runtime environments +2. **Environment parity**: Keep dev/staging/prod similar +3. **Configuration externalization**: Use environment variables and secrets +4. **Dependency management**: Pin versions, use lock files +5. **Testing strategy**: Unit, integration, and container tests + +### Operational Excellence +1. **Log aggregation**: Centralized logging strategy +2. **Metrics collection**: Application and infrastructure metrics +3. 
**Alerting**: Proactive monitoring and alerting +4. **Documentation**: Container documentation and runbooks +5. **Disaster recovery**: Backup and recovery procedures + +## Migration Patterns + +### Legacy Application Containerization +1. **Assessment**: Identify dependencies and requirements +2. **Dockerfile creation**: Start with appropriate base image +3. **Configuration externalization**: Move configs to environment variables +4. **Data persistence**: Identify and volume mount data directories +5. **Testing**: Validate functionality in containerized environment + +### Platform Migration (Docker to Podman) +```bash +# Export Docker container configuration +docker inspect mycontainer > container-config.json + +# Convert to Podman run command +podman run -d --name mycontainer \ + --memory 4g \ + --cpus 2 \ + -v /host/path:/container/path \ + myimage:tag +``` + +This technology context provides comprehensive guidance for implementing Docker containerization strategies in home lab and production environments. \ No newline at end of file diff --git a/reference/docker/crash-analysis-summary.md b/docker/examples/crash-analysis-summary.md similarity index 100% rename from reference/docker/crash-analysis-summary.md rename to docker/examples/crash-analysis-summary.md diff --git a/patterns/docker/distributed-transcoding.md b/docker/examples/distributed-transcoding.md similarity index 100% rename from patterns/docker/distributed-transcoding.md rename to docker/examples/distributed-transcoding.md diff --git a/docker/examples/docker-iptables-troubleshooting-session.md b/docker/examples/docker-iptables-troubleshooting-session.md new file mode 100644 index 0000000..3b8e541 --- /dev/null +++ b/docker/examples/docker-iptables-troubleshooting-session.md @@ -0,0 +1,262 @@ +# Docker iptables/nftables Backend Troubleshooting Session + +## Session Context +- **Date**: August 8, 2025 +- **System**: Nobara PC (Fedora-based gaming distro) +- **User**: cal +- **Working Directory**: `/mnt/NV2/Development/claude-home` +- **Goal**: Get Docker working to run Tdarr Node container + +## System Information +```bash +# OS Details +uname -a +# Linux nobara-pc 6.15.5-200.nobara.fc42.x86_64 #1 SMP PREEMPT_DYNAMIC Sun Jul 6 11:56:20 UTC 2025 x86_64 GNU/Linux + +# Hardware +# AMD Ryzen 7 7800X3D 8-Core Processor +# 62GB RAM +# NVIDIA GeForce RTX 4080 SUPER + +# Distribution +# Nobara (Fedora 42-based) +``` + +## Problem Summary +Docker daemon fails to start with persistent error: +``` +failed to start daemon: Error initializing network controller: error obtaining controller instance: failed to register "bridge" driver: failed to create NAT chain DOCKER: COMMAND_FAILED: INVALID_IPV: 'ipv4' is not a valid backend or is unavailable +``` + +## Root Cause Analysis + +### Initial Discovery +1. **Missing iptables**: Docker couldn't find `iptables` command in PATH +2. **Backend conflict**: System using nftables but Docker expects iptables-legacy +3. 
**Package inconsistency**: `iptables-nft` package installed but binary missing initially + +### Key Findings +- `dnf list installed | grep -i iptables` initially returned nothing +- `firewalld` and `nftables` services were both inactive +- `iptables-nft` package was installed but `/usr/bin/iptables` didn't exist +- After reinstall, iptables worked but used nftables backend +- NAT table incompatible: `iptables v1.8.11 (nf_tables): table 'nat' is incompatible, use 'nft' tool.` + +## Troubleshooting Steps Performed + +### Step 1: Package Investigation +```bash +# Check installed iptables packages +dnf list installed | grep -i iptables +# Result: No matching packages (surprising!) + +# Check service status +systemctl status nftables # inactive (dead) +firewall-cmd --get-backend-type # firewalld not running + +# Check if iptables binary exists +which iptables # not found +/usr/bin/iptables --version # No such file or directory +``` + +### Step 2: Package Reinstallation +```bash +# Reinstall iptables-nft package +sudo dnf reinstall -y iptables-nft + +# Verify installation +rpm -ql iptables-nft | grep bin +# Shows /usr/bin/iptables should exist + +# Test after reinstall +iptables --version +# Result: iptables v1.8.11 (nf_tables) - SUCCESS! +``` + +### Step 3: Backend Compatibility Testing +```bash +# Test NAT table access +sudo iptables -t nat -L +# Error: iptables v1.8.11 (nf_tables): table `nat' is incompatible, use 'nft' tool. +``` + +### Step 4: Legacy Backend Installation +```bash +# Install iptables-legacy +sudo dnf install -y iptables-legacy iptables-legacy-libs + +# Set up alternatives system +sudo alternatives --install /usr/bin/iptables iptables /usr/bin/iptables-legacy 10 +sudo alternatives --install /usr/bin/ip6tables ip6tables /usr/bin/ip6tables-legacy 10 + +# Test NAT table with legacy backend +sudo iptables -t nat -L +# SUCCESS: Shows empty NAT chains +``` + +### Step 5: Docker Restart Attempts +```bash +# Remove NVIDIA daemon.json config (potential conflict) +sudo rm -f /etc/docker/daemon.json + +# Load NAT kernel module explicitly +sudo modprobe iptable_nat + +# Try starting firewalld (in case Docker needs it) +sudo systemctl enable --now firewalld + +# Multiple restart attempts +sudo systemctl start docker +# ALL FAILED with same NAT chain error +``` + +## Current State +- βœ… iptables-legacy installed and configured +- βœ… NAT table accessible via `iptables -t nat -L` +- βœ… All required kernel modules should be available +- ❌ Docker still fails with NAT chain creation error +- ❌ Same error persists despite backend switch + +## Analysis of Persistent Issue + +### Potential Causes +1. **Kernel State Contamination**: nftables rules/chains may still be active in kernel memory +2. **Module Loading Order**: iptables vs nftables modules loaded in conflicting order +3. **Docker Caching**: Docker may be caching the old backend detection +4. **Firewall Integration**: Docker + firewalld interaction on Fedora/Nobara +5. **System-Level Backend Selection**: Some system-wide iptables backend lock + +### Evidence Supporting Kernel State Theory +- Error message is identical across all restart attempts +- iptables command works fine manually +- NAT table shows properly but Docker can't create chains +- Issue persists despite configuration changes + +## Next Session Action Plan + +### Immediate Steps After System Reboot +1. **Verify Backend Status**: + ```bash + iptables --version # Should show legacy + sudo iptables -t nat -L # Should show clean NAT table + ``` + +2. 
**Check Kernel Modules**:
   ```bash
   lsmod | grep -E "(iptable|nf_|ip_tables)"
   # modprobe -l was removed from modern kmod; list available modules instead
   find /lib/modules/$(uname -r) -name '*.ko*' | grep -E "(iptable|nf_table)"
   ```

3. **Test Docker Start**:
   ```bash
   sudo systemctl start docker
   docker --version
   ```

### If Issue Persists After Reboot

#### Alternative Approach 1: Docker Configuration Override
```bash
# Create daemon.json to disable iptables management
# (with "iptables": false, container NAT must be handled manually)
sudo mkdir -p /etc/docker
cat <<'EOF' | sudo tee /etc/docker/daemon.json
{
  "iptables": false
}
EOF

# Capture current firewall state for comparison
sudo iptables-save > /tmp/iptables-state.txt
sudo nft list ruleset > /tmp/nft-state.txt

# Docker troubleshooting
sudo dockerd --debug --log-level=debug > /tmp/docker-debug.log 2>&1 &
# Kill after 30 seconds and examine log

# System journal deep dive
journalctl -u docker.service --since="1 hour ago" -o verbose > /tmp/docker-journal.log
```

## Known Working Configuration Target

### Expected Working State
- **iptables**: Legacy backend active
- **Docker**: Running with NAT chain creation successful
- **Network**: Docker bridge network functional
- **Containers**: Can start and access network

### Tdarr Node Test Command
```bash
cd ~/docker/tdarr-node
# Update IP in compose file first:
# serverIP=<tdarr-server-ip>
docker-compose -f tdarr-node-basic.yml up -d
```

## Related Documentation Created
- `/patterns/docker/gpu-acceleration.md` - GPU troubleshooting patterns
- `/reference/docker/nvidia-troubleshooting.md` - NVIDIA container toolkit
- `/examples/docker/tdarr-node-local/` - Working configurations

## System Context Notes
- This is a gaming-focused Nobara distribution
- May have different default networking than standard Fedora
- NVIDIA drivers already working (nvidia-smi functional)
- System has been used for other Docker containers successfully in the past
- Recent NVIDIA container toolkit installation may have triggered the issue

## Success Criteria for Next Session
1. βœ… Docker service starts without errors
2. βœ… `docker ps` command works
3. βœ… Simple container can run: `docker run --rm hello-world`
4. βœ… Tdarr node container can start (even if it can't connect to the server yet)
5. βœ… Network connectivity from containers works

## Escalation Options
If standard troubleshooting fails:
1. **Nobara Community**: Check Nobara Discord/forums for similar issues
2. **Docker Desktop**: Use different Docker implementation
3. **Podman Migration**: Switch to podman as a Docker replacement
4. **System Reinstall**: Fresh OS install (nuclear option)
5.
**Container Alternatives**: LXC/systemd containers instead of Docker + +## Files to Check Next Session +- `/etc/docker/daemon.json` - Docker configuration +- `/var/log/docker.log` - Docker service logs +- `~/.docker/config.json` - User Docker config +- `/proc/sys/net/ipv4/ip_forward` - IP forwarding enabled +- `/etc/systemd/system/docker.service.d/` - Service overrides + +--- +*End of troubleshooting session log* \ No newline at end of file diff --git a/patterns/docker/gpu-acceleration.md b/docker/examples/gpu-acceleration.md similarity index 100% rename from patterns/docker/gpu-acceleration.md rename to docker/examples/gpu-acceleration.md diff --git a/examples/docker/multi-stage-builds.md b/docker/examples/multi-stage-builds.md similarity index 100% rename from examples/docker/multi-stage-builds.md rename to docker/examples/multi-stage-builds.md diff --git a/reference/docker/nvidia-gpu-troubleshooting.md b/docker/examples/nvidia-gpu-troubleshooting.md similarity index 100% rename from reference/docker/nvidia-gpu-troubleshooting.md rename to docker/examples/nvidia-gpu-troubleshooting.md diff --git a/reference/docker/nvidia-troubleshooting.md b/docker/examples/nvidia-troubleshooting.md similarity index 100% rename from reference/docker/nvidia-troubleshooting.md rename to docker/examples/nvidia-troubleshooting.md diff --git a/reference/docker/tdarr-container-fixes.md b/docker/examples/tdarr-container-fixes.md similarity index 100% rename from reference/docker/tdarr-container-fixes.md rename to docker/examples/tdarr-container-fixes.md diff --git a/reference/docker/tdarr-monitoring-configuration.md b/docker/examples/tdarr-monitoring-configuration.md similarity index 100% rename from reference/docker/tdarr-monitoring-configuration.md rename to docker/examples/tdarr-monitoring-configuration.md diff --git a/examples/docker/tdarr-node-configurations.md b/docker/examples/tdarr-node-configurations.md similarity index 100% rename from examples/docker/tdarr-node-configurations.md rename to docker/examples/tdarr-node-configurations.md diff --git a/examples/docker/tdarr-node-local/docker-compose-cpu.yml b/docker/examples/tdarr-node-local/docker-compose-cpu.yml similarity index 100% rename from examples/docker/tdarr-node-local/docker-compose-cpu.yml rename to docker/examples/tdarr-node-local/docker-compose-cpu.yml diff --git a/examples/docker/tdarr-node-local/docker-compose-gpu.yml b/docker/examples/tdarr-node-local/docker-compose-gpu.yml similarity index 100% rename from examples/docker/tdarr-node-local/docker-compose-gpu.yml rename to docker/examples/tdarr-node-local/docker-compose-gpu.yml diff --git a/examples/docker/tdarr-node-local/start-tdarr-mapped-node.sh b/docker/examples/tdarr-node-local/start-tdarr-mapped-node.sh similarity index 100% rename from examples/docker/tdarr-node-local/start-tdarr-mapped-node.sh rename to docker/examples/tdarr-node-local/start-tdarr-mapped-node.sh diff --git a/examples/docker/tdarr-server-setup/README.md b/docker/examples/tdarr-server-setup/README.md similarity index 100% rename from examples/docker/tdarr-server-setup/README.md rename to docker/examples/tdarr-server-setup/README.md diff --git a/examples/docker/tdarr-server-setup/docker-compose.yml b/docker/examples/tdarr-server-setup/docker-compose.yml similarity index 100% rename from examples/docker/tdarr-server-setup/docker-compose.yml rename to docker/examples/tdarr-server-setup/docker-compose.yml diff --git a/reference/docker/tdarr-troubleshooting.md b/docker/examples/tdarr-troubleshooting.md similarity index 
100% rename from reference/docker/tdarr-troubleshooting.md rename to docker/examples/tdarr-troubleshooting.md diff --git a/reference/docker/troubleshooting.md b/docker/examples/troubleshooting.md similarity index 100% rename from reference/docker/troubleshooting.md rename to docker/examples/troubleshooting.md diff --git a/docker/troubleshooting.md b/docker/troubleshooting.md new file mode 100644 index 0000000..4349f3d --- /dev/null +++ b/docker/troubleshooting.md @@ -0,0 +1,466 @@ +# Docker Container Troubleshooting Guide + +## Container Startup Issues + +### Container Won't Start +**Check container logs first**: +```bash +# Docker +docker logs <container_name> +docker logs --tail 50 -f <container_name> + +# Podman +podman logs <container_name> +podman logs --tail 50 -f <container_name> +``` + +### Common Startup Failures + +#### Port Conflicts +**Symptoms**: `bind: address already in use` error +**Solution**: +```bash +# Find conflicting process +sudo netstat -tulpn | grep <port> +docker ps | grep <port> + +# Change port mapping +docker run -p 8081:8080 myapp # Use different host port +``` + +#### Permission Errors +**Symptoms**: `permission denied` when accessing files/volumes +**Solutions**: +```bash +# Check file ownership +ls -la /host/volume/path + +# Fix ownership (match container user) +sudo chown -R 1000:1000 /host/volume/path + +# Use correct UID/GID in container +docker run -e PUID=1000 -e PGID=1000 myapp +``` + +#### Missing Environment Variables +**Symptoms**: Application fails with configuration errors +**Diagnostic**: +```bash +# Check container environment +docker exec -it <container_name> env +docker exec -it <container_name> printenv + +# Verify required variables are set +docker inspect <container_name> | grep -A 20 "Env" +``` + +#### Resource Constraints +**Symptoms**: Container killed or OOM errors +**Solutions**: +```bash +# Check resource usage +docker stats + +# Increase memory limit +docker run -m 4g myapp + +# Check system resources +free -h +df -h +``` + +### Debug Running Containers +```bash +# Access container shell +docker exec -it <container_name> /bin/bash +docker exec -it <container_name> /bin/sh # if bash not available + +# Check container processes +docker exec <container_name> ps aux + +# Check container filesystem +docker exec <container_name> ls -la /app +``` + +## Build Issues + +### Build Failures +**Clear build cache when encountering issues**: +```bash +# Docker +docker system prune -a +docker builder prune + +# Podman +podman system prune -a +podman image prune -a +``` + +### Verbose Build Output +```bash +# Docker +docker build --progress=plain --no-cache . + +# Podman +podman build --layers=false . +``` + +### Common Build Problems + +#### COPY/ADD Errors +**Issue**: Files not found during build +**Solutions**: +```dockerfile +# Check .dockerignore file +# Verify file paths relative to build context +COPY ./src /app/src # βœ… Correct +COPY /absolute/path /app # ❌ Wrong - no absolute paths +``` + +#### Package Installation Failures +**Issue**: apt/yum/dnf package installation fails +**Solutions**: +```dockerfile +# Update package lists first +RUN apt-get update && apt-get install -y package-name + +# Combine RUN commands to reduce layers +RUN apt-get update && \ + apt-get install -y package1 package2 && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* +``` + +#### Network Issues During Build +**Issue**: Cannot reach package repositories +**Solutions**: +```bash +# Use host networking to bypass bridge/DNS issues +docker build --network host . + +# Use custom DNS +docker build --dns 8.8.8.8 . 
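+ +# Persistent alternative (a sketch; the resolver addresses are assumptions): +# set daemon-wide DNS in /etc/docker/daemon.json so every build/container inherits it, e.g. +# {"dns": ["8.8.8.8", "1.1.1.1"]} +# then restart the daemon: +# sudo systemctl restart docker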
+``` + +## GPU Container Issues + +### NVIDIA GPU Support Problems + +#### Docker Desktop vs Podman on Fedora/Nobara +**Issue**: Docker Desktop has GPU compatibility issues on Fedora-based systems +**Symptoms**: +- `CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected` +- `unknown or invalid runtime name: nvidia` +- Device nodes exist but CUDA fails to initialize + +**Solution**: Use Podman instead of Docker on Fedora systems +```bash +# Verify host GPU works +nvidia-smi + +# Test with Podman (recommended) +podman run --rm --device nvidia.com/gpu=all ubuntu:20.04 nvidia-smi + +# Test with Docker (may fail on Fedora) +docker run --rm --gpus all ubuntu:20.04 nvidia-smi +``` + +#### GPU Container Configuration +**Working Podman GPU template**: +```bash +podman run -d --name gpu-container \ + --device nvidia.com/gpu=all \ + --restart unless-stopped \ + -e NVIDIA_DRIVER_CAPABILITIES=all \ + -e NVIDIA_VISIBLE_DEVICES=all \ + myapp:latest +``` + +**Working Docker GPU template**: +```bash +docker run -d --name gpu-container \ + --gpus all \ + --restart unless-stopped \ + -e NVIDIA_DRIVER_CAPABILITIES=all \ + -e NVIDIA_VISIBLE_DEVICES=all \ + myapp:latest +``` + +#### GPU Troubleshooting Steps +1. **Verify Host GPU Access**: + ```bash + nvidia-smi # Should show GPU info + lsmod | grep nvidia # Should show nvidia modules + ls -la /dev/nvidia* # Should show device files + ``` + +2. **Check NVIDIA Container Toolkit**: + ```bash + rpm -qa | grep nvidia-container-toolkit # Fedora/RHEL + dpkg -l | grep nvidia-container-toolkit # Ubuntu/Debian + nvidia-ctk --version + ``` + +3. **Test GPU in Container**: + ```bash + # Should show GPU information + podman exec gpu-container nvidia-smi + + # Test CUDA/NVML from Python (requires the nvidia-ml-py package in the image) + podman exec gpu-container python3 -c "import pynvml; pynvml.nvmlInit(); print(pynvml.nvmlDeviceGetCount())" + ``` + +#### Platform-Specific GPU Notes +**Fedora/Nobara/RHEL**: +- βœ… Podman: Works out-of-the-box with GPU support +- ❌ Docker Desktop: Known GPU integration issues +- Solution: Use Podman for GPU workloads + +**Ubuntu/Debian**: +- βœ… Docker: Generally works well with proper NVIDIA toolkit setup +- βœ… Podman: Also works well +- Solution: Either runtime typically works + +## Performance Issues + +### Resource Monitoring +**Real-time resource usage**: +```bash +# Overall container stats +docker stats +podman stats + +# Inside container analysis +docker exec <container_name> top +docker exec <container_name> free -h +docker exec <container_name> df -h + +# Network usage +docker exec <container_name> netstat -i +``` + +### Image Size Optimization +**Analyze image layers**: +```bash +# Check image sizes +docker images --format "table {{.Repository}}\t{{.Tag}}\t{{.Size}}" + +# Analyze layer history +docker history <image> + +# Find large files in container +docker exec <container_name> du -sh /* | sort -hr +``` + +**Optimization strategies**: +```dockerfile +# Use multi-stage builds +FROM node:18 AS builder +# ... build steps ... 
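+# (hypothetical build steps for a typical Node app -- adjust to your project:) +# WORKDIR /app +# COPY package*.json ./ +# RUN npm ci +# COPY . . +# RUN npm run build # emits /app/dist, consumed by the stage below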
+ +FROM node:18-alpine AS production +COPY --from=builder /app/dist /app +# Smaller final image + +# Combine RUN commands +RUN apt-get update && \ + apt-get install -y package && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +# Use .dockerignore +# .dockerignore +node_modules +.git +*.log +``` + +### Storage Performance Issues +**Slow volume performance**: +```bash +# Test volume I/O performance +docker exec <container_name> dd if=/dev/zero of=/volume/test bs=1M count=1000 + +# Check volume mount options +docker inspect <container_name> | grep -A 10 "Mounts" + +# Consider using tmpfs for temporary data +docker run --tmpfs /tmp myapp +``` + +## Network Debugging + +### Network Connectivity Issues +**Inspect network configuration**: +```bash +# List networks +docker network ls +podman network ls + +# Inspect specific network +docker network inspect <network_name> + +# Check container networking +docker exec <container_name> ip addr show +docker exec <container_name> ip route show +``` + +### Service Discovery Problems +**Test connectivity between containers**: +```bash +# Test by container name (same network) +docker exec container1 ping container2 + +# Test by IP address +docker exec container1 ping 172.17.0.3 + +# Check DNS resolution +docker exec container1 nslookup container2 +``` + +### Port Binding Issues +**Verify port mappings**: +```bash +# Check exposed ports +docker port <container_name> + +# Test external connectivity +curl localhost:8080 + +# Check if port is bound to all interfaces +netstat -tulpn | grep :8080 +``` + +## Emergency Recovery + +### Complete Container Reset +**Remove all containers and start fresh**: +```bash +# Stop all containers +docker stop $(docker ps -q) +podman stop --all + +# Remove all containers +docker container prune -f +podman container prune -f + +# Remove all images +docker image prune -a -f +podman image prune -a -f + +# Remove all volumes (CAUTION: data loss) +docker volume prune -f +podman volume prune -f + +# Complete system cleanup +docker system prune -a --volumes -f +podman system prune -a --volumes -f +``` + +### Container Recovery +**Recover from corrupted container**: +```bash +# Create backup of container data +docker cp <container_name>:/important/data ./backup/ + +# Export container filesystem +docker export <container_name> > container-backup.tar + +# Import and restart +docker import container-backup.tar new-image:latest +docker run -d --name new-container new-image:latest +``` + +### Data Recovery +**Recover data from volumes**: +```bash +# List volumes +docker volume ls + +# Inspect volume location +docker volume inspect <volume_name> + +# Access volume data directly +sudo ls -la /var/lib/docker/volumes/<volume_name>/_data + +# Mount volume to temporary container +docker run --rm -v <volume_name>:/data alpine ls -la /data +``` + +## Health Check Issues + +### Container Health Checks +**Implement health checks**: +```dockerfile +# Dockerfile health check +HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \ + CMD curl -f http://localhost:3000/health || exit 1 +``` + +**Debug health check failures**: +```bash +# Check health status +docker inspect <container_name> | grep -A 10 Health + +# Manual health check test +docker exec <container_name> curl -f http://localhost:3000/health + +# Check health check logs +docker events --filter container=<container_name> +``` + +## Log Analysis + +### Log Management +**View and manage container logs**: +```bash +# View recent logs +docker logs --tail 100 <container_name> + +# Follow logs in real-time +docker logs -f <container_name> + +# Logs with timestamps +docker logs -t <container_name> + +# Search logs for errors +docker logs <container_name> 2>&1 | grep ERROR +``` + +### Log Rotation Issues
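+ +**Daemon-wide defaults (optional)**: rotation can also be set once in `daemon.json` so every newly created container inherits it; a sketch (sizes are assumptions, and existing containers keep their old settings): +```bash +sudo tee /etc/docker/daemon.json <<'EOF' +{ + "log-driver": "json-file", + "log-opts": { "max-size": "10m", "max-file": "3" } +} +EOF +sudo systemctl restart docker +``` +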
+**Configure log rotation to prevent disk filling**: +```bash +# Run with log size limits +docker run --log-opt max-size=10m --log-opt max-file=3 myapp + +# Check log file sizes +sudo du -sh /var/lib/docker/containers/*/ +``` + +## Platform-Specific Issues + +### Fedora/Nobara/RHEL Systems +- **GPU Support**: Use Podman instead of Docker Desktop +- **SELinux**: May require volume relabeling (`:z`/`:Z` volume suffixes) +- **Firewall**: Configure firewalld for container networking + +### Ubuntu/Debian Systems +- **AppArmor**: May restrict container operations +- **Snap Docker**: May have permission issues vs native package + +### General Linux Issues +- **cgroups v2**: Some older containers need cgroups v1 +- **User namespaces**: May cause UID/GID mapping issues +- **systemd**: Integration differences between Docker/Podman + +## Prevention Best Practices + +1. **Resource Limits**: Always set memory and CPU limits +2. **Health Checks**: Implement application health monitoring +3. **Log Rotation**: Configure to prevent disk space issues +4. **Security Scanning**: Regular vulnerability scans +5. **Backup Strategy**: Regular data and configuration backups +6. **Testing**: Test containers in staging before production +7. **Documentation**: Document container configurations and dependencies + +This troubleshooting guide covers the most common Docker and Podman container issues encountered in home lab and production environments. \ No newline at end of file diff --git a/legacy/old-scripts-README.md b/legacy/old-scripts-README.md new file mode 100644 index 0000000..53f69d3 --- /dev/null +++ b/legacy/old-scripts-README.md @@ -0,0 +1,172 @@ +# Scripts Directory + +This directory contains operational scripts and utilities for home lab management and automation. + +## Directory Structure + +``` +scripts/ +β”œβ”€β”€ README.md # This documentation +β”œβ”€β”€ tdarr_monitor.py # Enhanced Tdarr monitoring with Discord alerts +β”œβ”€β”€ tdarr/ # Tdarr automation and scheduling +β”œβ”€β”€ monitoring/ # System monitoring and alerting +└── <subsystem>/ # Other organized automation subsystems +``` + +## Scripts Overview + +### `tdarr_monitor.py` - Enhanced Tdarr Monitoring + +**Description**: Comprehensive Tdarr monitoring script with stuck job detection and Discord notifications. 
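+ +The Discord side is a plain webhook POST; independent of the script, a minimal test message can be sent with curl (the webhook URL below is a placeholder): + +```bash +curl -X POST "https://discord.com/api/webhooks/<id>/<token>" \ + -H "Content-Type: application/json" \ + -d '{"content": "Tdarr monitor test message"}' +``` +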
+ +**Features**: +- πŸ“Š Complete Tdarr system monitoring (server, nodes, queue, libraries) +- 🧠 Short-term memory for stuck job detection +- 🚨 Discord notifications with rich embeds +- πŸ’Ύ Persistent state management +- βš™οΈ Configurable thresholds and alerts + +**Quick Start**: +```bash +# Basic monitoring +python3 scripts/tdarr_monitor.py --server http://10.10.0.43:8265 --check all + +# Enable stuck job detection with 15-minute threshold +python3 scripts/tdarr_monitor.py --server http://10.10.0.43:8265 \ + --check nodes --detect-stuck --stuck-threshold 15 + +# Full monitoring with Discord alerts (uses default webhook) +python3 scripts/tdarr_monitor.py --server http://10.10.0.43:8265 \ + --check all --detect-stuck --discord-alerts + +# Test Discord integration (uses default webhook) +python3 scripts/tdarr_monitor.py --server http://10.10.0.43:8265 --discord-test +``` + +**CLI Options**: +``` +--server Tdarr server URL (required) +--check Type of check: all, status, queue, nodes, libraries, stats, health +--timeout Request timeout in seconds (default: 30) +--output Output format: json, pretty (default: pretty) +--verbose Enable verbose logging +--detect-stuck Enable stuck job detection +--stuck-threshold Minutes before job considered stuck (default: 30) +--memory-file Path to memory state file (default: .claude/tmp/tdarr_memory.pkl) +--clear-memory Clear memory state and exit +--discord-webhook Discord webhook URL for notifications (default: configured) +--discord-alerts Enable Discord alerts for stuck jobs +--discord-test Send test Discord message and exit +``` + +**Memory Management**: +- **Persistent State**: Worker snapshots saved to `.claude/tmp/tdarr_memory.pkl` +- **Automatic Cleanup**: Removes tracking for disappeared workers +- **Error Recovery**: Graceful handling of corrupted memory files + +**Discord Features**: +- **Two Message Types**: Simple content messages and rich embeds +- **Stuck Job Alerts**: Detailed embed notifications with file info, progress, duration +- **System Status**: Health summaries with node details and color-coded status +- **Customizable**: Colors, fields, titles, descriptions fully configurable +- **Error Handling**: Graceful failures without breaking monitoring + +**Integration Examples**: + +*Cron Job for Regular Monitoring*: +```bash +# Check every 15 minutes, alert on stuck jobs over 30 minutes +*/15 * * * * cd /path/to/claude-home && python3 scripts/tdarr_monitor.py \ + --server http://10.10.0.43:8265 --check nodes --detect-stuck --discord-alerts +``` + +*Systemd Service + Timer* (a `Type=oneshot` service is driven by a separate `.timer` unit): +```ini +# tdarr-monitor.service +[Unit] +Description=Tdarr Monitor +After=network.target + +[Service] +Type=oneshot +ExecStart=/usr/bin/python3 /path/to/claude-home/scripts/tdarr_monitor.py \ + --server http://10.10.0.43:8265 --check all --detect-stuck --discord-alerts +WorkingDirectory=/path/to/claude-home +User=your-user + +# tdarr-monitor.timer +[Unit] +Description=Run Tdarr Monitor every 15 minutes + +[Timer] +OnCalendar=*:0/15 +Persistent=true + +[Install] +WantedBy=timers.target +``` + +**API Data Classes**: +The script uses strongly-typed dataclasses for all API responses: +- `ServerStatus` - Server health and version info +- `NodeStatus` - Node details with stuck job tracking +- `QueueStatus` - Transcoding queue statistics +- `LibraryStatus` - Library scan progress +- `StatisticsStatus` - Overall system statistics +- `HealthStatus` - Comprehensive health check results + +**Error Handling**: +- Network timeouts and connection errors +- API endpoint failures +- JSON parsing errors +- Discord webhook failures +- Memory state corruption +- Missing dependencies + 
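+ +*Avoiding Overlapping Runs* (a sketch, not part of the script; paths reuse the cron example above): +```bash +#!/usr/bin/env bash +# flock -n exits immediately if a previous run still holds the lock, +# so a slow API call cannot stack concurrent monitor instances under cron +exec flock -n /tmp/tdarr_monitor.lock \ + python3 /path/to/claude-home/scripts/tdarr_monitor.py \ + --server http://10.10.0.43:8265 --check nodes --detect-stuck --discord-alerts +``` +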
+ +**Dependencies**: +- `requests` - HTTP client for API calls +- `pickle` - State serialization (Python standard library) +- `requests` is the only external dependency; everything else is standard library + +--- + +## Development Guidelines + +### Adding New Scripts + +1. **Location**: Place scripts in appropriate subdirectories by function +2. **Documentation**: Include comprehensive docstrings and usage examples +3. **Error Handling**: Implement robust error handling and logging +4. **Configuration**: Use CLI arguments and/or config files for flexibility +5. **Testing**: Include test functionality where applicable + +### Naming Conventions + +- Use descriptive names: `tdarr_monitor.py` not `monitor.py` +- Use underscores for Python scripts: `system_health.py` +- Use hyphens for shell scripts: `backup-system.sh` + +### Directory Organization + +Create subdirectories for related functionality: +``` +scripts/ +β”œβ”€β”€ monitoring/ # System monitoring scripts +β”œβ”€β”€ backup/ # Backup and restore utilities +β”œβ”€β”€ network/ # Network management tools +β”œβ”€β”€ containers/ # Docker/Podman management +└── maintenance/ # System maintenance tasks +``` + +--- + +## Future Enhancements + +### Planned Features +- **Email Notifications**: SMTP integration for email alerts +- **Prometheus Metrics**: Export metrics for Grafana dashboards +- **Webhook Actions**: Trigger external actions on stuck jobs +- **Multi-Server Support**: Monitor multiple Tdarr instances +- **Configuration Files**: YAML/JSON config file support + +### Contributing +1. Follow existing code style and patterns +2. Add comprehensive documentation +3. Include error handling and logging +4. Test thoroughly before committing +5. Update this README with new scripts \ No newline at end of file diff --git a/monitoring/CONTEXT.md b/monitoring/CONTEXT.md new file mode 100644 index 0000000..64d0d53 --- /dev/null +++ b/monitoring/CONTEXT.md @@ -0,0 +1,142 @@ +# System Monitoring and Alerting - Technology Context + +## Overview +Comprehensive monitoring and alerting system for home lab infrastructure, focused on automated health checks, Discord notifications, and proactive system maintenance. 
+ +## Architecture Patterns + +### Distributed Monitoring Strategy +**Pattern**: Service-specific monitoring with centralized alerting +- **Tdarr Monitoring**: API-based transcoding health checks +- **Windows Desktop Monitoring**: Reboot detection and system events +- **Network Monitoring**: Connectivity and service availability +- **Container Monitoring**: Docker/Podman health and resource usage + +### Alert Management +**Pattern**: Structured notifications with actionable information +```bash +# Discord webhook integration +curl -X POST "$DISCORD_WEBHOOK" \ + -H "Content-Type: application/json" \ + -d '{ + "content": "**System Alert**\n```\nService: Tdarr\nIssue: Staging timeout\nAction: Automatic cleanup performed\n```\n<@user_id>" + }' +``` + +## Core Monitoring Components + +### Tdarr System Monitoring +**Purpose**: Monitor transcoding pipeline health and performance +**Location**: `scripts/tdarr_monitor.py` + +**Key Features**: +- API-based status monitoring with dataclass structures +- Staging section timeout detection and cleanup +- Discord notifications with professional formatting +- Log rotation and retention management + +### Windows Desktop Monitoring +**Purpose**: Track Windows system reboots and power events +**Location**: `scripts/windows-desktop/` + +**Components**: +- PowerShell monitoring script +- Scheduled task automation +- Discord notification integration +- System event correlation + +### Network and Service Monitoring +**Purpose**: Monitor critical infrastructure availability +**Implementation**: +```bash +# Service health check pattern +SERVICES="https://homelab.local http://nas.homelab.local" +for service in $SERVICES; do + if curl -sSf --max-time 10 "$service" >/dev/null 2>&1; then + echo "βœ… $service: Available" + else + echo "❌ $service: Failed" | send_alert + fi +done +``` + +## Automation Patterns + +### Cron-Based Scheduling +**Pattern**: Regular health checks with intelligent alerting +```bash +# Monitoring schedule examples +*/20 * * * * /path/to/tdarr-timeout-monitor.sh # Every 20 minutes +0 */6 * * * /path/to/cleanup-temp-dirs.sh # Every 6 hours +0 2 * * * /path/to/backup-monitor.sh # Daily at 2 AM +``` + +### Event-Driven Monitoring +**Pattern**: Reactive monitoring for critical events +- **System Startup**: Windows boot detection +- **Service Failures**: Container restart alerts +- **Resource Exhaustion**: Disk space warnings +- **Security Events**: Failed login attempts + +## Data Collection and Analysis + +### Log Management +**Pattern**: Centralized logging with rotation +```bash +# Log rotation configuration +LOG_FILE="/var/log/homelab-monitor.log" +MAX_SIZE="10M" +RETENTION_DAYS=30 + +# Rotate logs when size exceeded +if [ $(stat -c%s "$LOG_FILE") -gt $((10*1024*1024)) ]; then + mv "$LOG_FILE" "$LOG_FILE.$(date +%Y%m%d)" + touch "$LOG_FILE" +fi +``` + +### Metrics Collection +**Pattern**: Time-series data for trend analysis +- **System Metrics**: CPU, memory, disk usage +- **Service Metrics**: Response times, error rates +- **Application Metrics**: Transcoding progress, queue sizes +- **Network Metrics**: Bandwidth usage, latency + +## Alert Integration + +### Discord Notification System +**Pattern**: Rich, actionable notifications +```markdown +# Professional alert format +**πŸ”§ System Maintenance** +Service: Tdarr Transcoding +Issue: 3 files timed out in staging +Resolution: Automatic cleanup completed +Status: System operational + +Manual review recommended <@user_id> +``` + +### Alert Escalation +**Pattern**: Tiered alerting based on 
severity +1. **Info**: Routine maintenance completed +2. **Warning**: Service degradation detected +3. **Critical**: Service failure requiring immediate attention +4. **Emergency**: System-wide failure requiring manual intervention + +## Best Practices Implementation + +### Monitoring Strategy +1. **Proactive**: Monitor trends to predict issues +2. **Reactive**: Alert on current failures +3. **Preventive**: Automated cleanup and maintenance +4. **Comprehensive**: Cover all critical services +5. **Actionable**: Provide clear resolution paths + +### Performance Optimization +1. **Efficient Polling**: Balance monitoring frequency with resource usage +2. **Smart Alerting**: Avoid alert fatigue with intelligent filtering +3. **Resource Management**: Monitor the monitoring system itself +4. **Scalable Architecture**: Design for growth and additional services + +This technology context provides the foundation for implementing comprehensive monitoring and alerting in home lab environments. \ No newline at end of file diff --git a/monitoring/examples/cron-job-management.md b/monitoring/examples/cron-job-management.md new file mode 100644 index 0000000..df56f14 --- /dev/null +++ b/monitoring/examples/cron-job-management.md @@ -0,0 +1,326 @@ +# Cron Job Management Patterns + +This document outlines the cron job patterns and management strategies used in the home lab environment. + +## Current Cron Schedule + +### Overview +```bash +# Monthly maintenance +0 2 1 * * /home/cal/bin/ssh_key_maintenance.sh + +# Tdarr monitoring and management +*/10 * * * * python3 /mnt/NV2/Development/claude-home/scripts/tdarr_monitor.py --server http://10.10.0.43:8265 --check nodes --detect-stuck --discord-alerts >/dev/null 2>&1 +0 */6 * * * find "/mnt/NV2/tdarr-cache/nobara-pc-gpu-unmapped/temp/" -name "tdarr-workDir2-*" -type d -mmin +360 -exec rm -rf {} \; 2>/dev/null || true +0 3 * * * find "/mnt/NV2/tdarr-cache/nobara-pc-gpu-unmapped/media" -name "*.temp" -o -name "*.tdarr" -mtime +1 -delete 2>/dev/null || true + +# Disabled/legacy jobs +#*/20 * * * * /mnt/NV2/Development/claude-home/scripts/monitoring/tdarr-timeout-monitor.sh +``` + +## Job Categories + +### 1. System Maintenance +**SSH Key Maintenance** +- **Schedule**: `0 2 1 * *` (Monthly, 1st at 2 AM) +- **Purpose**: Maintain SSH key security and rotation +- **Location**: `/home/cal/bin/ssh_key_maintenance.sh` +- **Priority**: High (security-critical) + +### 2. Monitoring & Alerting +**Tdarr System Monitoring** +- **Schedule**: `*/10 * * * *` (Every 10 minutes) +- **Purpose**: Monitor Tdarr nodes, detect stuck jobs, send Discord alerts +- **Features**: + - Stuck job detection (30-minute threshold) + - Discord notifications with rich embeds + - Persistent memory state tracking +- **Script**: `/mnt/NV2/Development/claude-home/scripts/tdarr_monitor.py` +- **Output**: Silent (`>/dev/null 2>&1`) + +### 3. Cleanup & Housekeeping +**Tdarr Work Directory Cleanup** +- **Schedule**: `0 */6 * * *` (Every 6 hours) +- **Purpose**: Remove stale Tdarr work directories +- **Target**: `/mnt/NV2/tdarr-cache/nobara-pc-gpu-unmapped/temp/` +- **Pattern**: `tdarr-workDir2-*` directories +- **Age threshold**: 6 hours (`-mmin +360`) + +**Failed Tdarr Job Cleanup** +- **Schedule**: `0 3 * * *` (Daily at 3 AM) +- **Purpose**: Remove failed transcode artifacts +- **Target**: `/mnt/NV2/tdarr-cache/nobara-pc-gpu-unmapped/media/` +- **Patterns**: `*.temp` and `*.tdarr` files +- **Age threshold**: 24 hours (`-mtime +1`) + +## Design Patterns + +### 1. 
Absolute Paths +**Always use absolute paths in cron jobs** +```bash +# Good +*/10 * * * * python3 /full/path/to/script.py + +# Bad - relative paths don't work in cron +*/10 * * * * python3 scripts/script.py +``` + +### 2. Error Handling +**Standard error suppression pattern** +```bash +command 2>/dev/null || true +``` +- Suppresses stderr to prevent cron emails +- `|| true` ensures job always exits successfully + +### 3. Time-based Cleanup +**Safe age thresholds for different content types** +- **Work directories**: 6 hours (short-lived, safe for active jobs) +- **Temp files**: 24 hours (allows for long transcodes) +- **Log files**: 7-30 days (depending on importance) + +### 4. Resource-aware Scheduling +**Avoid resource conflicts** +```bash +# System maintenance at low-usage times +0 2 1 * * maintenance_script.sh + +# Cleanup during off-peak hours +0 3 * * * cleanup_script.sh + +# Monitoring with high frequency during active hours +*/10 * * * * monitor_script.py +``` + +## Management Workflow + +### Adding New Cron Jobs + +1. **Backup current crontab** + ```bash + crontab -l > /tmp/crontab_backup_$(date +%Y%m%d) + ``` + +2. **Edit safely** + ```bash + crontab -l > /tmp/new_crontab + echo "# New job description" >> /tmp/new_crontab + echo "schedule command" >> /tmp/new_crontab + crontab /tmp/new_crontab + ``` + +3. **Verify installation** + ```bash + crontab -l + ``` + +### Proper HERE Document (EOF) Usage + +**When building cron files with HERE documents, use proper EOF formatting:** + +#### βœ… **Correct Format** +```bash +cat > /tmp/new_crontab << 'EOF' +0 2 1 * * /home/cal/bin/ssh_key_maintenance.sh +# Tdarr monitoring every 10 minutes +*/10 * * * * python3 /path/to/script.py --args +EOF +``` + +#### ❌ **Common Mistakes** +```bash +# BAD - Causes "EOF not found" errors +cat >> /tmp/crontab << 'EOF' +new_cron_job +EOF + +# Results in malformed file with literal "EOF < /dev/null" lines +``` + +#### **Key Rules for EOF in Cron Files** + +1. **Use `cat >` not `cat >>`** for building complete files + ```bash + # Good - overwrites file cleanly + cat > /tmp/crontab << 'EOF' + + # Bad - appends and can create malformed files + cat >> /tmp/crontab << 'EOF' + ``` + +2. **Quote the EOF delimiter** to prevent variable expansion + ```bash + # Good - literal content + cat > file << 'EOF' + + # Can cause issues with special characters + cat > file << EOF + ``` + +3. **Clean up malformed files** before installing + ```bash + # Remove EOF artifacts and empty lines + head -n -1 /tmp/crontab > /tmp/clean_crontab + + # Or use grep to remove EOF lines + grep -v "^EOF" /tmp/crontab > /tmp/clean_crontab + ``` + +4. 
**Alternative approach - direct echo method** + ```bash + crontab -l > /tmp/current_crontab + echo "# New job comment" >> /tmp/current_crontab + echo "*/10 * * * * /path/to/command" >> /tmp/current_crontab + crontab /tmp/current_crontab + ``` + +#### **Debugging EOF Issues** + +```bash +# Check for EOF artifacts in crontab file +cat -n /tmp/crontab | grep EOF + +# Validate crontab syntax before installing +crontab -T /tmp/crontab # Some systems support this + +# Manual cleanup if needed +sed '/^EOF/d' /tmp/crontab > /tmp/clean_crontab +``` + +### Testing Cron Jobs + +**Test command syntax first** +```bash +# Test the actual command before scheduling +python3 /full/path/to/script.py --test + +# Check file permissions +ls -la /path/to/script + +# Verify paths exist +ls -la /target/directory/ +``` + +**Test with minimal frequency** +```bash +# Start with 5-minute intervals for testing +*/5 * * * * /path/to/new/script.sh + +# Monitor logs +tail -f /var/log/syslog | grep CRON +``` + +### Monitoring Cron Jobs + +**Check cron logs** +```bash +# System cron logs +sudo journalctl -u cron -f + +# User cron logs +grep CRON /var/log/syslog | grep $(whoami) +``` + +**Verify job execution** +```bash +# Check if cleanup actually ran +ls -la /target/cleanup/directory/ + +# Monitor script logs +tail -f /path/to/script/logs/ +``` + +## Security Considerations + +### 1. Path Security +- Use absolute paths to prevent PATH manipulation +- Ensure scripts are owned by correct user +- Set appropriate permissions (750 for scripts) + +### 2. Command Injection Prevention +```bash +# Good - quoted paths +find "/path/with spaces/" -name "pattern" + +# Bad - unquoted paths vulnerable to injection +find /path/with spaces/ -name pattern +``` + +### 3. Resource Limits +- Prevent runaway processes with `timeout` +- Use `ionice` for I/O intensive cleanup jobs +- Consider `nice` for CPU-intensive tasks + +## Troubleshooting + +### Common Issues + +**Job not running** +1. Check cron service: `sudo systemctl status cron` +2. Verify crontab syntax: `crontab -l` +3. Check file permissions and paths +4. Review cron logs for errors + +**Environment differences** +- Cron runs with minimal environment +- Set PATH explicitly if needed +- Use absolute paths for all commands + +**Silent failures** +- Remove `2>/dev/null` temporarily for debugging +- Add logging to scripts +- Check script exit codes + +### Debugging Commands +```bash +# Test cron environment +* * * * * env > /tmp/cron_env.txt + +# Test script in cron-like environment +env -i /bin/bash -c 'your_command_here' + +# Monitor real-time execution +sudo tail -f /var/log/syslog | grep CRON +``` + +## Best Practices + +### 1. Documentation +- Comment all cron jobs with purpose and schedule +- Document in this patterns file +- Include contact info for complex jobs + +### 2. Maintenance +- Regular review of active jobs (quarterly) +- Remove obsolete jobs promptly +- Update absolute paths when moving scripts + +### 3. Monitoring +- Implement health checks for critical jobs +- Use Discord/email notifications for failures +- Monitor disk space usage from cleanup jobs + +### 4. 
Backup Strategy +- Backup crontab before changes +- Version control cron configurations +- Document restoration procedures + +## Future Enhancements + +### Planned Additions +- **Log rotation**: Automated cleanup of application logs +- **Health checks**: System resource monitoring +- **Backup verification**: Automated backup integrity checks +- **Certificate renewal**: SSL/TLS certificate automation + +### Migration Considerations +- **Systemd timers**: Consider migration for complex scheduling +- **Configuration management**: Ansible or similar for multi-host +- **Centralized logging**: Aggregated cron job monitoring + +--- + +## Related Documentation +- [Tdarr Monitoring Script](../scripts/README.md#tdarr_monitorpy---enhanced-tdarr-monitoring) +- [System Maintenance](../reference/system-maintenance.md) +- [Discord Integration](../examples/discord-notifications.md) \ No newline at end of file diff --git a/scripts/monitoring/README.md b/monitoring/scripts/README.md similarity index 100% rename from scripts/monitoring/README.md rename to monitoring/scripts/README.md diff --git a/scripts/monitoring/setup-discord-monitoring.md b/monitoring/scripts/setup-discord-monitoring.md similarity index 100% rename from scripts/monitoring/setup-discord-monitoring.md rename to monitoring/scripts/setup-discord-monitoring.md diff --git a/scripts/monitoring/tdarr-timeout-monitor.sh b/monitoring/scripts/tdarr-timeout-monitor.sh similarity index 100% rename from scripts/monitoring/tdarr-timeout-monitor.sh rename to monitoring/scripts/tdarr-timeout-monitor.sh diff --git a/monitoring/scripts/tdarr_monitor.py b/monitoring/scripts/tdarr_monitor.py new file mode 100755 index 0000000..db936f4 --- /dev/null +++ b/monitoring/scripts/tdarr_monitor.py @@ -0,0 +1,1234 @@ +#!/usr/bin/env python3 +""" +Tdarr API Monitoring Script with Stuck Job Detection and Discord Alerts + +Monitors Tdarr server via its web API endpoints: +- Server status and health +- Queue status and statistics +- Node status and performance +- Library scan progress +- Worker activity +- Stuck job detection with configurable timeouts +- Discord notifications for alerts and status updates + +Usage: + python3 tdarr_monitor.py --server http://10.10.0.43:8265 --check all + python3 tdarr_monitor.py --server http://10.10.0.43:8265 --check queue + python3 tdarr_monitor.py --server http://10.10.0.43:8265 --check nodes + + # Enable stuck job detection (30 minute threshold) + python3 tdarr_monitor.py --server http://10.10.0.43:8265 --check nodes --detect-stuck + + # Custom stuck threshold (15 minutes) + python3 tdarr_monitor.py --server http://10.10.0.43:8265 --check all --detect-stuck --stuck-threshold 15 + + # Enable Discord alerts for stuck jobs (uses default webhook) + python3 tdarr_monitor.py --server http://10.10.0.43:8265 --check nodes --detect-stuck --discord-alerts + + # Automatically clear hung workers when detected + python3 tdarr_monitor.py --server http://10.10.0.43:8265 --check nodes --detect-stuck --clear-hung-workers + + # Full monitoring with automatic clearing and Discord alerts + python3 tdarr_monitor.py --server http://10.10.0.43:8265 --check nodes --detect-stuck --clear-hung-workers --discord-alerts + + # Test Discord integration (uses default webhook) + python3 tdarr_monitor.py --server http://10.10.0.43:8265 --discord-test + + # Enable file logging with custom path and debug level + python3 tdarr_monitor.py --server http://10.10.0.43:8265 --check nodes --log-file /tmp/tdarr_debug.log --log-level DEBUG + + # Disable file logging (console 
only) + python3 tdarr_monitor.py --server http://10.10.0.43:8265 --check nodes --no-log-file + + # Clear memory state + python3 tdarr_monitor.py --server http://10.10.0.43:8265 --clear-memory +""" + +import argparse +import json +import logging +import logging.handlers +import sys +import os +import pickle +from dataclasses import dataclass, asdict +from datetime import datetime, timedelta +from typing import Dict, List, Optional, Any, Union +import requests +from urllib.parse import urljoin + + +@dataclass +class WorkerSnapshot: + worker_id: str + node_id: str + worker_type: str + file: str + percentage: float + status: str + fps: int + eta: str + timestamp: datetime + + +@dataclass +class StuckJob: + worker_snapshot: WorkerSnapshot + first_seen: datetime + stuck_duration_minutes: float + is_stuck: bool = True + + +@dataclass +class MemoryState: + worker_snapshots: Dict[str, WorkerSnapshot] + stuck_jobs: Dict[str, StuckJob] + last_updated: datetime + + +@dataclass +class ServerStatus: + timestamp: str + server_url: str + status: str + error: Optional[str] = None + version: Optional[str] = None + server_id: Optional[str] = None + uptime: Optional[str] = None + system_info: Optional[Dict[str, Any]] = None + + +@dataclass +class QueueStats: + total_files: int + queued: int + processing: int + completed: int + queue_items: List[Dict[str, Any]] + + +@dataclass +class QueueStatus: + timestamp: str + queue_stats: Optional[QueueStats] = None + error: Optional[str] = None + + +@dataclass +class NodeInfo: + id: Optional[str] + nodeName: Optional[str] + status: str + lastSeen: Optional[int] + version: Optional[str] + platform: Optional[str] + workers: Dict[str, int] + processing: List[Dict[str, Any]] + + +@dataclass +class NodeSummary: + total_nodes: int + online_nodes: int + offline_nodes: int + online_details: List[NodeInfo] + offline_details: List[NodeInfo] + + +@dataclass +class NodeStatus: + timestamp: str + nodes: List[Dict[str, Any]] + node_summary: Optional[NodeSummary] = None + stuck_jobs: List[StuckJob] = None + error: Optional[str] = None + + +@dataclass +class LibraryInfo: + name: Optional[str] + path: Optional[str] + file_count: int + scan_progress: int + last_scan: Optional[str] + is_scanning: bool + + +@dataclass +class ScanStatus: + total_libraries: int + total_files: int + scanning_libraries: int + + +@dataclass +class LibraryStatus: + timestamp: str + libraries: List[LibraryInfo] + scan_status: Optional[ScanStatus] = None + error: Optional[str] = None + + +@dataclass +class Statistics: + total_transcodes: int + space_saved: int + total_files_processed: int + failed_transcodes: int + processing_speed: int + eta: Optional[str] + + +@dataclass +class StatisticsStatus: + timestamp: str + statistics: Optional[Statistics] = None + error: Optional[str] = None + + +@dataclass +class HealthCheck: + status: str + healthy: bool + online_count: Optional[int] = None + total_count: Optional[int] = None + accessible: Optional[bool] = None + total_items: Optional[int] = None + + +@dataclass +class HealthStatus: + timestamp: str + overall_status: str + checks: Dict[str, HealthCheck] + + +@dataclass +class DiscordEmbedField: + name: str + value: str + inline: bool = False + + +@dataclass +class DiscordEmbed: + title: str + description: str + color: int + fields: List[DiscordEmbedField] = None + timestamp: str = None + + def __post_init__(self): + if self.fields is None: + self.fields = [] + if self.timestamp is None: + self.timestamp = datetime.utcnow().isoformat() + + +class DiscordNotifier: + def 
__init__(self, webhook_url: str, timeout: int = 10): + """Initialize Discord notifier with webhook URL.""" + self.webhook_url = webhook_url + self.timeout = timeout + self.session = requests.Session() + self.logger = logging.getLogger(f"{__name__}.DiscordNotifier") + + def send_content_message(self, content: str, username: str = "Tdarr Monitor") -> bool: + """Send a simple content message to Discord. + + Args: + content: The message content to send + username: Bot username to display + + Returns: + True if successful, False otherwise + """ + payload = { + "content": content, + "username": username + } + + return self._send_webhook(payload) + + def send_embed_message(self, + title: str, + description: str, + color: int = 0xff6b6b, # Red by default + fields: List[DiscordEmbedField] = None, + username: str = "Tdarr Monitor") -> bool: + """Send an embed message to Discord. + + Args: + title: Embed title + description: Embed description + color: Embed color (hex integer, default red) + fields: List of embed fields + username: Bot username to display + + Returns: + True if successful, False otherwise + """ + embed = DiscordEmbed( + title=title, + description=description, + color=color, + fields=fields or [] + ) + + payload = { + "username": username, + "embeds": [asdict(embed)] + } + + return self._send_webhook(payload) + + def send_stuck_job_alert(self, stuck_jobs: List[StuckJob]) -> bool: + """Send alert for stuck jobs using embed format. + + Args: + stuck_jobs: List of stuck jobs to report + + Returns: + True if successful, False otherwise + """ + if not stuck_jobs: + return True + + # Create embed fields for each stuck job + fields = [] + for i, stuck_job in enumerate(stuck_jobs[:10]): # Limit to 10 jobs (Discord embed field limit is 25) + ws = stuck_job.worker_snapshot + field_value = ( + f"**File:** {os.path.basename(ws.file)}\n" + f"**Progress:** {ws.percentage}%\n" + f"**Status:** {ws.status}\n" + f"**Duration:** {stuck_job.stuck_duration_minutes:.1f} minutes\n" + f"**Node:** {ws.node_id}" + ) + + fields.append(DiscordEmbedField( + name=f"🚨 Stuck Job {i+1}: {ws.worker_id}", + value=field_value, + inline=True + )) + + # Add summary field if there are more jobs + if len(stuck_jobs) > 10: + fields.append(DiscordEmbedField( + name="Additional Jobs", + value=f"... and {len(stuck_jobs) - 10} more stuck jobs", + inline=False + )) + + title = f"🚨 Tdarr Stuck Jobs Detected ({len(stuck_jobs)})" + description = ( + f"Detected {len(stuck_jobs)} stuck job{'s' if len(stuck_jobs) != 1 else ''} " + f"in your Tdarr system. These jobs may need manual intervention." + ) + + return self.send_embed_message( + title=title, + description=description, + color=0xff6b6b, # Red color for alerts + fields=fields + ) + + def send_system_status(self, + server_status: ServerStatus, + node_status: NodeStatus, + stuck_jobs: List[StuckJob] = None) -> bool: + """Send system status summary using embed format. 
+ + Args: + server_status: Server status information + node_status: Node status information + stuck_jobs: Optional stuck jobs list + + Returns: + True if successful, False otherwise + """ + # Determine overall health color + is_healthy = ( + server_status.status == "good" and + not server_status.error and + not node_status.error and + (not stuck_jobs or len(stuck_jobs) == 0) + ) + + color = 0x28a745 if is_healthy else 0xff6b6b # Green if healthy, red if not + + # Build description + description_parts = [ + f"**Server Status:** {server_status.status.title()}", + f"**Version:** {getattr(server_status, 'version', 'Unknown')}" + ] + + if node_status.node_summary: + description_parts.extend([ + f"**Total Nodes:** {node_status.node_summary.total_nodes}", + f"**Online Nodes:** {node_status.node_summary.online_nodes}", + f"**Offline Nodes:** {node_status.node_summary.offline_nodes}" + ]) + + if stuck_jobs: + description_parts.append(f"**Stuck Jobs:** {len(stuck_jobs)}") + + # Add node details as fields + fields = [] + if node_status.node_summary and node_status.node_summary.online_details: + for node in node_status.node_summary.online_details: + active_workers = len(node.processing) if node.processing else 0 + field_value = ( + f"**Status:** Online\n" + f"**Platform:** {node.platform or 'Unknown'}\n" + f"**Active Workers:** {active_workers}\n" + f"**CPU Workers:** {node.workers.get('cpu', 0)}\n" + f"**GPU Workers:** {node.workers.get('gpu', 0)}" + ) + + fields.append(DiscordEmbedField( + name=f"πŸ“‘ {node.nodeName or node.id}", + value=field_value, + inline=True + )) + + title = "πŸ“Š Tdarr System Status" + if not is_healthy: + title = "⚠️ Tdarr System Alert" + + return self.send_embed_message( + title=title, + description="\n".join(description_parts), + color=color, + fields=fields + ) + + def _send_webhook(self, payload: Dict[str, Any]) -> bool: + """Send payload to Discord webhook. 
+ + Args: + payload: JSON payload to send + + Returns: + True if successful, False otherwise + """ + try: + response = self.session.post( + self.webhook_url, + json=payload, + timeout=self.timeout + ) + response.raise_for_status() + self.logger.info("Discord notification sent successfully") + return True + + except requests.exceptions.RequestException as e: + self.logger.error(f"Failed to send Discord notification: {e}") + return False + except Exception as e: + self.logger.error(f"Unexpected error sending Discord notification: {e}") + return False + + +class StuckJobDetector: + def __init__(self, memory_file: str = ".claude/tmp/tdarr_memory.pkl", stuck_threshold_minutes: int = 30): + """Initialize stuck job detector with memory persistence.""" + self.memory_file = os.path.abspath(memory_file) # Use absolute path + self.stuck_threshold_minutes = stuck_threshold_minutes + self.logger = logging.getLogger(f"{__name__}.StuckJobDetector") + self.logger.debug(f"Using memory file: {self.memory_file}") + + # Ensure memory directory exists before loading/saving state + # (dirname of the absolute path is never empty, unlike the raw argument) + os.makedirs(os.path.dirname(self.memory_file), exist_ok=True) + self.memory_state = self._load_memory_state() + + def _load_memory_state(self) -> MemoryState: + """Load memory state from disk or create new one.""" + if os.path.exists(self.memory_file): + try: + with open(self.memory_file, 'rb') as f: + memory_state = pickle.load(f) + self.logger.debug(f"Loaded memory state: {len(memory_state.worker_snapshots)} workers, {len(memory_state.stuck_jobs)} stuck jobs") + return memory_state + except Exception as e: + self.logger.warning(f"Failed to load memory state: {e}, creating new state") + else: + self.logger.debug(f"Memory file {self.memory_file} does not exist, creating new state") + + return MemoryState( + worker_snapshots={}, + stuck_jobs={}, + last_updated=datetime.now() + ) + + def _save_memory_state(self): + """Save memory state to disk.""" + try: + with open(self.memory_file, 'wb') as f: + pickle.dump(self.memory_state, f) + except Exception as e: + self.logger.error(f"Failed to save memory state: {e}") + + def _create_worker_key(self, node_id: str, worker_id: str) -> str: + """Create unique key for worker identification.""" + return f"{node_id}:{worker_id}" + + def _is_worker_stuck(self, current: WorkerSnapshot, previous: WorkerSnapshot) -> bool: + """Check if worker is stuck based on comparison with previous snapshot.""" + worker_key = f"{current.node_id}:{current.worker_id}" + + # Check each condition individually for detailed logging + file_same = current.file == previous.file + percentage_same = current.percentage == previous.percentage + status_same = current.status == previous.status + fps_same = current.fps == previous.fps + eta_same = current.eta == previous.eta + + is_stuck = file_same and percentage_same and status_same and fps_same and eta_same + + # Log detailed comparison info + self.logger.debug(f"Worker {worker_key} stuck check:") + self.logger.debug(f" File: '{current.file}' == '{previous.file}' = {file_same}") + self.logger.debug(f" Percentage: {current.percentage}% == {previous.percentage}% = {percentage_same}") + self.logger.debug(f" Status: '{current.status}' == '{previous.status}' = {status_same}") + self.logger.debug(f" FPS: {current.fps} == {previous.fps} = {fps_same}") + self.logger.debug(f" ETA: '{current.eta}' == '{previous.eta}' = {eta_same}") + self.logger.debug(f" β†’ Result: {'STUCK' if is_stuck else 'NOT STUCK'}") + + # Log INFO level when we detect changes (worker making progress) + if not is_stuck: + if not percentage_same: + 
self.logger.info(f"Worker {worker_key} making progress: {previous.percentage}% β†’ {current.percentage}%") + elif not status_same: + self.logger.info(f"Worker {worker_key} status changed: '{previous.status}' β†’ '{current.status}'") + elif not file_same: + self.logger.info(f"Worker {worker_key} file changed: '{previous.file}' β†’ '{current.file}'") + + return is_stuck + + def update_workers(self, nodes_data: Dict[str, Any]) -> List[StuckJob]: + """Update worker snapshots and detect stuck jobs.""" + current_time = datetime.now() + current_workers = {} + detected_stuck_jobs = [] + + # Extract current worker states from nodes data + for node_id, node_data in nodes_data.items(): + workers = node_data.get('workers', {}) + for worker_id, worker_data in workers.items(): + worker_key = self._create_worker_key(node_id, worker_id) + + # Create current snapshot + current_snapshot = WorkerSnapshot( + worker_id=worker_id, + node_id=node_id, + worker_type=worker_data.get('workerType', 'unknown'), + file=worker_data.get('file', ''), + percentage=worker_data.get('percentage', -1), + status=worker_data.get('status', ''), + fps=worker_data.get('fps', 0), + eta=worker_data.get('ETA', ''), + timestamp=current_time + ) + + current_workers[worker_key] = current_snapshot + + # Log all workers being tracked + self.logger.debug(f"Tracking worker {worker_key}: {current_snapshot.status} at {current_snapshot.percentage}% on '{current_snapshot.file}'") + + # Check if worker was previously tracked + if worker_key in self.memory_state.worker_snapshots: + previous_snapshot = self.memory_state.worker_snapshots[worker_key] + + # Check if worker is stuck + if self._is_worker_stuck(current_snapshot, previous_snapshot): + # Calculate how long it's been stuck + time_since_previous = (current_time - previous_snapshot.timestamp).total_seconds() / 60 + self.logger.debug(f"Worker {worker_key} has been stuck for {time_since_previous:.1f} minutes since last check") + self.logger.debug(f"Worker {worker_key} checking stuck_jobs dict: {list(self.memory_state.stuck_jobs.keys())}") + + if worker_key in self.memory_state.stuck_jobs: + # Already known stuck job, update duration + stuck_job = self.memory_state.stuck_jobs[worker_key] + stuck_duration = current_time - stuck_job.first_seen + stuck_job.stuck_duration_minutes = stuck_duration.total_seconds() / 60 + stuck_job.worker_snapshot = current_snapshot + + self.logger.debug(f"Worker {worker_key} known stuck job - duration: {stuck_job.stuck_duration_minutes:.1f} min, threshold: {self.stuck_threshold_minutes} min") + if stuck_job.stuck_duration_minutes >= self.stuck_threshold_minutes: + self.logger.debug(f"Worker {worker_key} EXCEEDS threshold - adding to detected stuck jobs") + detected_stuck_jobs.append(stuck_job) + else: + self.logger.debug(f"Worker {worker_key} below threshold - not flagging yet") + else: + # New stuck job detected - add to memory immediately to start tracking + first_seen = previous_snapshot.timestamp + stuck_duration = current_time - first_seen + stuck_duration_minutes = stuck_duration.total_seconds() / 60 + + self.logger.debug(f"Worker {worker_key} NEW stuck job - first_seen: {first_seen}, current: {current_time}") + self.logger.debug(f"Worker {worker_key} NEW stuck job - duration: {stuck_duration_minutes:.1f} min, threshold: {self.stuck_threshold_minutes} min") + + # Create stuck job entry immediately to track duration across runs + stuck_job = StuckJob( + worker_snapshot=current_snapshot, + first_seen=first_seen, + stuck_duration_minutes=stuck_duration_minutes, + 
is_stuck=True + ) + self.memory_state.stuck_jobs[worker_key] = stuck_job + + if stuck_duration_minutes >= self.stuck_threshold_minutes: + self.logger.debug(f"Worker {worker_key} NEW stuck job EXCEEDS threshold - flagging for clearing") + detected_stuck_jobs.append(stuck_job) + else: + self.logger.debug(f"Worker {worker_key} NEW stuck job below threshold - tracking in memory") + else: + # Worker is not stuck, remove from stuck jobs if present + if worker_key in self.memory_state.stuck_jobs: + del self.memory_state.stuck_jobs[worker_key] + self.logger.info(f"Worker {worker_key} is no longer stuck") + else: + # New worker, start tracking it + self.logger.info(f"New worker detected: {worker_key} - {current_snapshot.status} at {current_snapshot.percentage}% on '{current_snapshot.file}'") + + # Clean up stuck jobs for workers that no longer exist + stuck_jobs_to_remove = [] + for worker_key in self.memory_state.stuck_jobs: + if worker_key not in current_workers: + stuck_jobs_to_remove.append(worker_key) + + for worker_key in stuck_jobs_to_remove: + del self.memory_state.stuck_jobs[worker_key] + self.logger.info(f"Removed stuck job tracking for missing worker: {worker_key}") + + # Update memory state + self.memory_state.worker_snapshots = current_workers + self.memory_state.last_updated = current_time + + # Save to disk + self._save_memory_state() + + return detected_stuck_jobs + + def get_stuck_jobs(self) -> List[StuckJob]: + """Get current list of stuck jobs.""" + return list(self.memory_state.stuck_jobs.values()) + + def clear_memory(self): + """Clear all memory state.""" + self.memory_state = MemoryState( + worker_snapshots={}, + stuck_jobs={}, + last_updated=datetime.now() + ) + self._save_memory_state() + self.logger.info("Memory state cleared") + + +class TdarrMonitor: + def __init__(self, server_url: str, timeout: int = 30, enable_stuck_detection: bool = False, + stuck_threshold_minutes: int = 30, memory_file: str = ".claude/tmp/tdarr_memory.pkl", + discord_webhook_url: str = None, enable_discord_alerts: bool = False, + log_file: Optional[str] = None, log_level: str = "INFO", clear_hung_workers: bool = False): + """Initialize Tdarr monitor with server URL.""" + self.server_url = server_url.rstrip('/') + self.timeout = timeout + self.session = requests.Session() + self.enable_stuck_detection = enable_stuck_detection + self.enable_discord_alerts = enable_discord_alerts + self.clear_hung_workers_enabled = clear_hung_workers + + # Configure logging first + self._setup_logging(log_file, log_level) + self.logger = logging.getLogger(__name__) + + # Initialize stuck job detector if enabled + self.stuck_detector = None + if enable_stuck_detection: + self.stuck_detector = StuckJobDetector(memory_file, stuck_threshold_minutes) + + # Initialize Discord notifier if enabled + self.discord_notifier = None + if enable_discord_alerts: + if discord_webhook_url: + self.discord_notifier = DiscordNotifier(discord_webhook_url) + else: + self.logger.warning("Discord alerts enabled but no webhook URL provided") + + def _setup_logging(self, log_file: Optional[str] = None, log_level: str = "INFO"): + """Configure logging with optional file rotation.""" + # Clear any existing handlers + root_logger = logging.getLogger() + root_logger.handlers.clear() + + # Set log level + level = getattr(logging, log_level.upper(), logging.INFO) + root_logger.setLevel(level) + + # Create formatter + formatter = logging.Formatter( + '%(asctime)s - %(name)s - %(levelname)s - %(message)s', + datefmt='%Y-%m-%d %H:%M:%S' + ) + + # 
Console handler (for interactive use) + console_handler = logging.StreamHandler(sys.stdout) + console_handler.setFormatter(formatter) + root_logger.addHandler(console_handler) + + # File handler with rotation (if log_file specified) + if log_file: + # Ensure log directory exists + log_dir = os.path.dirname(log_file) + if log_dir: + os.makedirs(log_dir, exist_ok=True) + + # Rotating file handler: 10MB max, keep 5 backup files + file_handler = logging.handlers.RotatingFileHandler( + log_file, + maxBytes=10 * 1024 * 1024, # 10MB + backupCount=5, + encoding='utf-8' + ) + file_handler.setFormatter(formatter) + root_logger.addHandler(file_handler) + + def _make_request(self, endpoint: str) -> Optional[Dict[str, Any]]: + """Make HTTP request to Tdarr API endpoint.""" + url = urljoin(self.server_url, endpoint) + + try: + response = self.session.get(url, timeout=self.timeout) + response.raise_for_status() + return response.json() + + except requests.exceptions.RequestException as e: + self.logger.error(f"Request failed for {url}: {e}") + return None + except json.JSONDecodeError as e: + self.logger.error(f"JSON decode failed for {url}: {e}") + return None + + def clear_hung_workers(self, stuck_jobs: Optional[List[StuckJob]] = None) -> bool: + """Clear hung workers via Tdarr API using kill-worker endpoint. + + Args: + stuck_jobs: List of StuckJob objects to clear. Each contains worker and node information. + + Returns: + True if all workers cleared successfully, False otherwise + """ + if not stuck_jobs: + self.logger.info("No stuck jobs provided for clearing hung workers") + return True + + success_count = 0 + total_count = len(stuck_jobs) + + for stuck_job in stuck_jobs: + worker_snapshot = stuck_job.worker_snapshot + try: + # Use the kill-worker endpoint with correct payload format + endpoint = '/api/v2/kill-worker' + payload = { + "data": { + "nodeID": worker_snapshot.node_id, + "workerID": worker_snapshot.worker_id + } + } + + url = urljoin(self.server_url, endpoint) + response = self.session.post(url, json=payload, timeout=self.timeout) + response.raise_for_status() + + self.logger.info(f"Successfully killed hung worker: {worker_snapshot.node_id}:{worker_snapshot.worker_id}") + success_count += 1 + + except requests.exceptions.RequestException as e: + self.logger.error(f"Failed to kill worker {worker_snapshot.node_id}:{worker_snapshot.worker_id}: {e}") + except Exception as e: + self.logger.error(f"Unexpected error killing worker {worker_snapshot.node_id}:{worker_snapshot.worker_id}: {e}") + + self.logger.info(f"Cleared {success_count}/{total_count} hung workers") + return success_count == total_count + + def get_server_status(self) -> ServerStatus: + """Get overall server status and configuration.""" + timestamp = datetime.now().isoformat() + + # Try to get server info from API + data = self._make_request('/api/v2/status') + if data: + server_status = data.get('status', 'unknown') + self.logger.info(f"Server check completed: status={server_status}, version={data.get('version', 'unknown')}") + return ServerStatus( + timestamp=timestamp, + server_url=self.server_url, + status=server_status, + version=data.get('version'), + uptime=data.get('uptime') + ) + else: + self.logger.error("Server check failed: Unable to connect to Tdarr server") + return ServerStatus( + timestamp=timestamp, + server_url=self.server_url, + status='offline', + error='Unable to connect to Tdarr server' + ) + + def get_queue_status(self) -> QueueStatus: + """Get transcoding queue status and statistics.""" + timestamp = 
datetime.now().isoformat() + + # Get queue information + data = self._make_request('/api/v2/get-queue') + if data: + queue_data = data.get('queue', []) + + # Calculate queue statistics + total_files = len(queue_data) + queued_files = len([f for f in queue_data if f.get('status') == 'Queued']) + processing_files = len([f for f in queue_data if f.get('status') == 'Processing']) + completed_files = len([f for f in queue_data if f.get('status') == 'Completed']) + + queue_stats = QueueStats( + total_files=total_files, + queued=queued_files, + processing=processing_files, + completed=completed_files, + queue_items=queue_data[:10] # First 10 items for details + ) + + return QueueStatus( + timestamp=timestamp, + queue_stats=queue_stats + ) + else: + return QueueStatus( + timestamp=timestamp, + error='Unable to fetch queue data' + ) + + def get_node_status(self) -> NodeStatus: + """Get status of all connected nodes.""" + timestamp = datetime.now().isoformat() + + # Get nodes information (using correct endpoint) + data = self._make_request('/api/v2/get-nodes') + if data: + # Handle the actual data structure returned by Tdarr API + nodes_dict = data if isinstance(data, dict) else {} + nodes = [] + + # Process node information + online_nodes = [] + offline_nodes = [] + + for node_id, node_data in nodes_dict.items(): + node_info = NodeInfo( + id=node_id, + nodeName=node_data.get('nodeName'), + status='online', # Assume online if in response + lastSeen=None, + version=node_data.get('config', {}).get('version'), + platform=node_data.get('config', {}).get('platform_arch_isdocker'), + workers={ + 'cpu': len([w for w in node_data.get('workers', {}).values() if 'cpu' in w.get('workerType', '').lower()]), + 'gpu': len([w for w in node_data.get('workers', {}).values() if 'gpu' in w.get('workerType', '').lower()]) + }, + processing=list(node_data.get('workers', {}).values()) + ) + + online_nodes.append(node_info) + nodes.append(node_data) + + # Check for stuck jobs if detection is enabled + stuck_jobs = [] + if self.stuck_detector: + try: + stuck_jobs = self.stuck_detector.update_workers(nodes_dict) + if stuck_jobs: + self.logger.warning(f"Detected {len(stuck_jobs)} stuck jobs") + for stuck_job in stuck_jobs: + self.logger.warning( + f"Stuck job: {stuck_job.worker_snapshot.node_id}:{stuck_job.worker_snapshot.worker_id} " + f"on file '{stuck_job.worker_snapshot.file}' " + f"at {stuck_job.worker_snapshot.percentage}% for {stuck_job.stuck_duration_minutes:.1f} minutes" + ) + + # Clear hung workers if enabled + if self.clear_hung_workers_enabled: + try: + clear_success = self.clear_hung_workers(stuck_jobs) + if clear_success: + self.logger.info(f"Successfully cleared {len(stuck_jobs)} hung workers") + else: + self.logger.warning("Some hung workers could not be cleared") + except Exception as e: + self.logger.error(f"Error clearing hung workers: {e}") + + # Send Discord notification for stuck jobs + if self.discord_notifier: + try: + self.discord_notifier.send_stuck_job_alert(stuck_jobs) + except Exception as e: + self.logger.error(f"Failed to send Discord stuck job alert: {e}") + + except Exception as e: + self.logger.error(f"Error in stuck job detection: {e}") + + node_summary = NodeSummary( + total_nodes=len(nodes), + online_nodes=len(online_nodes), + offline_nodes=len(offline_nodes), + online_details=online_nodes, + offline_details=offline_nodes + ) + + # Log successful node check with summary + if stuck_jobs: + self.logger.info(f"Node check completed: {len(nodes)} nodes online, {len(stuck_jobs)} stuck jobs 
detected") + else: + self.logger.info(f"Node check completed: {len(nodes)} nodes online, no stuck jobs detected") + + return NodeStatus( + timestamp=timestamp, + nodes=nodes, + node_summary=node_summary, + stuck_jobs=stuck_jobs + ) + else: + self.logger.error("Node check failed: Unable to fetch node data") + return NodeStatus( + timestamp=timestamp, + nodes=[], + error='Unable to fetch node data' + ) + + def get_library_status(self) -> LibraryStatus: + """Get library scan status and file statistics.""" + timestamp = datetime.now().isoformat() + + # Get library information + data = self._make_request('/api/v2/get-libraries') + if data: + libraries = data.get('libraries', []) + + library_stats = [] + total_files = 0 + + for lib in libraries: + lib_info = LibraryInfo( + name=lib.get('name'), + path=lib.get('path'), + file_count=lib.get('totalFiles', 0), + scan_progress=lib.get('scanProgress', 0), + last_scan=lib.get('lastScan'), + is_scanning=lib.get('isScanning', False) + ) + library_stats.append(lib_info) + total_files += lib_info.file_count + + scan_status = ScanStatus( + total_libraries=len(libraries), + total_files=total_files, + scanning_libraries=len([l for l in library_stats if l.is_scanning]) + ) + + return LibraryStatus( + timestamp=timestamp, + libraries=library_stats, + scan_status=scan_status + ) + else: + return LibraryStatus( + timestamp=timestamp, + libraries=[], + error='Unable to fetch library data' + ) + + def get_statistics(self) -> StatisticsStatus: + """Get overall Tdarr statistics and health metrics.""" + timestamp = datetime.now().isoformat() + + # Get statistics + data = self._make_request('/api/v2/get-stats') + if data: + stats = data.get('stats', {}) + statistics = Statistics( + total_transcodes=stats.get('totalTranscodes', 0), + space_saved=stats.get('spaceSaved', 0), + total_files_processed=stats.get('totalFilesProcessed', 0), + failed_transcodes=stats.get('failedTranscodes', 0), + processing_speed=stats.get('processingSpeed', 0), + eta=stats.get('eta') + ) + + return StatisticsStatus( + timestamp=timestamp, + statistics=statistics + ) + else: + return StatisticsStatus( + timestamp=timestamp, + error='Unable to fetch statistics' + ) + + def health_check(self) -> HealthStatus: + """Perform comprehensive health check.""" + timestamp = datetime.now().isoformat() + + # Server connectivity + server_status = self.get_server_status() + server_check = HealthCheck( + status=server_status.status, + healthy=server_status.status == 'good' + ) + + # Node connectivity + node_status = self.get_node_status() + nodes_healthy = ( + node_status.node_summary.online_nodes > 0 if node_status.node_summary else False + ) and not node_status.error + + nodes_check = HealthCheck( + status='online' if nodes_healthy else 'offline', + healthy=nodes_healthy, + online_count=node_status.node_summary.online_nodes if node_status.node_summary else 0, + total_count=node_status.node_summary.total_nodes if node_status.node_summary else 0 + ) + + # Queue status + queue_status = self.get_queue_status() + queue_healthy = not queue_status.error + queue_check = HealthCheck( + status='accessible' if queue_healthy else 'error', + healthy=queue_healthy, + accessible=queue_healthy, + total_items=queue_status.queue_stats.total_files if queue_status.queue_stats else 0 + ) + + checks = { + 'server': server_check, + 'nodes': nodes_check, + 'queue': queue_check + } + + # Determine overall health + all_checks_healthy = all(check.healthy for check in checks.values()) + overall_status = 'healthy' if all_checks_healthy 
else 'unhealthy'
+
+        return HealthStatus(
+            timestamp=timestamp,
+            overall_status=overall_status,
+            checks=checks
+        )
+
+
+def main():
+    parser = argparse.ArgumentParser(description='Monitor Tdarr server via API')
+    parser.add_argument('--server', required=True, help='Tdarr server URL (e.g., http://10.10.0.43:8265)')
+    parser.add_argument('--check', choices=['all', 'status', 'queue', 'nodes', 'libraries', 'stats', 'health'],
+                        default='health', help='Type of check to perform')
+    parser.add_argument('--timeout', type=int, default=30, help='Request timeout in seconds')
+    parser.add_argument('--output', choices=['json', 'pretty'], default='pretty', help='Output format')
+    parser.add_argument('--verbose', action='store_true', help='Enable verbose logging')
+    parser.add_argument('--detect-stuck', action='store_true', help='Enable stuck job detection')
+    parser.add_argument('--stuck-threshold', type=int, default=30, help='Minutes before job is considered stuck (default: 30)')
+    parser.add_argument('--memory-file', default='.claude/tmp/tdarr_memory.pkl', help='Path to memory state file')
+    parser.add_argument('--clear-memory', action='store_true', help='Clear memory state and exit')
+    # Read the webhook from the environment rather than hard-coding it: a webhook
+    # URL embeds a secret token and should not be committed to source control.
+    parser.add_argument('--discord-webhook',
+                        default=os.environ.get('DISCORD_WEBHOOK_URL'),
+                        help='Discord webhook URL for notifications (default: DISCORD_WEBHOOK_URL env var)')
+    parser.add_argument('--discord-alerts', action='store_true', help='Enable Discord alerts for stuck jobs and system status')
+    parser.add_argument('--discord-test', action='store_true', help='Send test Discord message and exit')
+    parser.add_argument('--log-file', default='./scripts/logs/tdarr_monitor.log',
+                        help='Path to log file with rotation (default: ./scripts/logs/tdarr_monitor.log)')
+    parser.add_argument('--log-level', choices=['DEBUG', 'INFO', 'WARNING', 'ERROR'], default='INFO',
+                        help='Logging level (default: INFO)')
+    parser.add_argument('--no-log-file', action='store_true', help='Disable file logging, console only')
+    parser.add_argument('--clear-hung-workers', action='store_true', help='Clear hung workers via API call when stuck jobs are detected')
+
+    args = parser.parse_args()
+
+    if args.verbose:
+        logging.getLogger().setLevel(logging.DEBUG)
+
+    # Handle clear memory command
+    if args.clear_memory:
+        if os.path.exists(args.memory_file):
+            os.remove(args.memory_file)
+            print(f"Memory state cleared: {args.memory_file}")
+        else:
+            print(f"Memory file does not exist: {args.memory_file}")
+        sys.exit(0)
+
+    # Handle Discord test command
+    if args.discord_test:
+        if not args.discord_webhook:
+            print("❌ No webhook configured. Set DISCORD_WEBHOOK_URL or pass --discord-webhook.")
+            sys.exit(1)
+        print("Sending Discord test messages...")
+        notifier = DiscordNotifier(args.discord_webhook)
+
+        # Test content message
+        content_success = notifier.send_content_message(
+            "πŸ§ͺ **Tdarr Monitor Test** - Content message working correctly!"
+        )
+
+        # Test embed message
+        test_fields = [
+            DiscordEmbedField("Test Field 1", "This is a test value", True),
+            DiscordEmbedField("Test Field 2", "Another test value", True),
+        ]
+
+        embed_success = notifier.send_embed_message(
+            title="πŸ§ͺ Tdarr Monitor Test",
+            description="This is a test embed message to verify Discord integration is working correctly.",
+            color=0x00ff00,  # Green
+            fields=test_fields
+        )
+
+        if content_success and embed_success:
+            print("βœ… Discord test successful! Both content and embed messages sent.")
+            sys.exit(0)
+        else:
+            print("❌ Discord test failed. 
Check webhook URL and permissions.") + sys.exit(1) + + # Initialize monitor + log_file = None if args.no_log_file else args.log_file + monitor = TdarrMonitor( + args.server, + args.timeout, + enable_stuck_detection=args.detect_stuck, + stuck_threshold_minutes=args.stuck_threshold, + memory_file=args.memory_file, + discord_webhook_url=args.discord_webhook, + enable_discord_alerts=args.discord_alerts, + log_file=log_file, + log_level=args.log_level, + clear_hung_workers=args.clear_hung_workers + ) + + # Perform requested check + monitor.logger.info(f"Starting Tdarr monitoring check: {args.check}, stuck_detection={'enabled' if args.detect_stuck else 'disabled'}, clear_workers={'enabled' if args.clear_hung_workers else 'disabled'}") + + result = None + if args.check == 'all': + result = { + 'server_status': monitor.get_server_status(), + 'queue_status': monitor.get_queue_status(), + 'node_status': monitor.get_node_status(), + 'library_status': monitor.get_library_status(), + 'statistics': monitor.get_statistics() + } + elif args.check == 'status': + result = monitor.get_server_status() + elif args.check == 'queue': + result = monitor.get_queue_status() + elif args.check == 'nodes': + result = monitor.get_node_status() + elif args.check == 'libraries': + result = monitor.get_library_status() + elif args.check == 'stats': + result = monitor.get_statistics() + elif args.check == 'health': + result = monitor.health_check() + + # Output results + if args.output == 'json': + # Convert dataclasses to dictionaries for JSON serialization + if args.check == 'all': + json_result = {} + for key, value in result.items(): + json_result[key] = asdict(value) + print(json.dumps(json_result, indent=2)) + else: + print(json.dumps(asdict(result), indent=2)) + else: + # Pretty print format + print(f"=== Tdarr Monitor Results - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} ===") + + if args.check == 'health' or (hasattr(result, 'overall_status') and result.overall_status): + health = result if hasattr(result, 'overall_status') else None + if health: + status = health.overall_status + print(f"Overall Status: {status.upper()}") + + if health.checks: + print("\nHealth Checks:") + for check_name, check_data in health.checks.items(): + status_icon = "βœ“" if check_data.healthy else "βœ—" + print(f" {status_icon} {check_name.title()}: {asdict(check_data)}") + + # Display stuck jobs if present + if args.detect_stuck: + if hasattr(result, 'stuck_jobs') and result.stuck_jobs: + print(f"\n=== STUCK JOBS DETECTED ({len(result.stuck_jobs)}) ===") + for stuck_job in result.stuck_jobs: + print(f"🚨 {stuck_job.worker_snapshot.node_id}:{stuck_job.worker_snapshot.worker_id}") + print(f" File: {stuck_job.worker_snapshot.file}") + print(f" Progress: {stuck_job.worker_snapshot.percentage}%") + print(f" Status: {stuck_job.worker_snapshot.status}") + print(f" Stuck for: {stuck_job.stuck_duration_minutes:.1f} minutes") + print() + elif args.check in ['nodes', 'all']: + # Check all results for stuck jobs if 'all' is selected + stuck_found = False + if args.check == 'all' and isinstance(result, dict): + for section, data in result.items(): + if hasattr(data, 'stuck_jobs') and data.stuck_jobs: + if not stuck_found: + print(f"\n=== STUCK JOBS DETECTED ===") + stuck_found = True + for stuck_job in data.stuck_jobs: + print(f"🚨 {stuck_job.worker_snapshot.node_id}:{stuck_job.worker_snapshot.worker_id}") + print(f" File: {stuck_job.worker_snapshot.file}") + print(f" Progress: {stuck_job.worker_snapshot.percentage}%") + print(f" Status: 
{stuck_job.worker_snapshot.status}") + print(f" Stuck for: {stuck_job.stuck_duration_minutes:.1f} minutes") + print() + + if not stuck_found: + print(f"\nβœ… No stuck jobs detected (threshold: {args.stuck_threshold} minutes)") + + if args.check == 'all': + for section, data in result.items(): + print(f"\n=== {section.replace('_', ' ').title()} ===") + # Don't print stuck_jobs in JSON format as we already displayed them above + if hasattr(data, 'stuck_jobs'): + data_dict = asdict(data) + data_dict.pop('stuck_jobs', None) + print(json.dumps(data_dict, indent=2)) + else: + print(json.dumps(asdict(data), indent=2)) + elif args.check != 'health': + # Don't print stuck_jobs in JSON format as we already displayed them above + if hasattr(result, 'stuck_jobs'): + result_dict = asdict(result) + result_dict.pop('stuck_jobs', None) + print(json.dumps(result_dict, indent=2)) + else: + print(json.dumps(asdict(result), indent=2)) + + # Exit with appropriate code + if result: + # Check for unhealthy status in health check + if isinstance(result, HealthStatus) and result.overall_status == 'unhealthy': + sys.exit(1) + # Check for errors in individual status objects (all status classes except HealthStatus have error attribute) + elif (isinstance(result, (ServerStatus, QueueStatus, NodeStatus, LibraryStatus, StatisticsStatus)) + and result.error): + sys.exit(1) + # Check for errors in 'all' results + elif isinstance(result, dict): + for status_obj in result.values(): + if (isinstance(status_obj, (ServerStatus, QueueStatus, NodeStatus, LibraryStatus, StatisticsStatus)) + and status_obj.error): + sys.exit(1) + + sys.exit(0) + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/scripts/monitoring/windows-desktop/README.md b/monitoring/scripts/windows-desktop/README.md similarity index 100% rename from scripts/monitoring/windows-desktop/README.md rename to monitoring/scripts/windows-desktop/README.md diff --git a/scripts/monitoring/windows-desktop/windows-reboot-monitor.ps1 b/monitoring/scripts/windows-desktop/windows-reboot-monitor.ps1 similarity index 100% rename from scripts/monitoring/windows-desktop/windows-reboot-monitor.ps1 rename to monitoring/scripts/windows-desktop/windows-reboot-monitor.ps1 diff --git a/scripts/monitoring/windows-desktop/windows-reboot-task-shutdown.xml b/monitoring/scripts/windows-desktop/windows-reboot-task-shutdown.xml similarity index 100% rename from scripts/monitoring/windows-desktop/windows-reboot-task-shutdown.xml rename to monitoring/scripts/windows-desktop/windows-reboot-task-shutdown.xml diff --git a/scripts/monitoring/windows-desktop/windows-reboot-task-startup.xml b/monitoring/scripts/windows-desktop/windows-reboot-task-startup.xml similarity index 100% rename from scripts/monitoring/windows-desktop/windows-reboot-task-startup.xml rename to monitoring/scripts/windows-desktop/windows-reboot-task-startup.xml diff --git a/scripts/monitoring/windows-desktop/windows-setup-instructions.md b/monitoring/scripts/windows-desktop/windows-setup-instructions.md similarity index 100% rename from scripts/monitoring/windows-desktop/windows-setup-instructions.md rename to monitoring/scripts/windows-desktop/windows-setup-instructions.md diff --git a/monitoring/troubleshooting.md b/monitoring/troubleshooting.md new file mode 100644 index 0000000..c29ac29 --- /dev/null +++ b/monitoring/troubleshooting.md @@ -0,0 +1,414 @@ +# Monitoring System Troubleshooting Guide + +## Discord Notification Issues + +### Webhook Not Working +**Symptoms**: No Discord messages 
received, connection errors
+**Diagnosis**:
+```bash
+# Test webhook manually
+curl -X POST "$DISCORD_WEBHOOK_URL" \
+  -H "Content-Type: application/json" \
+  -d '{"content": "Test message"}'
+
+# Check webhook URL format
+echo $DISCORD_WEBHOOK_URL | grep -E "https://discord.com/api/webhooks/[0-9]+/.+"
+```
+
+**Solutions**:
+```bash
+# Verify webhook URL is correct
+# Format: https://discord.com/api/webhooks/ID/TOKEN
+
+# Test with minimal payload
+curl -X POST "$DISCORD_WEBHOOK_URL" \
+  -H "Content-Type: application/json" \
+  -d '{"content": "βœ… Webhook working"}'
+
+# Check for JSON formatting issues
+echo '{"content": "test"}' | jq .  # Validate JSON
+```
+
+### Message Formatting Problems
+**Symptoms**: Malformed messages, broken markdown, missing user pings
+**Common Issues**:
+```bash
+# ❌ Broken JSON escaping
+{"content": "Error: "quotes" break JSON"}
+
+# βœ… Proper JSON escaping
+{"content": "Error: \"quotes\" properly escaped"}
+
+# ❌ User ping inside code block (doesn't work)
+{"content": "```\nIssue occurred <@user_id>\n```"}
+
+# βœ… User ping outside code block
+{"content": "```\nIssue occurred\n```\nManual intervention needed <@user_id>"}
+```
+
+## Tdarr Monitoring Issues
+
+### Script Not Running
+**Symptoms**: No monitoring alerts, script execution failures
+**Diagnosis**:
+```bash
+# Check cron job status
+crontab -l | grep tdarr-timeout-monitor
+systemctl status cron
+
+# Run script manually for debugging
+bash -x /path/to/tdarr-timeout-monitor.sh
+
+# Check script permissions
+ls -la /path/to/tdarr-timeout-monitor.sh
+```
+
+**Solutions**:
+```bash
+# Fix script permissions
+chmod +x /path/to/tdarr-timeout-monitor.sh
+
+# Reinstall cron job
+crontab -e
+# Add: */20 * * * * /full/path/to/tdarr-timeout-monitor.sh
+
+# Check script environment
+# Ensure PATH and variables are set correctly in script
+```
+
+### API Connection Failures
+**Symptoms**: Cannot connect to Tdarr server, timeout errors
+**Diagnosis**:
+```bash
+# Test Tdarr API manually
+curl -f "http://tdarr-server:8266/api/v2/status"
+
+# Check network connectivity
+ping tdarr-server
+nc -zv tdarr-server 8266
+
+# Verify SSH access to server
+ssh tdarr "docker ps | grep tdarr"
+```
+
+**Solutions**:
+```bash
+# Update server connection in script
+# Verify server IP and port are correct
+
+# Test API endpoints
+curl "http://10.10.0.43:8265/api/v2/status"  # Web port
+curl "http://10.10.0.43:8266/api/v2/status"  # Server port
+
+# Check Tdarr server logs
+ssh tdarr "docker logs tdarr | tail -20"
+```
+
+## Windows Desktop Monitoring Issues
+
+### PowerShell Script Not Running
+**Symptoms**: No reboot notifications from Windows systems
+**Diagnosis**:
+```powershell
+# Check scheduled task status
+Get-ScheduledTask -TaskName "Reboot*" | Get-ScheduledTaskInfo
+
+# Test script execution manually
+PowerShell -ExecutionPolicy Bypass -File "C:\path\to\windows-reboot-monitor.ps1"
+
+# Check PowerShell execution policy
+Get-ExecutionPolicy
+```
+
+**Solutions**:
+```powershell
+# Set execution policy
+Set-ExecutionPolicy -ExecutionPolicy RemoteSigned -Scope CurrentUser
+
+# Recreate scheduled tasks
+schtasks /Create /XML "C:\path\to\task.xml" /TN "RebootMonitor"
+
+# Check task trigger configuration
+(Get-ScheduledTask -TaskName "RebootMonitor").Triggers
+```
+
+### Network Access from Windows
+**Symptoms**: PowerShell cannot reach Discord webhook
+**Diagnosis**:
+```powershell
+# Test network connectivity
+Test-NetConnection discord.com -Port 443
+
+# Test webhook manually
+Invoke-RestMethod -Uri 
$webhookUrl -Method Post -Body '{"content":"test"}' -ContentType "application/json" + +# Check Windows firewall +Get-NetFirewallRule | Where-Object {$_.DisplayName -like "*PowerShell*"} +``` + +**Solutions**: +```powershell +# Allow PowerShell through firewall +New-NetFirewallRule -DisplayName "PowerShell Outbound" -Direction Outbound -Program "C:\Windows\System32\WindowsPowerShell\v1.0\powershell.exe" -Action Allow + +# Test with simplified request +$body = @{content="Test from Windows"} | ConvertTo-Json +Invoke-RestMethod -Uri $webhookUrl -Method Post -Body $body -ContentType "application/json" +``` + +## Log Management Issues + +### Log Files Growing Too Large +**Symptoms**: Disk space filling up, slow log access +**Diagnosis**: +```bash +# Check log file sizes +du -sh /var/log/homelab-* +du -sh /tmp/*monitor*.log + +# Check available disk space +df -h /var/log +df -h /tmp +``` + +**Solutions**: +```bash +# Implement log rotation +cat > /etc/logrotate.d/homelab-monitoring << 'EOF' +/var/log/homelab-*.log { + daily + missingok + rotate 7 + compress + notifempty + create 644 root root +} +EOF + +# Manual log cleanup +find /tmp -name "*monitor*.log" -size +10M -delete +truncate -s 0 /tmp/large-log-file.log +``` + +### Log Rotation Not Working +**Symptoms**: Old logs not being cleaned up +**Diagnosis**: +```bash +# Check logrotate status +systemctl status logrotate +cat /var/lib/logrotate/status + +# Test logrotate configuration +logrotate -d /etc/logrotate.d/homelab-monitoring +``` + +**Solutions**: +```bash +# Force log rotation +logrotate -f /etc/logrotate.d/homelab-monitoring + +# Fix logrotate configuration +sudo nano /etc/logrotate.d/homelab-monitoring +# Verify syntax and permissions +``` + +## Cron Job Issues + +### Scheduled Tasks Not Running +**Symptoms**: Scripts not executing at scheduled times +**Diagnosis**: +```bash +# Check cron service +systemctl status cron +systemctl status crond # RHEL/CentOS + +# View cron logs +grep CRON /var/log/syslog +journalctl -u cron + +# List all cron jobs +crontab -l +sudo crontab -l # System crontab +``` + +**Solutions**: +```bash +# Restart cron service +sudo systemctl restart cron + +# Fix cron job syntax +# Ensure absolute paths are used +# Example: */20 * * * * /full/path/to/script.sh + +# Check script permissions and execution +ls -la /path/to/script.sh +/path/to/script.sh # Test manual execution +``` + +### Environment Variables in Cron +**Symptoms**: Scripts work manually but fail in cron +**Diagnosis**: +```bash +# Create test cron job to check environment +* * * * * env > /tmp/cron-env.txt + +# Compare with shell environment +env > /tmp/shell-env.txt +diff /tmp/shell-env.txt /tmp/cron-env.txt +``` + +**Solutions**: +```bash +# Set PATH in crontab +PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin + +# Or set PATH in script +#!/bin/bash +export PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin" + +# Source environment if needed +source /etc/environment +``` + +## Network Monitoring Issues + +### False Positives +**Symptoms**: Alerts for services that are actually working +**Diagnosis**: +```bash +# Test monitoring checks manually +curl -sSf --max-time 10 "https://service.homelab.local" +ping -c1 -W5 10.10.0.100 + +# Check for intermittent network issues +for i in {1..10}; do ping -c1 host || echo "Fail $i"; done +``` + +**Solutions**: +```bash +# Adjust timeout values +curl --max-time 30 "$service" # Increase timeout + +# Add retry logic +for retry in {1..3}; do + if curl -sSf "$service" >/dev/null 
2>&1; then + break + elif [ $retry -eq 3 ]; then + send_alert "Service $service failed after 3 retries" + fi + sleep 5 +done +``` + +### Missing Alerts +**Symptoms**: Real failures not triggering notifications +**Diagnosis**: +```bash +# Verify monitoring script logic +bash -x monitoring-script.sh + +# Check if services are actually down +systemctl status service-name +curl -v service-url +``` + +**Solutions**: +```bash +# Lower detection thresholds +# Increase monitoring frequency +# Add redundant monitoring methods + +# Test alert mechanism +echo "Test alert" | send_alert_function +``` + +## System Resource Issues + +### Monitoring Overhead +**Symptoms**: High CPU/memory usage from monitoring scripts +**Diagnosis**: +```bash +# Monitor the monitoring scripts +top -p $(pgrep -f monitor) +ps aux | grep monitor + +# Check monitoring frequency +crontab -l | grep monitor +``` + +**Solutions**: +```bash +# Reduce monitoring frequency +# Change from */1 to */5 minutes + +# Optimize scripts +# Remove unnecessary commands +# Use efficient tools (prefer curl over wget, etc.) + +# Add resource limits +timeout 30 monitoring-script.sh +``` + +## Emergency Recovery + +### Complete Monitoring Failure +**Recovery Steps**: +```bash +# Restart all monitoring services +sudo systemctl restart cron +sudo systemctl restart rsyslog + +# Reinstall monitoring scripts +cd /path/to/scripts +./install-monitoring.sh + +# Test all components +./test-monitoring.sh +``` + +### Discord Integration Lost +**Quick Recovery**: +```bash +# Test webhook +curl -X POST "$BACKUP_WEBHOOK_URL" -H "Content-Type: application/json" -d '{"content": "Monitoring restored"}' + +# Switch to backup webhook if needed +export DISCORD_WEBHOOK_URL="$BACKUP_WEBHOOK_URL" +``` + +## Prevention and Best Practices + +### Monitoring Health Checks +```bash +#!/bin/bash +# monitor-the-monitors.sh +MONITORING_SCRIPTS="/path/to/tdarr-monitor.sh /path/to/network-monitor.sh" + +for script in $MONITORING_SCRIPTS; do + if [ ! -x "$script" ]; then + echo "ALERT: $script not executable" | send_alert + fi + + # Check if script has run recently + if [ $(($(date +%s) - $(stat -c %Y "$script.last_run" 2>/dev/null || echo 0))) -gt 3600 ]; then + echo "ALERT: $script hasn't run in over an hour" | send_alert + fi +done +``` + +### Backup Alerting Channels +```bash +# Multiple notification methods +send_alert() { + local message="$1" + + # Primary: Discord + curl -X POST "$DISCORD_WEBHOOK" -d "{\"content\":\"$message\"}" || \ + # Backup: Email + echo "$message" | mail -s "Homelab Alert" admin@domain.com || \ + # Last resort: Local log + echo "$(date): $message" >> /var/log/critical-alerts.log +} +``` + +This troubleshooting guide covers the most common monitoring system issues and provides systematic recovery procedures. \ No newline at end of file diff --git a/networking/CONTEXT.md b/networking/CONTEXT.md new file mode 100644 index 0000000..c348574 --- /dev/null +++ b/networking/CONTEXT.md @@ -0,0 +1,309 @@ +# Networking Infrastructure - Technology Context + +## Overview +Home lab networking infrastructure with focus on reverse proxy configuration, SSL/TLS management, SSH key management, and network security. This context covers service discovery, load balancing, and performance optimization patterns. 
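+
+A recurring task the sections below assume is keeping an eye on certificate expiry for the
+proxied services. A minimal sketch - the hostnames are illustrative examples from this lab
+and the warning threshold is an assumption, not a requirement:
+
+```bash
+#!/bin/bash
+# cert-expiry-check.sh - warn when a service certificate is close to expiry
+HOSTS="homelab.local proxmox.homelab.local nas.homelab.local"  # example hosts
+WARN_DAYS=21
+
+for host in $HOSTS; do
+    # Pull the certificate's notAfter date from a live TLS handshake
+    expiry=$(echo | openssl s_client -connect "$host:443" -servername "$host" 2>/dev/null \
+        | openssl x509 -noout -enddate | cut -d= -f2)
+    if [ -z "$expiry" ]; then
+        echo "❌ $host: could not read certificate"
+        continue
+    fi
+    days_left=$(( ($(date -d "$expiry" +%s) - $(date +%s)) / 86400 ))
+    if [ "$days_left" -lt "$WARN_DAYS" ]; then
+        echo "⚠️ $host: certificate expires in $days_left days"
+    else
+        echo "βœ… $host: valid for $days_left days"
+    fi
+done
+```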
+ +## Architecture Patterns + +### Reverse Proxy and Load Balancing +**Pattern**: Centralized traffic management with SSL termination +```nginx +# Nginx reverse proxy pattern +upstream backend { + server 10.10.0.100:3000; + server 10.10.0.101:3000; + keepalive 32; +} + +server { + listen 443 ssl http2; + server_name myapp.homelab.local; + + ssl_certificate /etc/ssl/certs/homelab.crt; + ssl_certificate_key /etc/ssl/private/homelab.key; + + location / { + proxy_pass http://backend; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + } +} +``` + +### Network Segmentation Strategy +**Pattern**: VLAN-based isolation with controlled inter-VLAN routing +``` +Management VLAN: 10.10.0.x/24 # VM management, SSH access +Services VLAN: 10.10.1.x/24 # Application services +Storage VLAN: 10.10.2.x/24 # NAS, backup traffic +DMZ VLAN: 10.10.10.x/24 # External-facing services +``` + +## SSH Key Management + +### Centralized Key Distribution +**Pattern**: Automated SSH key deployment with emergency backup +```bash +# Primary access key +~/.ssh/homelab_rsa # Daily operations key + +# Emergency access key +~/.ssh/emergency_homelab_rsa # Backup recovery key + +# Automated deployment +for host in $(cat hosts.txt); do + ssh-copy-id -i ~/.ssh/homelab_rsa.pub user@$host + ssh-copy-id -i ~/.ssh/emergency_homelab_rsa.pub user@$host +done +``` + +### Key Lifecycle Management +**Pattern**: Regular rotation with zero-downtime deployment +1. **Generation**: Create new key pairs annually +2. **Distribution**: Deploy to all managed systems +3. **Verification**: Test connectivity with new keys +4. **Rotation**: Remove old keys after verification +5. **Backup**: Store keys in secure, recoverable location + +## Service Discovery and DNS + +### Local DNS Resolution +**Pattern**: Internal DNS for service discovery +```bind +# Home lab DNS zones +homelab.local. IN A 10.10.0.16 # DNS server +proxmox.homelab.local. IN A 10.10.0.10 # Hypervisor +nas.homelab.local. IN A 10.10.0.20 # Storage +tdarr.homelab.local. 
IN A 10.10.0.43   # Media server
+```
+
+### Container Service Discovery
+**Pattern**: Docker network-based service resolution
+```yaml
+# Docker Compose service discovery
+version: "3.8"
+services:
+  web:
+    networks:
+      - frontend
+      - backend
+  api:
+    networks:
+      - backend
+      - database
+  db:
+    networks:
+      - database
+
+networks:
+  frontend:
+    driver: bridge
+  backend:
+    driver: bridge
+  database:
+    driver: bridge
+    internal: true  # No external access
+```
+
+## Security Patterns
+
+### SSH Security Hardening
+**Configuration**: Secure SSH server setup
+```sshd_config
+# /etc/ssh/sshd_config.d/99-homelab-security.conf
+PasswordAuthentication no
+PubkeyAuthentication yes
+PermitRootLogin no
+AllowUsers cal
+Protocol 2
+ClientAliveInterval 300
+ClientAliveCountMax 2
+MaxAuthTries 3
+X11Forwarding no
+```
+
+### Network Access Control
+**Pattern**: Firewall-based service protection
+```bash
+# ufw firewall rules
+ufw default deny incoming
+ufw default allow outgoing
+ufw allow ssh
+ufw allow from 10.10.0.0/24 to any port 22
+ufw allow from 10.10.0.0/24 to any port 80
+ufw allow from 10.10.0.0/24 to any port 443
+```
+
+### SSL/TLS Certificate Management
+**Pattern**: Automated certificate lifecycle
+```bash
+# Let's Encrypt automation
+certbot certonly --nginx \
+  --email admin@homelab.local \
+  --agree-tos \
+  --domains homelab.local
+# Wildcard names (*.homelab.local) need the DNS-01 challenge via a
+# certbot DNS plugin; the nginx plugin only validates named hosts.
+
+# Certificate renewal automation
+0 2 * * * certbot renew --quiet && systemctl reload nginx
+```
+
+## Performance Optimization
+
+### Connection Management
+**Pattern**: Optimized connection handling
+```nginx
+# Nginx performance tuning
+worker_processes auto;
+worker_connections 1024;
+
+keepalive_timeout 65;
+keepalive_requests 1000;
+
+gzip on;
+gzip_vary on;
+gzip_types text/plain text/css application/json application/javascript;
+
+# Connection pooling
+upstream backend {
+    server 10.10.0.100:3000 max_fails=3 fail_timeout=30s;
+    keepalive 32;
+}
+```
+
+### Caching Strategies
+**Pattern**: Multi-level caching architecture
+```nginx
+# Static content caching
+location ~* \.(jpg|jpeg|png|gif|ico|css|js)$ {
+    expires 1y;
+    add_header Cache-Control "public, immutable";
+}
+
+# Proxy caching
+proxy_cache_path /var/cache/nginx levels=1:2 keys_zone=app_cache:10m;
+proxy_cache app_cache;
+proxy_cache_valid 200 302 10m;
+```
+
+## Network Storage Integration
+
+### CIFS/SMB Mount Resilience
+**Pattern**: Robust network filesystem mounting
+```fstab
+# Note: /etc/fstab does not support backslash line continuations - the entry must be a single line
+//nas.homelab.local/media /mnt/media cifs credentials=/etc/cifs/credentials,uid=1000,gid=1000,file_mode=0644,dir_mode=0755,iocharset=utf8,cache=strict,actimeo=30,_netdev,reconnect,soft,rsize=1048576,wsize=1048576 0 0
+```
+
+## Monitoring and Observability
+
+### Network Health Monitoring
+**Pattern**: Automated connectivity verification
+```bash
+#!/bin/bash
+# network-health-check.sh
+HOSTS="10.10.0.10 10.10.0.20 10.10.0.43"
+DNS_SERVERS="10.10.0.16 8.8.8.8"
+
+for host in $HOSTS; do
+    if ping -c1 -W5 $host >/dev/null 2>&1; then
+        echo "βœ… $host: Reachable"
+    else
+        echo "❌ $host: Unreachable"
+    fi
+done
+
+for dns in $DNS_SERVERS; do
+    if nslookup google.com $dns >/dev/null 2>&1; then
+        echo "βœ… DNS $dns: Working"
+    else
+        echo "❌ DNS $dns: Failed"
+    fi
+done
+```
+
+### Service Availability Monitoring
+**Pattern**: HTTP/HTTPS endpoint monitoring
+```bash
+# Service health check
+SERVICES="https://homelab.local http://proxmox.homelab.local:8006"
+
+for service in $SERVICES; do
+    if curl -sSf --max-time 10 "$service" >/dev/null 2>&1; then
+        echo "βœ… 
$service: Available" + else + echo "❌ $service: Unavailable" + fi +done +``` + +## Common Integration Patterns + +### Reverse Proxy with Docker +**Pattern**: Container service exposure +```nginx +# Dynamic service discovery with Docker +location /api/ { + proxy_pass http://api-container:3000/; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; +} + +location /web/ { + proxy_pass http://web-container:8080/; + proxy_set_header Upgrade $http_upgrade; + proxy_set_header Connection "upgrade"; # WebSocket support +} +``` + +### VPN Integration +**Pattern**: Secure remote access +```openvpn +# OpenVPN server configuration +port 1194 +proto udp +dev tun +ca ca.crt +cert server.crt +key server.key +dh dh.pem +server 10.8.0.0 255.255.255.0 +push "route 10.10.0.0 255.255.0.0" # Home lab networks +keepalive 10 120 +``` + +## Best Practices + +### Security Implementation +1. **SSH Keys Only**: Disable password authentication everywhere +2. **Network Segmentation**: Use VLANs for isolation +3. **Certificate Management**: Automate SSL/TLS certificate lifecycle +4. **Access Control**: Implement least-privilege networking +5. **Monitoring**: Continuous network and service monitoring + +### Performance Optimization +1. **Connection Pooling**: Reuse connections for efficiency +2. **Caching**: Implement multi-level caching strategies +3. **Compression**: Enable gzip for reduced bandwidth +4. **Keep-Alives**: Optimize connection persistence +5. **CDN Strategy**: Cache static content effectively + +### Operational Excellence +1. **Documentation**: Maintain network topology documentation +2. **Automation**: Script routine network operations +3. **Backup**: Regular configuration backups +4. **Testing**: Regular connectivity and performance testing +5. **Change Management**: Controlled network configuration changes + +This technology context provides comprehensive guidance for implementing robust networking infrastructure in home lab environments. 
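+
+### Scheduling the Health Checks
+The monitoring snippets above are written with cron in mind. A minimal sketch of wiring
+one up with overlap protection; the install path is an assumption, adjust to your layout:
+
+```bash
+#!/bin/bash
+# install-network-monitoring.sh - register the health check in the crontab
+SCRIPT="/usr/local/bin/network-health-check.sh"   # assumed install location
+LOCK="/var/lock/network-health-check.lock"
+
+chmod +x "$SCRIPT"
+
+# flock -n skips a run if the previous one is still hanging on a dead host
+CRON_LINE="*/15 * * * * /usr/bin/flock -n $LOCK $SCRIPT 2>&1 | logger -t network-health"
+
+# Drop any existing entry for the script, then append the new one
+( crontab -l 2>/dev/null | grep -vF "$SCRIPT"; echo "$CRON_LINE" ) | crontab -
+echo "βœ… Installed: $CRON_LINE"
+```
+
+Output lands in syslog via logger, so `grep network-health /var/log/syslog` shows both
+scheduled runs and failures without managing a separate log file.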
\ No newline at end of file diff --git a/reference/networking/cifs-mount-resilience-fixes.md b/networking/examples/cifs-mount-resilience-fixes.md similarity index 100% rename from reference/networking/cifs-mount-resilience-fixes.md rename to networking/examples/cifs-mount-resilience-fixes.md diff --git a/reference/networking/nas-mount-configuration.md b/networking/examples/nas-mount-configuration.md similarity index 100% rename from reference/networking/nas-mount-configuration.md rename to networking/examples/nas-mount-configuration.md diff --git a/reference/storage/network-filesystem-limitations.md b/networking/examples/network-filesystem-limitations.md similarity index 100% rename from reference/storage/network-filesystem-limitations.md rename to networking/examples/network-filesystem-limitations.md diff --git a/examples/networking/nginx-config.md b/networking/examples/nginx-config.md similarity index 100% rename from examples/networking/nginx-config.md rename to networking/examples/nginx-config.md diff --git a/networking/examples/security_improvements.md b/networking/examples/security_improvements.md new file mode 100644 index 0000000..9dbe5a0 --- /dev/null +++ b/networking/examples/security_improvements.md @@ -0,0 +1,99 @@ +# Home Lab Security Improvements + +## Current Security Issues + +### Critical Issues Found: +- **Password Authentication**: All servers using password-based SSH authentication +- **Credential Reuse**: Same password used across 7 home network servers +- **Insecure Storage**: Passwords stored in FileZilla (base64 encoded, not encrypted) +- **Root Access**: Cloud servers using root user accounts + +### Risk Assessment: +- **High**: Password-based authentication vulnerable to brute force attacks +- **High**: Shared passwords create single point of failure +- **Medium**: FileZilla credentials accessible to anyone with file system access +- **Medium**: Root access increases attack surface + +## Implemented Solutions + +### 1. SSH Key-Based Authentication +- **Generated separate key pairs** for home lab vs cloud servers +- **4096-bit RSA keys** for strong encryption +- **Descriptive key comments** for identification + +### 2. SSH Configuration Management +- **Centralized config** in `~/.ssh/config` +- **Host aliases** for easy server access +- **Port forwarding** pre-configured for common services +- **Security defaults** (ServerAliveInterval, StrictHostKeyChecking) + +### 3. Network Segmentation +- **Home network** (10.10.0.0/24) uses dedicated key +- **Cloud servers** use separate key pair +- **Service-specific aliases** for different server roles + +## Additional Security Recommendations + +### Immediate Actions: +1. **Deploy SSH keys** using the provided script +2. **Test key-based authentication** on all servers +3. **Disable password authentication** once keys work +4. 
**Remove FileZilla passwords** after migration + +### Server Hardening: +```bash +# On each server, edit /etc/ssh/sshd_config: +PasswordAuthentication no +PubkeyAuthentication yes +PermitRootLogin no # (create non-root user on cloud servers first) +Port 2222 # Change default SSH port +AllowUsers cal # Restrict SSH access +``` + +### Monitoring: +- **SSH login monitoring** with fail2ban +- **Key rotation schedule** (annually) +- **Access logging** review + +### Future Enhancements: +- **Certificate-based authentication** (SSH CA) +- **Multi-factor authentication** (TOTP) +- **VPN access** for home network +- **Bastion host** for cloud servers + +## Migration Plan + +### Phase 1: Key Deployment βœ… +- [x] Generate SSH key pairs +- [x] Create SSH configuration +- [x] Document server inventory + +### Phase 2: Authentication Migration +- [ ] Deploy public keys to all servers +- [ ] Test SSH connections with keys +- [ ] Verify all services accessible + +### Phase 3: Security Lockdown +- [ ] Disable password authentication +- [ ] Change default SSH ports +- [ ] Configure fail2ban +- [ ] Remove FileZilla credentials + +### Phase 4: Monitoring & Maintenance +- [ ] Set up access logging +- [ ] Schedule key rotation +- [ ] Document incident response + +## Connection Examples + +After setup, you'll connect using simple aliases: +```bash +# Instead of: ssh cal@10.10.0.42 +ssh database-apis + +# Instead of: ssh root@172.237.147.99 +ssh akamai + +# With automatic port forwarding: +ssh pihole # Forwards port 8080 β†’ localhost:80 +``` \ No newline at end of file diff --git a/networking/examples/server_inventory.yaml b/networking/examples/server_inventory.yaml new file mode 100644 index 0000000..8f0dd87 --- /dev/null +++ b/networking/examples/server_inventory.yaml @@ -0,0 +1,70 @@ +--- +# Home Lab Server Inventory +# Generated from FileZilla configuration + +home_network: + subnet: "10.10.0.0/24" + servers: + database_apis: + hostname: "10.10.0.42" + port: 22 + user: "cal" + services: ["database", "api"] + description: "Database and API services" + + discord_bots: + hostname: "10.10.0.33" + port: 22 + user: "cal" + services: ["discord", "bots"] + description: "Discord bot hosting" + + home_docker: + hostname: "10.10.0.124" + port: 22 + user: "cal" + services: ["docker", "containers"] + description: "Main Docker container host" + + pihole: + hostname: "10.10.0.16" + port: 22 + user: "cal" + services: ["dns", "adblock"] + description: "Pi-hole DNS and ad blocking" + + sba_pd_bots: + hostname: "10.10.0.88" + port: 22 + user: "cal" + services: ["bots", "automation"] + description: "SBa and PD bot services" + + tdarr: + hostname: "10.10.0.43" + port: 22 + user: "cal" + services: ["media", "transcoding"] + description: "Tdarr media transcoding" + + vpn_docker: + hostname: "10.10.0.121" + port: 22 + user: "cal" + services: ["vpn", "docker"] + description: "VPN and Docker services" + +remote_servers: + akamai_nano: + hostname: "172.237.147.99" + port: 22 + user: "root" + provider: "akamai" + description: "Akamai cloud nano instance" + + vultr_host: + hostname: "45.76.25.231" + port: 22 + user: "root" + provider: "vultr" + description: "Vultr cloud host" \ No newline at end of file diff --git a/examples/networking/ssh-homelab-setup.md b/networking/examples/ssh-homelab-setup.md similarity index 100% rename from examples/networking/ssh-homelab-setup.md rename to networking/examples/ssh-homelab-setup.md diff --git a/patterns/networking/ssh-key-management.md b/networking/examples/ssh-key-management.md similarity index 
100% rename from patterns/networking/ssh-key-management.md rename to networking/examples/ssh-key-management.md diff --git a/reference/networking/ssh-troubleshooting.md b/networking/examples/ssh-troubleshooting.md similarity index 100% rename from reference/networking/ssh-troubleshooting.md rename to networking/examples/ssh-troubleshooting.md diff --git a/reference/networking/troubleshooting.md b/networking/examples/troubleshooting.md similarity index 100% rename from reference/networking/troubleshooting.md rename to networking/examples/troubleshooting.md diff --git a/networking/scripts/ssh_key_maintenance.sh b/networking/scripts/ssh_key_maintenance.sh new file mode 100755 index 0000000..c7ccb61 --- /dev/null +++ b/networking/scripts/ssh_key_maintenance.sh @@ -0,0 +1,114 @@ +#!/bin/bash +# SSH Key Maintenance and Backup Script +# Run this periodically to maintain key security + +echo "πŸ”§ SSH Key Maintenance and Backup" + +# Check if NAS is mounted +if [ ! -d "/mnt/NV2" ]; then + echo "❌ ERROR: NAS not mounted at /mnt/NV2" + exit 1 +fi + +# Create timestamp +TIMESTAMP=$(date +%Y%m%d-%H%M%S) +BACKUP_ROOT="/mnt/NV2/ssh-keys" +BACKUP_DIR="$BACKUP_ROOT/maintenance-$TIMESTAMP" + +# Ensure backup directory structure +mkdir -p "$BACKUP_DIR" +chmod 700 "$BACKUP_DIR" + +echo "πŸ“ Creating maintenance backup in: $BACKUP_DIR" + +# Backup current keys and config +cp ~/.ssh/*_rsa* "$BACKUP_DIR/" 2>/dev/null || true +cp ~/.ssh/config "$BACKUP_DIR/" 2>/dev/null || true +cp ~/.ssh/known_hosts "$BACKUP_DIR/" 2>/dev/null || true + +# Check key ages and recommend rotation +echo "" +echo "πŸ” Key Age Analysis:" +for key in ~/.ssh/*_rsa; do + if [ -f "$key" ]; then + age_days=$(( ($(date +%s) - $(stat -c %Y "$key")) / 86400 )) + basename_key=$(basename "$key") + + if [ $age_days -gt 365 ]; then + echo "⚠️ $basename_key: $age_days days old - ROTATION RECOMMENDED" + elif [ $age_days -gt 180 ]; then + echo "⚑ $basename_key: $age_days days old - consider rotation" + else + echo "βœ… $basename_key: $age_days days old - OK" + fi + fi +done + +# Test key accessibility +echo "" +echo "πŸ” Testing Key Access:" +for key in ~/.ssh/*_rsa; do + if [ -f "$key" ]; then + basename_key=$(basename "$key") + if ssh-keygen -l -f "$key" >/dev/null 2>&1; then + echo "βœ… $basename_key: Valid and readable" + else + echo "❌ $basename_key: CORRUPTED or unreadable" + fi + fi +done + +# Clean up old backups (keep last 10) +echo "" +echo "🧹 Cleaning old backups (keeping last 10):" +cd "$BACKUP_ROOT" +ls -dt backup-* maintenance-* 2>/dev/null | tail -n +11 | while read old_backup; do + if [ -d "$old_backup" ]; then + echo "πŸ—‘οΈ Removing old backup: $old_backup" + rm -rf "$old_backup" + fi +done + +# Generate maintenance report +cat > "$BACKUP_DIR/MAINTENANCE_REPORT.md" << EOF +# SSH Key Maintenance Report +Generated: $(date) +Host: $(hostname) +User: $(whoami) + +## Backup Location +$BACKUP_DIR + +## Key Inventory +$(ls -la ~/.ssh/*_rsa* 2>/dev/null || echo "No SSH keys found") + +## SSH Config Status +$(if [ -f ~/.ssh/config ]; then echo "SSH config exists: ~/.ssh/config"; else echo "No SSH config found"; fi) + +## Server Connection Tests +Run these commands to verify connectivity: + +### Primary Keys: +ssh -o ConnectTimeout=5 database-apis 'echo "DB APIs: OK"' +ssh -o ConnectTimeout=5 pihole 'echo "PiHole: OK"' +ssh -o ConnectTimeout=5 akamai 'echo "Akamai: OK"' + +### Emergency Keys (if deployed): +ssh -i ~/.ssh/emergency_homelab_rsa -o ConnectTimeout=5 cal@10.10.0.16 'echo "Emergency Home: OK"' +ssh -i ~/.ssh/emergency_cloud_rsa 
-o ConnectTimeout=5 root@172.237.147.99 'echo "Emergency Cloud: OK"' + +## Next Maintenance Due +$(date -d '+3 months') + +## Key Rotation Schedule +- Home lab keys: Annual (generated $(date -r ~/.ssh/homelab_rsa 2>/dev/null || echo "Not found")) +- Cloud keys: Annual (generated $(date -r ~/.ssh/cloud_servers_rsa 2>/dev/null || echo "Not found")) +- Emergency keys: Bi-annual + +EOF + +echo "βœ… Maintenance backup completed" +echo "πŸ“„ Report saved: $BACKUP_DIR/MAINTENANCE_REPORT.md" +echo "" +echo "πŸ’‘ Schedule this script to run monthly via cron:" +echo " 0 2 1 * * /path/to/ssh_key_maintenance.sh" \ No newline at end of file diff --git a/networking/troubleshooting.md b/networking/troubleshooting.md new file mode 100644 index 0000000..35465ef --- /dev/null +++ b/networking/troubleshooting.md @@ -0,0 +1,496 @@ +# Networking Infrastructure Troubleshooting Guide + +## SSH Connection Issues + +### SSH Authentication Failures +**Symptoms**: Permission denied, connection refused, timeout +**Diagnosis**: +```bash +# Verbose SSH debugging +ssh -vvv user@host + +# Test different authentication methods +ssh -o PasswordAuthentication=no user@host +ssh -o PubkeyAuthentication=yes user@host + +# Check local key files +ls -la ~/.ssh/ +ssh-keygen -lf ~/.ssh/homelab_rsa.pub +``` + +**Solutions**: +```bash +# Re-deploy SSH keys +ssh-copy-id -i ~/.ssh/homelab_rsa.pub user@host +ssh-copy-id -i ~/.ssh/emergency_homelab_rsa.pub user@host + +# Fix key permissions +chmod 600 ~/.ssh/homelab_rsa +chmod 644 ~/.ssh/homelab_rsa.pub +chmod 700 ~/.ssh + +# Verify remote authorized_keys +ssh user@host 'chmod 700 ~/.ssh && chmod 600 ~/.ssh/authorized_keys' +``` + +### SSH Service Issues +**Symptoms**: Connection refused, service not running +**Diagnosis**: +```bash +# Check SSH service status +systemctl status sshd +ss -tlnp | grep :22 + +# Test port connectivity +nc -zv host 22 +nmap -p 22 host +``` + +**Solutions**: +```bash +# Restart SSH service +sudo systemctl restart sshd +sudo systemctl enable sshd + +# Check firewall +sudo ufw status +sudo ufw allow ssh + +# Verify SSH configuration +sudo sshd -T | grep -E "(passwordauth|pubkeyauth|permitroot)" +``` + +## Network Connectivity Problems + +### Basic Network Troubleshooting +**Symptoms**: Cannot reach hosts, timeouts, routing issues +**Diagnosis**: +```bash +# Basic connectivity tests +ping host +traceroute host +mtr host + +# Check local network configuration +ip addr show +ip route show +cat /etc/resolv.conf +``` + +**Solutions**: +```bash +# Restart networking +sudo systemctl restart networking +sudo netplan apply # Ubuntu + +# Reset network interface +sudo ip link set eth0 down +sudo ip link set eth0 up + +# Check default gateway +sudo ip route add default via 10.10.0.1 +``` + +### DNS Resolution Issues +**Symptoms**: Cannot resolve hostnames, slow resolution +**Diagnosis**: +```bash +# Test DNS resolution +nslookup google.com +dig google.com +host google.com + +# Check DNS servers +systemd-resolve --status +cat /etc/resolv.conf +``` + +**Solutions**: +```bash +# Temporary DNS fix +echo "nameserver 8.8.8.8" | sudo tee /etc/resolv.conf + +# Restart DNS services +sudo systemctl restart systemd-resolved + +# Flush DNS cache +sudo systemd-resolve --flush-caches +``` + +## Reverse Proxy and Load Balancer Issues + +### Nginx Configuration Problems +**Symptoms**: 502 Bad Gateway, 503 Service Unavailable, SSL errors +**Diagnosis**: +```bash +# Check Nginx status and logs +systemctl status nginx +sudo tail -f /var/log/nginx/error.log +sudo tail -f 
/var/log/nginx/access.log + +# Test Nginx configuration +sudo nginx -t +sudo nginx -T # Show full configuration +``` + +**Solutions**: +```bash +# Reload Nginx configuration +sudo nginx -s reload + +# Check upstream servers +curl -I http://backend-server:port +telnet backend-server port + +# Fix common configuration issues +sudo nano /etc/nginx/sites-available/default +# Check proxy_pass URLs, upstream definitions +``` + +### SSL/TLS Certificate Issues +**Symptoms**: Certificate warnings, expired certificates, connection errors +**Diagnosis**: +```bash +# Check certificate validity +openssl s_client -connect host:443 -servername host +openssl x509 -in /etc/ssl/certs/cert.pem -text -noout + +# Check certificate expiry +openssl x509 -in /etc/ssl/certs/cert.pem -noout -dates +``` + +**Solutions**: +```bash +# Renew Let's Encrypt certificates +sudo certbot renew --dry-run +sudo certbot renew --force-renewal + +# Generate self-signed certificate +sudo openssl req -x509 -nodes -days 365 -newkey rsa:2048 \ + -keyout /etc/ssl/private/selfsigned.key \ + -out /etc/ssl/certs/selfsigned.crt +``` + +## Network Storage Issues + +### CIFS/SMB Mount Problems +**Symptoms**: Mount failures, connection timeouts, permission errors +**Diagnosis**: +```bash +# Test SMB connectivity +smbclient -L //nas-server -U username +testparm # Test Samba configuration + +# Check mount status +mount | grep cifs +df -h | grep cifs +``` + +**Solutions**: +```bash +# Remount with verbose logging +sudo mount -t cifs //server/share /mnt/point -o username=user,password=pass,vers=3.0 + +# Fix mount options in /etc/fstab +//server/share /mnt/point cifs credentials=/etc/cifs/credentials,uid=1000,gid=1000,iocharset=utf8,file_mode=0644,dir_mode=0755,cache=strict,_netdev 0 0 + +# Test credentials +sudo cat /etc/cifs/credentials +# Should contain: username=, password=, domain= +``` + +### NFS Mount Issues +**Symptoms**: Stale file handles, mount hangs, permission denied +**Diagnosis**: +```bash +# Check NFS services +systemctl status nfs-client.target +showmount -e nfs-server + +# Test NFS connectivity +rpcinfo -p nfs-server +``` + +**Solutions**: +```bash +# Restart NFS services +sudo systemctl restart nfs-client.target + +# Remount NFS shares +sudo umount /mnt/nfs-share +sudo mount -t nfs server:/path /mnt/nfs-share + +# Fix stale file handles +sudo umount -f /mnt/nfs-share +sudo mount /mnt/nfs-share +``` + +## Firewall and Security Issues + +### Port Access Problems +**Symptoms**: Connection refused, filtered ports, blocked services +**Diagnosis**: +```bash +# Check firewall status +sudo ufw status verbose +sudo iptables -L -n -v + +# Test port accessibility +nc -zv host port +nmap -p port host +``` + +**Solutions**: +```bash +# Open required ports +sudo ufw allow ssh +sudo ufw allow 80/tcp +sudo ufw allow 443/tcp +sudo ufw allow from 10.10.0.0/24 + +# Reset firewall if needed +sudo ufw --force reset +sudo ufw enable +``` + +### Network Security Issues +**Symptoms**: Unauthorized access, suspicious traffic, security alerts +**Diagnosis**: +```bash +# Check active connections +ss -tuln +netstat -tuln + +# Review logs for security events +sudo tail -f /var/log/auth.log +sudo tail -f /var/log/syslog | grep -i security +``` + +**Solutions**: +```bash +# Block suspicious IPs +sudo ufw deny from suspicious-ip + +# Update SSH security +sudo nano /etc/ssh/sshd_config +# Set: PasswordAuthentication no, PermitRootLogin no +sudo systemctl restart sshd +``` + +## Service Discovery and DNS Issues + +### Local DNS Problems +**Symptoms**: Services 
unreachable by hostname, DNS timeouts +**Diagnosis**: +```bash +# Test local DNS resolution +nslookup service.homelab.local +dig @10.10.0.16 service.homelab.local + +# Check DNS server status +systemctl status bind9 # or named +``` + +**Solutions**: +```bash +# Add to /etc/hosts as temporary fix +echo "10.10.0.100 service.homelab.local" | sudo tee -a /etc/hosts + +# Restart DNS services +sudo systemctl restart bind9 +sudo systemctl restart systemd-resolved +``` + +### Container Networking Issues +**Symptoms**: Containers cannot communicate, service discovery fails +**Diagnosis**: +```bash +# Check Docker networks +docker network ls +docker network inspect bridge + +# Test container connectivity +docker exec container1 ping container2 +docker exec container1 nslookup container2 +``` + +**Solutions**: +```bash +# Create custom network +docker network create --driver bridge app-network +docker run --network app-network container + +# Fix DNS in containers +docker run --dns 8.8.8.8 container +``` + +## Performance Issues + +### Network Latency Problems +**Symptoms**: Slow response times, timeouts, poor performance +**Diagnosis**: +```bash +# Measure network latency +ping -c 100 host +mtr --report host + +# Check network interface stats +ip -s link show +cat /proc/net/dev +``` + +**Solutions**: +```bash +# Optimize network settings +echo 'net.core.rmem_max = 134217728' | sudo tee -a /etc/sysctl.conf +echo 'net.core.wmem_max = 134217728' | sudo tee -a /etc/sysctl.conf +sudo sysctl -p + +# Check for network congestion +iftop +nethogs +``` + +### Bandwidth Issues +**Symptoms**: Slow transfers, network congestion, dropped packets +**Diagnosis**: +```bash +# Test bandwidth +iperf3 -s # Server +iperf3 -c server-ip # Client + +# Check interface utilization +vnstat -i eth0 +``` + +**Solutions**: +```bash +# Implement QoS if needed +sudo tc qdisc add dev eth0 root fq_codel + +# Optimize buffer sizes +sudo ethtool -G eth0 rx 4096 tx 4096 +``` + +## Emergency Recovery Procedures + +### Network Emergency Recovery +**Complete network failure recovery**: +```bash +# Reset all network configuration +sudo systemctl stop networking +sudo ip addr flush eth0 +sudo ip route flush table main +sudo systemctl start networking + +# Manual network configuration +sudo ip addr add 10.10.0.100/24 dev eth0 +sudo ip route add default via 10.10.0.1 +echo "nameserver 8.8.8.8" | sudo tee /etc/resolv.conf +``` + +### SSH Emergency Access +**When locked out of systems**: +```bash +# Use emergency SSH key +ssh -i ~/.ssh/emergency_homelab_rsa user@host + +# Via console access (if available) +# Use hypervisor console or physical access + +# Reset SSH to allow password auth temporarily +sudo sed -i 's/PasswordAuthentication no/PasswordAuthentication yes/' /etc/ssh/sshd_config +sudo systemctl restart sshd +``` + +### Service Recovery +**Critical service restoration**: +```bash +# Restart all network services +sudo systemctl restart networking +sudo systemctl restart nginx +sudo systemctl restart sshd + +# Emergency firewall disable +sudo ufw disable # CAUTION: Only for troubleshooting + +# Service-specific recovery +sudo systemctl restart docker +sudo systemctl restart systemd-resolved +``` + +## Monitoring and Prevention + +### Network Health Monitoring +```bash +#!/bin/bash +# network-monitor.sh +CRITICAL_HOSTS="10.10.0.1 10.10.0.16 nas.homelab.local" +CRITICAL_SERVICES="https://homelab.local http://proxmox.homelab.local:8006" + +for host in $CRITICAL_HOSTS; do + if ! 
ping -c1 -W5 $host >/dev/null 2>&1; then + echo "ALERT: $host unreachable" | logger -t network-monitor + fi +done + +for service in $CRITICAL_SERVICES; do + if ! curl -sSf --max-time 10 "$service" >/dev/null 2>&1; then + echo "ALERT: $service unavailable" | logger -t network-monitor + fi +done +``` + +### Automated Recovery Scripts +```bash +#!/bin/bash +# network-recovery.sh +if ! ping -c1 8.8.8.8 >/dev/null 2>&1; then + echo "Network down, attempting recovery..." + sudo systemctl restart networking + sleep 10 + if ping -c1 8.8.8.8 >/dev/null 2>&1; then + echo "Network recovered" + else + echo "Manual intervention required" + fi +fi +``` + +## Quick Reference Commands + +### Network Diagnostics +```bash +# Connectivity tests +ping host +traceroute host +mtr host +nc -zv host port + +# Service checks +systemctl status networking +systemctl status nginx +systemctl status sshd + +# Network configuration +ip addr show +ip route show +ss -tuln +``` + +### Emergency Commands +```bash +# Network restart +sudo systemctl restart networking + +# SSH emergency access +ssh -i ~/.ssh/emergency_homelab_rsa user@host + +# Firewall quick disable (emergency only) +sudo ufw disable + +# DNS quick fix +echo "nameserver 8.8.8.8" | sudo tee /etc/resolv.conf +``` + +This troubleshooting guide provides comprehensive solutions for common networking issues in home lab environments. \ No newline at end of file diff --git a/patterns/docker/README.md b/patterns/docker/README.md deleted file mode 100644 index 4d4e474..0000000 --- a/patterns/docker/README.md +++ /dev/null @@ -1,26 +0,0 @@ -# Docker Patterns - -## Container Best Practices -- Use multi-stage builds for production images -- Minimize layer count and image size -- Run containers as non-root users -- Use specific version tags, avoid `latest` -- Implement health checks - -## Common Patterns -- **Multi-service applications**: Use docker-compose for local development -- **Production deployments**: Single-container per service with orchestration -- **Development environments**: Volume mounts for code changes -- **CI/CD integration**: Build, test, and push in pipeline stages - -## Security Considerations -- Scan images for vulnerabilities -- Use distroless or minimal base images -- Implement resource limits -- Network isolation between services - -## Related Documentation -- Examples: `/examples/docker/multi-stage-builds.md` -- Examples: `/examples/docker/compose-patterns.md` -- Reference: `/reference/docker/troubleshooting.md` -- Reference: `/reference/docker/security-checklist.md` \ No newline at end of file diff --git a/patterns/networking/README.md b/patterns/networking/README.md deleted file mode 100644 index 15af9a9..0000000 --- a/patterns/networking/README.md +++ /dev/null @@ -1,32 +0,0 @@ -# Networking Patterns - -## Infrastructure Setup -- **Reverse proxy** configuration (Nginx/Traefik) -- **Load balancing** strategies and health checks -- **SSL/TLS termination** and certificate management -- **Network segmentation** and VLANs - -## Service Discovery -- **DNS-based** service resolution -- **Container networking** with Docker networks -- **Service mesh** patterns for microservices -- **API gateway** implementation - -## Security Patterns -- **Firewall rules** and port management -- **VPN setup** for remote access -- **Zero-trust networking** principles -- **Network monitoring** and intrusion detection - -## Performance Optimization -- **CDN integration** for static assets -- **Connection pooling** and keep-alives -- **Bandwidth management** and QoS -- 
**Caching strategies** at network level - -## Related Documentation -- Examples: `/examples/networking/nginx-config.md` -- Examples: `/examples/networking/vpn-setup.md` -- Examples: `/examples/networking/load-balancing.md` -- Reference: `/reference/networking/troubleshooting.md` -- Reference: `/reference/networking/security.md` \ No newline at end of file diff --git a/patterns/vm-management/README.md b/patterns/vm-management/README.md deleted file mode 100644 index 884cd02..0000000 --- a/patterns/vm-management/README.md +++ /dev/null @@ -1,66 +0,0 @@ -# Virtual Machine Management Patterns - -## Automated Provisioning -- **Cloud-init deployment** - Fully automated VM provisioning from first boot -- **Post-install scripts** - Standardized configuration for existing VMs -- **SSH key management** - Automated key deployment with emergency backup -- **Security hardening** - Password auth disabled, firewall configured - -## VM Provisioning Strategies - -### Template-Based Deployment -- **Ubuntu Server templates** optimized for home lab environments -- **Resource allocation** sizing and planning -- **Network configuration** and VLAN assignment (10.10.0.x networks) -- **Storage provisioning** and disk management - -### Infrastructure as Code -- **Cloud-init templates** for repeatable VM creation -- **Bash provisioning scripts** for existing infrastructure -- **SSH key integration** with existing homelab key management -- **Docker environment** setup with user permissions - -## Lifecycle Management -- **Automated provisioning** with infrastructure as code -- **Configuration management** with standardized scripts -- **Snapshot management** and rollback strategies -- **Scaling policies** for resource optimization - -## Monitoring & Maintenance -- **Resource monitoring** (CPU, memory, disk, network) -- **Health checks** and alerting systems -- **Patch management** and update strategies -- **Performance tuning** and optimization - -## Backup & Recovery -- **VM-level backups** vs **application-level backups** -- **Disaster recovery** planning and testing -- **High availability** configurations -- **Migration strategies** between hosts - -## Implementation Workflows - -### New VM Creation (Recommended) -1. **Create VM in Proxmox** with cloud-init support -2. **Apply cloud-init template** (`scripts/vm-management/cloud-init-user-data.yaml`) -3. **Start VM** - fully automated provisioning -4. **Verify setup** via SSH key authentication - -### Existing VM Configuration -1. **Run post-install script** (`scripts/vm-management/vm-post-install.sh `) -2. **Automated provisioning** handles updates, SSH keys, Docker -3. **Security hardening** applied automatically -4. 
**Test connectivity** and verify Docker installation - -## Security Architecture -- **SSH key-based authentication** only (passwords disabled) -- **Emergency key backup** for failover access -- **User privilege separation** (sudo required, docker group) -- **Automatic security updates** configured -- **Network isolation** ready (10.10.0.x internal network) - -## Related Documentation -- **Implementation**: `scripts/vm-management/README.md` - Complete setup guides -- **SSH Keys**: `patterns/networking/ssh-key-management.md` - Key lifecycle management -- **Examples**: `examples/networking/ssh-homelab-setup.md` - SSH integration patterns -- **Reference**: `reference/vm-management/troubleshooting.md` - Common issues and solutions \ No newline at end of file diff --git a/scripts/monitoring/tdarr_monitor.py b/scripts/monitoring/tdarr_monitor.py deleted file mode 100755 index 2390976..0000000 --- a/scripts/monitoring/tdarr_monitor.py +++ /dev/null @@ -1,498 +0,0 @@ -#!/usr/bin/env python3 -""" -Tdarr API Monitoring Script - -Monitors Tdarr server via its web API endpoints: -- Server status and health -- Queue status and statistics -- Node status and performance -- Library scan progress -- Worker activity - -Usage: - python3 tdarr_monitor.py --server http://10.10.0.43:8265 --check all - python3 tdarr_monitor.py --server http://10.10.0.43:8265 --check queue - python3 tdarr_monitor.py --server http://10.10.0.43:8265 --check nodes -""" - -import argparse -import json -import logging -import sys -from dataclasses import dataclass, asdict -from datetime import datetime -from typing import Dict, List, Optional, Any -import requests -from urllib.parse import urljoin - - -@dataclass -class ServerStatus: - timestamp: str - server_url: str - status: str - error: Optional[str] = None - version: Optional[str] = None - server_id: Optional[str] = None - uptime: Optional[str] = None - system_info: Optional[Dict[str, Any]] = None - - -@dataclass -class QueueStats: - total_files: int - queued: int - processing: int - completed: int - queue_items: List[Dict[str, Any]] - - -@dataclass -class QueueStatus: - timestamp: str - queue_stats: Optional[QueueStats] = None - error: Optional[str] = None - - -@dataclass -class NodeInfo: - id: Optional[str] - nodeName: Optional[str] - status: str - lastSeen: Optional[int] - version: Optional[str] - platform: Optional[str] - workers: Dict[str, int] - processing: List[Dict[str, Any]] - - -@dataclass -class NodeSummary: - total_nodes: int - online_nodes: int - offline_nodes: int - online_details: List[NodeInfo] - offline_details: List[NodeInfo] - - -@dataclass -class NodeStatus: - timestamp: str - nodes: List[Dict[str, Any]] - node_summary: Optional[NodeSummary] = None - error: Optional[str] = None - - -@dataclass -class LibraryInfo: - name: Optional[str] - path: Optional[str] - file_count: int - scan_progress: int - last_scan: Optional[str] - is_scanning: bool - - -@dataclass -class ScanStatus: - total_libraries: int - total_files: int - scanning_libraries: int - - -@dataclass -class LibraryStatus: - timestamp: str - libraries: List[LibraryInfo] - scan_status: Optional[ScanStatus] = None - error: Optional[str] = None - - -@dataclass -class Statistics: - total_transcodes: int - space_saved: int - total_files_processed: int - failed_transcodes: int - processing_speed: int - eta: Optional[str] - - -@dataclass -class StatisticsStatus: - timestamp: str - statistics: Optional[Statistics] = None - error: Optional[str] = None - - -@dataclass -class HealthCheck: - status: str - healthy: 
bool - online_count: Optional[int] = None - total_count: Optional[int] = None - accessible: Optional[bool] = None - total_items: Optional[int] = None - - -@dataclass -class HealthStatus: - timestamp: str - overall_status: str - checks: Dict[str, HealthCheck] - - -class TdarrMonitor: - def __init__(self, server_url: str, timeout: int = 30): - """Initialize Tdarr monitor with server URL.""" - self.server_url = server_url.rstrip('/') - self.timeout = timeout - self.session = requests.Session() - - # Configure logging - logging.basicConfig( - level=logging.INFO, - format='%(asctime)s - %(levelname)s - %(message)s' - ) - self.logger = logging.getLogger(__name__) - - def _make_request(self, endpoint: str) -> Optional[Dict[str, Any]]: - """Make HTTP request to Tdarr API endpoint.""" - url = urljoin(self.server_url, endpoint) - - try: - response = self.session.get(url, timeout=self.timeout) - response.raise_for_status() - return response.json() - - except requests.exceptions.RequestException as e: - self.logger.error(f"Request failed for {url}: {e}") - return None - except json.JSONDecodeError as e: - self.logger.error(f"JSON decode failed for {url}: {e}") - return None - - def get_server_status(self) -> ServerStatus: - """Get overall server status and configuration.""" - timestamp = datetime.now().isoformat() - - # Try to get server info from API - data = self._make_request('/api/v2/get-server-info') - if data: - return ServerStatus( - timestamp=timestamp, - server_url=self.server_url, - status='online', - version=data.get('version'), - server_id=data.get('serverId'), - uptime=data.get('uptime'), - system_info=data.get('systemInfo', {}) - ) - else: - return ServerStatus( - timestamp=timestamp, - server_url=self.server_url, - status='offline', - error='Unable to connect to Tdarr server' - ) - - def get_queue_status(self) -> QueueStatus: - """Get transcoding queue status and statistics.""" - timestamp = datetime.now().isoformat() - - # Get queue information - data = self._make_request('/api/v2/get-queue') - if data: - queue_data = data.get('queue', []) - - # Calculate queue statistics - total_files = len(queue_data) - queued_files = len([f for f in queue_data if f.get('status') == 'Queued']) - processing_files = len([f for f in queue_data if f.get('status') == 'Processing']) - completed_files = len([f for f in queue_data if f.get('status') == 'Completed']) - - queue_stats = QueueStats( - total_files=total_files, - queued=queued_files, - processing=processing_files, - completed=completed_files, - queue_items=queue_data[:10] # First 10 items for details - ) - - return QueueStatus( - timestamp=timestamp, - queue_stats=queue_stats - ) - else: - return QueueStatus( - timestamp=timestamp, - error='Unable to fetch queue data' - ) - - def get_node_status(self) -> NodeStatus: - """Get status of all connected nodes.""" - timestamp = datetime.now().isoformat() - - # Get nodes information - data = self._make_request('/api/v2/get-nodes') - if data: - nodes = data.get('nodes', []) - - # Process node information - online_nodes = [] - offline_nodes = [] - - for node in nodes: - node_info = NodeInfo( - id=node.get('_id'), - nodeName=node.get('nodeName'), - status='online' if node.get('lastSeen', 0) > 0 else 'offline', - lastSeen=node.get('lastSeen'), - version=node.get('version'), - platform=node.get('platform'), - workers={ - 'cpu': node.get('workers', {}).get('CPU', 0), - 'gpu': node.get('workers', {}).get('GPU', 0) - }, - processing=node.get('currentJobs', []) - ) - - if node_info.status == 'online': - 
online_nodes.append(node_info) - else: - offline_nodes.append(node_info) - - node_summary = NodeSummary( - total_nodes=len(nodes), - online_nodes=len(online_nodes), - offline_nodes=len(offline_nodes), - online_details=online_nodes, - offline_details=offline_nodes - ) - - return NodeStatus( - timestamp=timestamp, - nodes=nodes, - node_summary=node_summary - ) - else: - return NodeStatus( - timestamp=timestamp, - nodes=[], - error='Unable to fetch node data' - ) - - def get_library_status(self) -> LibraryStatus: - """Get library scan status and file statistics.""" - timestamp = datetime.now().isoformat() - - # Get library information - data = self._make_request('/api/v2/get-libraries') - if data: - libraries = data.get('libraries', []) - - library_stats = [] - total_files = 0 - - for lib in libraries: - lib_info = LibraryInfo( - name=lib.get('name'), - path=lib.get('path'), - file_count=lib.get('totalFiles', 0), - scan_progress=lib.get('scanProgress', 0), - last_scan=lib.get('lastScan'), - is_scanning=lib.get('isScanning', False) - ) - library_stats.append(lib_info) - total_files += lib_info.file_count - - scan_status = ScanStatus( - total_libraries=len(libraries), - total_files=total_files, - scanning_libraries=len([l for l in library_stats if l.is_scanning]) - ) - - return LibraryStatus( - timestamp=timestamp, - libraries=library_stats, - scan_status=scan_status - ) - else: - return LibraryStatus( - timestamp=timestamp, - libraries=[], - error='Unable to fetch library data' - ) - - def get_statistics(self) -> StatisticsStatus: - """Get overall Tdarr statistics and health metrics.""" - timestamp = datetime.now().isoformat() - - # Get statistics - data = self._make_request('/api/v2/get-stats') - if data: - stats = data.get('stats', {}) - statistics = Statistics( - total_transcodes=stats.get('totalTranscodes', 0), - space_saved=stats.get('spaceSaved', 0), - total_files_processed=stats.get('totalFilesProcessed', 0), - failed_transcodes=stats.get('failedTranscodes', 0), - processing_speed=stats.get('processingSpeed', 0), - eta=stats.get('eta') - ) - - return StatisticsStatus( - timestamp=timestamp, - statistics=statistics - ) - else: - return StatisticsStatus( - timestamp=timestamp, - error='Unable to fetch statistics' - ) - - def health_check(self) -> HealthStatus: - """Perform comprehensive health check.""" - timestamp = datetime.now().isoformat() - - # Server connectivity - server_status = self.get_server_status() - server_check = HealthCheck( - status=server_status.status, - healthy=server_status.status == 'online' - ) - - # Node connectivity - node_status = self.get_node_status() - nodes_healthy = ( - node_status.node_summary.online_nodes > 0 if node_status.node_summary else False - ) and not node_status.error - - nodes_check = HealthCheck( - status='online' if nodes_healthy else 'offline', - healthy=nodes_healthy, - online_count=node_status.node_summary.online_nodes if node_status.node_summary else 0, - total_count=node_status.node_summary.total_nodes if node_status.node_summary else 0 - ) - - # Queue status - queue_status = self.get_queue_status() - queue_healthy = not queue_status.error - queue_check = HealthCheck( - status='accessible' if queue_healthy else 'error', - healthy=queue_healthy, - accessible=queue_healthy, - total_items=queue_status.queue_stats.total_files if queue_status.queue_stats else 0 - ) - - checks = { - 'server': server_check, - 'nodes': nodes_check, - 'queue': queue_check - } - - # Determine overall health - all_checks_healthy = all(check.healthy for check in 
checks.values()) - overall_status = 'healthy' if all_checks_healthy else 'unhealthy' - - return HealthStatus( - timestamp=timestamp, - overall_status=overall_status, - checks=checks - ) - - -def main(): - parser = argparse.ArgumentParser(description='Monitor Tdarr server via API') - parser.add_argument('--server', required=True, help='Tdarr server URL (e.g., http://10.10.0.43:8265)') - parser.add_argument('--check', choices=['all', 'status', 'queue', 'nodes', 'libraries', 'stats', 'health'], - default='health', help='Type of check to perform') - parser.add_argument('--timeout', type=int, default=30, help='Request timeout in seconds') - parser.add_argument('--output', choices=['json', 'pretty'], default='pretty', help='Output format') - parser.add_argument('--verbose', action='store_true', help='Enable verbose logging') - - args = parser.parse_args() - - if args.verbose: - logging.getLogger().setLevel(logging.DEBUG) - - # Initialize monitor - monitor = TdarrMonitor(args.server, args.timeout) - - # Perform requested check - result = None - if args.check == 'all': - result = { - 'server_status': monitor.get_server_status(), - 'queue_status': monitor.get_queue_status(), - 'node_status': monitor.get_node_status(), - 'library_status': monitor.get_library_status(), - 'statistics': monitor.get_statistics() - } - elif args.check == 'status': - result = monitor.get_server_status() - elif args.check == 'queue': - result = monitor.get_queue_status() - elif args.check == 'nodes': - result = monitor.get_node_status() - elif args.check == 'libraries': - result = monitor.get_library_status() - elif args.check == 'stats': - result = monitor.get_statistics() - elif args.check == 'health': - result = monitor.health_check() - - # Output results - if args.output == 'json': - # Convert dataclasses to dictionaries for JSON serialization - if args.check == 'all': - json_result = {} - for key, value in result.items(): - json_result[key] = asdict(value) - print(json.dumps(json_result, indent=2)) - else: - print(json.dumps(asdict(result), indent=2)) - else: - # Pretty print format - print(f"=== Tdarr Monitor Results - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} ===") - - if args.check == 'health' or (hasattr(result, 'overall_status') and result.overall_status): - health = result if hasattr(result, 'overall_status') else None - if health: - status = health.overall_status - print(f"Overall Status: {status.upper()}") - - if health.checks: - print("\nHealth Checks:") - for check_name, check_data in health.checks.items(): - status_icon = "βœ“" if check_data.healthy else "βœ—" - print(f" {status_icon} {check_name.title()}: {asdict(check_data)}") - - if args.check == 'all': - for section, data in result.items(): - print(f"\n=== {section.replace('_', ' ').title()} ===") - print(json.dumps(asdict(data), indent=2)) - elif args.check != 'health': - print(json.dumps(asdict(result), indent=2)) - - # Exit with appropriate code - if result: - # Check for unhealthy status in health check - if isinstance(result, HealthStatus) and result.overall_status == 'unhealthy': - sys.exit(1) - # Check for errors in individual status objects (all status classes except HealthStatus have error attribute) - elif (isinstance(result, (ServerStatus, QueueStatus, NodeStatus, LibraryStatus, StatisticsStatus)) - and result.error): - sys.exit(1) - # Check for errors in 'all' results - elif isinstance(result, dict): - for status_obj in result.values(): - if (isinstance(status_obj, (ServerStatus, QueueStatus, NodeStatus, LibraryStatus, 
StatisticsStatus)) - and status_obj.error): - sys.exit(1) - - sys.exit(0) - - -if __name__ == '__main__': - main() \ No newline at end of file diff --git a/scripts/tdarr-manager b/scripts/tdarr-manager deleted file mode 100755 index 6495d7b..0000000 --- a/scripts/tdarr-manager +++ /dev/null @@ -1,6 +0,0 @@ -#!/bin/bash -# Tdarr Manager - Quick access to Tdarr scheduler controls -# This is a convenience script that forwards to the main manager - -SCRIPT_DIR="$(dirname "$(readlink -f "$0")")" -exec "${SCRIPT_DIR}/tdarr/tdarr-schedule-manager.sh" "$@" \ No newline at end of file diff --git a/tdarr/CONTEXT.md b/tdarr/CONTEXT.md new file mode 100644 index 0000000..0f6b399 --- /dev/null +++ b/tdarr/CONTEXT.md @@ -0,0 +1,152 @@ +# Tdarr Transcoding System - Technology Context + +## Overview +Tdarr is a distributed transcoding system that converts media files to optimized formats. This implementation uses an intelligent gaming-aware scheduler with unmapped node architecture for optimal performance and system stability. + +## Architecture Patterns + +### Distributed Unmapped Node Architecture (Recommended) +**Pattern**: Server-Node separation with local high-speed cache +- **Server**: Tdarr Server manages queue, web interface, and coordination +- **Node**: Unmapped nodes with local NVMe cache for processing +- **Benefits**: 3-5x performance improvement, network I/O reduction, linear scaling + +**When to Use**: +- Multiple transcoding nodes across network +- High-performance requirements (10GB+ files) +- Network bandwidth limitations +- Gaming systems requiring GPU priority management + +### Configuration Principles +1. **Cache Optimization**: Use local NVMe storage for work directories +2. **Gaming Detection**: Automatic pause during GPU-intensive activities +3. **Resource Isolation**: Container limits prevent kernel-level crashes +4. **Monitoring Integration**: Automated cleanup and Discord notifications + +## Core Components + +### Gaming-Aware Scheduler +**Purpose**: Automatically manages Tdarr node to avoid conflicts with gaming +**Location**: `scripts/tdarr-schedule-manager.sh` + +**Key Features**: +- Detects gaming processes (Steam, Lutris, Wine, etc.) +- GPU usage monitoring (>15% threshold) +- Configurable time windows +- Automated temporary directory cleanup + +**Schedule Format**: `"HOUR_START-HOUR_END:DAYS"` +- `"22-07:daily"` - Overnight transcoding +- `"09-17:1-5"` - Business hours weekdays only +- `"14-16:6,7"` - Weekend afternoon window + +### Monitoring System +**Purpose**: Prevents staging section timeouts and system instability +**Location**: `scripts/monitoring/tdarr-timeout-monitor.sh` + +**Capabilities**: +- Staging timeout detection (300-second hardcoded limit) +- Automatic work directory cleanup +- Discord notifications with user pings +- Log rotation and retention management + +### Container Architecture +**Server Configuration**: +```yaml +# Hybrid storage with resource limits +services: + tdarr: + image: ghcr.io/haveagitgat/tdarr:latest + ports: ["8265:8266"] + volumes: + - "./tdarr-data:/app/configs" + - "/mnt/media:/media" +``` + +**Node Configuration**: +```bash +# Unmapped node with local cache +podman run -d \ + --name tdarr-node-gpu \ + -e nodeType=unmapped \ + -v "/mnt/NV2/tdarr-cache:/cache" \ + --device nvidia.com/gpu=all \ + ghcr.io/haveagitgat/tdarr_node:latest +``` + +## Implementation Patterns + +### Performance Optimization +1. **Local Cache Strategy**: Download β†’ Process β†’ Upload (vs. streaming) +2. 
**Resource Limits**: Prevent memory exhaustion and kernel crashes +3. **Network Resilience**: CIFS mount options for stability +4. **Automated Cleanup**: Prevent accumulation of stuck directories + +### Error Prevention +1. **Plugin Safety**: Null-safe forEach operations `(streams || []).forEach()` +2. **Clean Installation**: Avoid custom plugin mounts causing version conflicts +3. **Container Isolation**: Resource limits prevent system-level crashes +4. **Network Stability**: Unmapped architecture reduces CIFS dependency + +### Gaming Integration +1. **Process Detection**: Monitor for gaming applications and utilities +2. **GPU Threshold**: Stop transcoding when GPU usage >15% +3. **Time Windows**: Respect user-defined allowed transcoding hours +4. **Manual Override**: Direct start/stop commands bypass scheduler + +## Common Workflows + +### Initial Setup +1. Start server with "Allow unmapped Nodes" enabled +2. Configure node as unmapped with local cache +3. Install gaming-aware scheduler via cron +4. Set up monitoring system for automated cleanup + +### Troubleshooting Patterns +1. **forEach Errors**: Clean plugin installation, avoid custom mounts +2. **Staging Timeouts**: Monitor system handles automatic cleanup +3. **System Crashes**: Convert to unmapped node architecture +4. **Network Issues**: Implement CIFS resilience options + +### Performance Tuning +1. **Cache Size**: 100-500GB NVMe for concurrent jobs +2. **Bandwidth**: Unmapped nodes reduce streaming requirements +3. **Scaling**: Linear scaling with additional unmapped nodes +4. **GPU Priority**: Gaming detection ensures responsive system + +## Best Practices + +### Production Deployment +- Use unmapped node architecture for stability +- Implement comprehensive monitoring +- Configure gaming-aware scheduling for desktop systems +- Set appropriate container resource limits + +### Development Guidelines +- Test with internal Tdarr test files first +- Implement null-safety checks in custom plugins +- Use structured logging for troubleshooting +- Separate concerns: scheduling, monitoring, processing + +### Security Considerations +- Container isolation prevents system-level failures +- Resource limits protect against memory exhaustion +- Network mount resilience prevents kernel crashes +- Automated cleanup prevents disk space issues + +## Migration Patterns + +### From Mapped to Unmapped Nodes +1. Enable "Allow unmapped Nodes" in server options +2. Update node configuration (add nodeType=unmapped) +3. Change cache volume to local storage +4. Remove media volume mapping +5. Test workflow and monitor performance + +### Plugin System Cleanup +1. Remove all custom plugin mounts +2. Force server restart to regenerate plugin ZIP +3. Restart nodes to download fresh plugins +4. Verify forEach fixes in downloaded plugins + +This technology context provides the foundation for implementing, troubleshooting, and optimizing Tdarr transcoding systems in home lab environments. \ No newline at end of file diff --git a/tdarr/examples/tdarr-cifs-troubleshooting-2025-08-11.md b/tdarr/examples/tdarr-cifs-troubleshooting-2025-08-11.md new file mode 100644 index 0000000..ecb9146 --- /dev/null +++ b/tdarr/examples/tdarr-cifs-troubleshooting-2025-08-11.md @@ -0,0 +1,143 @@ +# Tdarr CIFS Troubleshooting Session - 2025-08-11 + +## Problem Statement +Tdarr unmapped node experiencing persistent download timeouts at 9:08 PM with large files (31GB+ remux), causing "Cancelling" messages and stuck downloads. 
Downloads would hang for 33+ minutes before timing out, despite container remaining running. + +## Initial Hypothesis: Mapped vs Unmapped Node Issue +**Status**: ❌ **DISPROVEN** +- Suspected unmapped node timeout configuration differences +- Windows PC running mapped Tdarr node works fine (slow but stable) +- Both mapped and unmapped Linux nodes exhibited identical timeout issues +- **Conclusion**: Architecture type was not the root cause + +## Key Insight: Windows vs Linux Performance Difference +**Observation**: Windows Tdarr node (mapped mode) works without timeouts, Linux nodes (both mapped/unmapped) fail +**Implication**: Platform-specific issue, likely network stack or CIFS implementation + +## Root Cause Discovery Process + +### Phase 1: Linux Client CIFS Analysis +**Method**: Direct CIFS mount testing on Tdarr node machine (nobara-pc) + +**Initial CIFS Mount Configuration** (problematic): +```bash +//10.10.0.35/media on /mnt/media type cifs (rw,relatime,vers=3.1.1,cache=strict,upcall_target=app,username=root,uid=1000,forceuid,gid=1000,forcegid,addr=10.10.0.35,file_mode=0755,dir_mode=0755,soft,nounix,serverino,mapposix,noperm,reparse=nfs,nativesocket,symlink=native,rsize=4194304,wsize=4194304,bsize=1048576,retrans=1,echo_interval=60,actimeo=30,closetimeo=1,_netdev,x-systemd.automount,x-systemd.device-timeout=10,x-systemd.mount-timeout=30) +``` + +**Critical Issues Identified**: +- `soft` - Mount fails on timeout instead of retrying indefinitely +- `retrans=1` - Only 1 retry attempt (NFS option, invalid for CIFS) +- `closetimeo=1` - Very short close timeout (1 second) +- `cache=strict` - No local caching, poor performance for large files +- `x-systemd.mount-timeout=30` - 30-second mount timeout + +**Optimization Applied**: +```bash +//10.10.0.35/media /mnt/media cifs credentials=/home/cal/.samba_credentials,uid=1000,gid=1000,vers=3.1.1,hard,rsize=16777216,wsize=16777216,cache=loose,actimeo=60,echo_interval=30,_netdev,x-systemd.automount,x-systemd.device-timeout=60,x-systemd.mount-timeout=120,noperm 0 0 +``` + +**Performance Testing Results**: +- **Local SSD**: `dd` 800MB in 0.217s (4.0 GB/s) - baseline +- **CIFS 1MB blocks**: 42.7 MB/s - fast, no issues +- **CIFS 4MB blocks**: 205 MB/s - fast, no issues +- **CIFS 8MB blocks**: 83.1 MB/s - **3-minute terminal freeze** + +**Critical Discovery**: Block size dependency causing I/O blocking with large transfers + +### Phase 2: Tdarr Server-Side Analysis +**Method**: Test Tdarr API download path directly + +**API Test Command**: +```bash +curl -X POST "http://10.10.0.43:8265/api/v2/file/download" \ + -H "Content-Type: application/json" \ + -d '{"filePath": "/media/Movies/Jumanji (1995)/Jumanji (1995) Remux-1080p Proper.mkv"}' \ + -o /tmp/tdarr-api-test.mkv +``` + +**Results**: +- **Performance**: 55.7-58.6 MB/s sustained +- **Progress**: Downloaded 15.3GB of 23GB (66%) +- **Failure**: **Download hung at 66% completion** +- **Timing**: Hung after ~5 minutes (consistent with previous timeout patterns) + +### Phase 3: Tdarr Server CIFS Configuration Analysis +**Method**: Examine server-side storage mount + +**Server CIFS Mount** (problematic): +```bash +//10.10.0.35/media /mnt/truenas-share cifs credentials=/root/.truenascreds,vers=3.1.1,rsize=4194304,wsize=4194304,cache=strict,actimeo=30,echo_interval=60,noperm 0 0 +``` + +**Server Issues Identified**: +- **Missing `hard`** - Defaults to `soft` mount behavior +- `cache=strict` - No local caching (same issue as client) +- **No retry/timeout extensions** - Uses unreliable kernel defaults +- 
**No systemd timeout protection** + +## Root Cause Confirmed +**Primary Issue**: Tdarr server's CIFS mount to TrueNAS using suboptimal configuration +**Impact**: Large file streaming via Tdarr API hangs when server's CIFS mount hits I/O blocking +**Evidence**: API download hung at exact same pattern as node timeouts (66% through large file) + +## Solution Strategy +**Fix Tdarr Server CIFS Mount Configuration**: +```bash +//10.10.0.35/media /mnt/truenas-share cifs credentials=/root/.truenascreds,vers=3.1.1,hard,rsize=4194304,wsize=4194304,cache=loose,actimeo=60,echo_interval=30,_netdev,x-systemd.device-timeout=60,x-systemd.mount-timeout=120,noperm 0 0 +``` + +**Key Optimizations**: +- `hard` - Retry indefinitely instead of timing out +- `cache=loose` - Enable local caching for large file performance +- `actimeo=60` - Longer attribute caching +- `echo_interval=30` - More frequent keep-alives +- Extended systemd timeouts for reliability + +## Implementation Steps +1. **Update server `/etc/fstab`** with optimized CIFS configuration +2. **Remount server storage**: + ```bash + ssh tdarr "sudo umount /mnt/truenas-share" + ssh tdarr "sudo systemctl daemon-reload" + ssh tdarr "sudo mount /mnt/truenas-share" + ``` +3. **Test large file API download** to verify fix +4. **Resume Tdarr transcoding** with confidence in large file handling + +## Technical Insights + +### CIFS vs SMB Protocol Differences +- **Windows nodes**: Use native SMB implementation (stable) +- **Linux nodes**: Use kernel CIFS module (prone to I/O blocking with poor configuration) +- **Block size sensitivity**: Large block transfers require careful timeout/retry configuration + +### Tdarr Architecture Impact +- **Unmapped nodes**: Download entire files via API before processing (high bandwidth, vulnerable to server CIFS issues) +- **Mapped nodes**: Stream files during processing (lower bandwidth, still vulnerable to server CIFS issues) +- **Root cause affects both architectures** since server-side storage access is the bottleneck + +### Performance Expectations Post-Fix +- **Consistent 50-100 MB/s** for large file downloads +- **No timeout failures** with properly configured hard mounts +- **Reliable processing** of 31GB+ remux files + +## Files Modified +- **Client**: `/etc/fstab` on nobara-pc (CIFS optimization applied) +- **Server**: `/etc/fstab` on tdarr server (pending optimization) + +## Monitoring and Validation +- **Success criteria**: Tdarr API download of 23GB+ file completes without hanging +- **Performance target**: Sustained 50+ MB/s throughout entire transfer +- **Reliability target**: No timeouts during large file processing + +## Session Outcome +**Status**: βœ… **ROOT CAUSE IDENTIFIED AND SOLUTION READY** +- Eliminated client-side variables through systematic testing +- Confirmed server-side CIFS configuration as bottleneck +- Validated fix strategy through client-side optimization success +- Ready to implement server-side solution + +--- +*Session Date: 2025-08-11* +*Duration: ~3 hours* +*Methods: Direct testing, API analysis, mount configuration review* \ No newline at end of file diff --git a/tdarr/examples/tdarr-node-configurations.md b/tdarr/examples/tdarr-node-configurations.md new file mode 100644 index 0000000..c5d94b1 --- /dev/null +++ b/tdarr/examples/tdarr-node-configurations.md @@ -0,0 +1,183 @@ +# Tdarr Node Container Configurations + +## Overview +Complete examples for running Tdarr transcoding nodes in containers, covering both CPU-only and GPU-accelerated setups. 
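+
+Before choosing a configuration, it can help to confirm that the node host can actually reach the Tdarr server. A minimal pre-flight check (assuming the 8266 server port and 8265 webUI port used throughout these examples):
+
+```bash
+# Confirm the node host can reach the Tdarr server port
+nc -zv YOUR_TDARR_SERVER_IP 8266
+
+# Optionally query the server API via the webUI port
+curl -s http://YOUR_TDARR_SERVER_IP:8265/api/v2/status
+```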
+ +## CPU-Only Configuration (Docker Compose) + +For systems without GPU or when GPU isn't needed: + +```yaml +version: "3.4" +services: + tdarr-node: + container_name: tdarr-node-cpu + image: ghcr.io/haveagitgat/tdarr_node:latest + restart: unless-stopped + environment: + - TZ=America/Chicago + - UMASK_SET=002 + - nodeName=local-workstation-cpu + - serverIP=YOUR_TDARR_SERVER_IP # Replace with your tdarr server IP + - serverPort=8266 + - inContainer=true + - ffmpegVersion=6 + volumes: + # Mount your media from the same NAS share as the server + - /path/to/your/media:/media # Replace with your local media mount + # Temp directory for transcoding cache + - ./temp:/temp +``` + +**Use case**: +- CPU-only transcoding +- Testing Tdarr functionality +- Systems without dedicated GPU +- When GPU drivers aren't available + +## GPU-Accelerated Configuration (Podman) + +**Recommended for Fedora/RHEL/CentOS/Nobara systems:** + +### Mapped Node (Direct Media Access) +```bash +podman run -d --name tdarr-node-gpu-mapped \ + --gpus all \ + --restart unless-stopped \ + -e TZ=America/Chicago \ + -e UMASK_SET=002 \ + -e nodeName=local-workstation-gpu-mapped \ + -e serverIP=10.10.0.43 \ + -e serverPort=8266 \ + -e inContainer=true \ + -e ffmpegVersion=6 \ + -e NVIDIA_DRIVER_CAPABILITIES=all \ + -e NVIDIA_VISIBLE_DEVICES=all \ + -v /mnt/NV2/tdarr-cache:/cache \ + -v /mnt/media/TV:/media/TV \ + -v /mnt/media/Movies:/media/Movies \ + -v /mnt/media/tdarr/tdarr-cache-clean:/temp \ + ghcr.io/haveagitgat/tdarr_node:latest +``` + +### Unmapped Node (Downloads Files) +```bash +podman run -d --name tdarr-node-gpu-unmapped \ + --gpus all \ + --restart unless-stopped \ + -e TZ=America/Chicago \ + -e UMASK_SET=002 \ + -e nodeName=local-workstation-gpu-unmapped \ + -e serverIP=10.10.0.43 \ + -e serverPort=8266 \ + -e inContainer=true \ + -e ffmpegVersion=6 \ + -e NVIDIA_DRIVER_CAPABILITIES=all \ + -e NVIDIA_VISIBLE_DEVICES=all \ + -v /mnt/NV2/tdarr-cache:/cache \ + -v /mnt/media:/media \ + -v /mnt/media/tdarr/tdarr-cache-clean:/temp \ + ghcr.io/haveagitgat/tdarr_node:latest +``` + +**Use cases**: +- **Mapped**: Direct media access, faster processing, no file downloads +- **Unmapped**: Works when network shares aren't available locally +- Hardware video encoding/decoding (NVENC/NVDEC) +- High-performance transcoding with NVMe cache +- Multiple concurrent streams +- Fedora-based systems where Podman works better than Docker + +## GPU-Accelerated Configuration (Docker) + +**For Ubuntu/Debian systems where Docker GPU support works:** + +```yaml +version: "3.4" +services: + tdarr-node: + container_name: tdarr-node-gpu + image: ghcr.io/haveagitgat/tdarr_node:latest + restart: unless-stopped + environment: + - TZ=America/Chicago + - UMASK_SET=002 + - nodeName=local-workstation-gpu + - serverIP=YOUR_TDARR_SERVER_IP + - serverPort=8266 + - inContainer=true + - ffmpegVersion=6 + - NVIDIA_DRIVER_CAPABILITIES=all + - NVIDIA_VISIBLE_DEVICES=all + volumes: + - /path/to/your/media:/media + - ./temp:/temp + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [gpu] +``` + +## Configuration Parameters + +### Required Environment Variables +- `TZ`: Timezone (e.g., `America/Chicago`) +- `nodeName`: Unique identifier for this node +- `serverIP`: IP address of Tdarr server +- `serverPort`: Tdarr server port (typically 8266) +- `inContainer`: Set to `true` for containerized deployments +- `ffmpegVersion`: FFmpeg version to use (6 recommended) + +### GPU-Specific Variables +- 
`NVIDIA_DRIVER_CAPABILITIES`: Set to `all` for full GPU access
+- `NVIDIA_VISIBLE_DEVICES`: `all` for all GPUs, or specific GPU IDs
+
+### Volume Mounts
+- `/media`: Mount point for media files (must match server configuration)
+- `/temp`: Temporary directory for transcoding cache
+
+## Platform-Specific Recommendations
+
+### Fedora/RHEL/CentOS/Nobara
+- **GPU**: Use Podman (Docker Desktop has GPU issues)
+- **CPU**: Docker or Podman both work fine
+
+### Ubuntu/Debian
+- **GPU**: Use Docker with nvidia-container-toolkit
+- **CPU**: Docker recommended
+
+### Testing GPU Functionality
+
+Verify GPU access inside container:
+```bash
+# For Podman
+podman exec tdarr-node-gpu nvidia-smi
+
+# For Docker
+docker exec tdarr-node-gpu nvidia-smi
+```
+
+Test NVENC encoding:
+```bash
+# For Podman
+podman exec tdarr-node-gpu /usr/local/bin/tdarr-ffmpeg -f lavfi -i testsrc2=duration=5:size=1920x1080:rate=30 -c:v h264_nvenc -t 5 /tmp/test.mp4
+
+# For Docker
+docker exec tdarr-node-gpu /usr/local/bin/tdarr-ffmpeg -f lavfi -i testsrc2=duration=5:size=1920x1080:rate=30 -c:v h264_nvenc -t 5 /tmp/test.mp4
+```
+
+## Troubleshooting
+
+- **GPU not detected**: See `docker/examples/nvidia-gpu-troubleshooting.md`
+- **Permission issues**: Ensure proper UMASK_SET and volume permissions
+- **Connection issues**: Verify serverIP and firewall settings
+- **Performance issues**: Monitor CPU/GPU utilization during transcoding
+
+## Related Documentation
+
+- `docker/examples/gpu-acceleration.md` - GPU acceleration patterns
+- `docker/examples/nvidia-gpu-troubleshooting.md` - Detailed GPU troubleshooting
+- `tdarr/scripts/start-tdarr-gpu-podman-clean.sh` - Ready-to-use Podman startup script
\ No newline at end of file
diff --git a/tdarr/examples/tdarr-node-local/docker-compose-cpu.yml b/tdarr/examples/tdarr-node-local/docker-compose-cpu.yml
new file mode 100644
index 0000000..3c4f574
--- /dev/null
+++ b/tdarr/examples/tdarr-node-local/docker-compose-cpu.yml
@@ -0,0 +1,28 @@
+version: "3.4"
+services:
+  tdarr-node:
+    container_name: tdarr-node-local-cpu
+    image: ghcr.io/haveagitgat/tdarr_node:latest
+    restart: unless-stopped
+    environment:
+      - TZ=America/Chicago
+      - UMASK_SET=002
+      - nodeName=local-workstation-cpu
+      - serverIP=192.168.1.100  # Replace with your Tdarr server IP
+      - serverPort=8266
+      - inContainer=true
+      - ffmpegVersion=6
+    volumes:
+      # Media access (same as server)
+      - /mnt/media:/media  # Replace with your media path
+      # Local transcoding cache
+      - ./temp:/temp
+    # Resource limits for CPU transcoding
+    deploy:
+      resources:
+        limits:
+          cpus: '14'   # Leave some cores for system (16-core = use 14)
+          memory: 32G  # Generous for 4K transcoding
+        reservations:
+          cpus: '8'    # Minimum guaranteed cores
+          memory: 16G
\ No newline at end of file
diff --git a/tdarr/examples/tdarr-node-local/docker-compose-gpu.yml b/tdarr/examples/tdarr-node-local/docker-compose-gpu.yml
new file mode 100644
index 0000000..592e194
--- /dev/null
+++ b/tdarr/examples/tdarr-node-local/docker-compose-gpu.yml
@@ -0,0 +1,45 @@
+version: "3.4"
+services:
+  tdarr-node:
+    container_name: tdarr-node-local-gpu
+    image: ghcr.io/haveagitgat/tdarr_node:latest
+    restart: unless-stopped
+    environment:
+      - TZ=America/Chicago
+      - UMASK_SET=002
+      - nodeName=local-workstation-gpu
+      - serverIP=192.168.1.100  # Replace with your Tdarr server IP
+      - serverPort=8266
+      - inContainer=true
+      - ffmpegVersion=6
+      # NVIDIA environment variables
+      - NVIDIA_DRIVER_CAPABILITIES=all
+      - NVIDIA_VISIBLE_DEVICES=all
+    volumes:
+      # Media access (same as server)
+      - /mnt/media:/media  # 
Replace with your media path + # Local transcoding cache + - ./temp:/temp + devices: + - /dev/dri:/dev/dri # Intel/AMD GPU fallback + + # GPU configuration - choose ONE method: + + # Method 1: Deploy syntax (recommended) + deploy: + resources: + limits: + memory: 16G # GPU transcoding uses less RAM + reservations: + memory: 8G + devices: + - driver: nvidia + count: all + capabilities: [gpu] + + # Method 2: Runtime (alternative) + # runtime: nvidia + + # Method 3: CDI (future) + # devices: + # - nvidia.com/gpu=all \ No newline at end of file diff --git a/tdarr/examples/tdarr-node-local/start-tdarr-mapped-node.sh b/tdarr/examples/tdarr-node-local/start-tdarr-mapped-node.sh new file mode 100755 index 0000000..3b09a8f --- /dev/null +++ b/tdarr/examples/tdarr-node-local/start-tdarr-mapped-node.sh @@ -0,0 +1,83 @@ +#!/bin/bash +# Tdarr Mapped Node with GPU Support - Example Script +# This script starts a MAPPED Tdarr node container with NVIDIA GPU acceleration using Podman +# +# MAPPED NODES: Direct access to media files via volume mounts +# Use this approach when you want the node to directly access your media library +# for local processing without server coordination for file transfers +# +# Configure these variables for your setup: + +set -e + +CONTAINER_NAME="tdarr-node-gpu-mapped" +SERVER_IP="YOUR_SERVER_IP" # e.g., "10.10.0.43" or "192.168.1.100" +SERVER_PORT="8266" # Default Tdarr server port +NODE_NAME="YOUR_NODE_NAME" # e.g., "workstation-gpu" or "local-gpu-node" +MEDIA_PATH="/path/to/your/media" # e.g., "/mnt/media" or "/home/user/Videos" +CACHE_PATH="/path/to/cache" # e.g., "/mnt/ssd/tdarr-cache" + +echo "πŸš€ Starting MAPPED Tdarr Node with GPU support using Podman..." +echo " Media Path: ${MEDIA_PATH}" +echo " Cache Path: ${CACHE_PATH}" + +# Stop and remove existing container if it exists +if podman ps -a --format "{{.Names}}" | grep -q "^${CONTAINER_NAME}$"; then + echo "πŸ›‘ Stopping existing container: ${CONTAINER_NAME}" + podman stop "${CONTAINER_NAME}" 2>/dev/null || true + podman rm "${CONTAINER_NAME}" 2>/dev/null || true +fi + +# Start Tdarr node with GPU support +echo "🎬 Starting Tdarr Node container..." +podman run -d --name "${CONTAINER_NAME}" \ + --gpus all \ + --restart unless-stopped \ + -e TZ=America/Chicago \ + -e UMASK_SET=002 \ + -e nodeName="${NODE_NAME}" \ + -e serverIP="${SERVER_IP}" \ + -e serverPort="${SERVER_PORT}" \ + -e inContainer=true \ + -e ffmpegVersion=6 \ + -e logLevel=DEBUG \ + -e NVIDIA_DRIVER_CAPABILITIES=all \ + -e NVIDIA_VISIBLE_DEVICES=all \ + -v "${MEDIA_PATH}:/media" \ + -v "${CACHE_PATH}:/temp" \ + ghcr.io/haveagitgat/tdarr_node:latest + +echo "⏳ Waiting for container to initialize..." +sleep 5 + +# Check container status +if podman ps --format "{{.Names}}" | grep -q "^${CONTAINER_NAME}$"; then + echo "βœ… Mapped Tdarr Node is running successfully!" + echo "" + echo "πŸ“Š Container Status:" + podman ps --filter "name=${CONTAINER_NAME}" --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}" + echo "" + echo "πŸ” Testing GPU Access:" + if podman exec "${CONTAINER_NAME}" nvidia-smi --query-gpu=name --format=csv,noheader,nounits 2>/dev/null; then + echo "πŸŽ‰ GPU is accessible in container!" 
+ else + echo "⚠️ GPU test failed, but container is running" + fi + echo "" + echo "🌐 Connection Details:" + echo " Server: ${SERVER_IP}:${SERVER_PORT}" + echo " Node Name: ${NODE_NAME}" + echo "" + echo "πŸ§ͺ Test NVENC encoding:" + echo " podman exec ${CONTAINER_NAME} /usr/local/bin/tdarr-ffmpeg -f lavfi -i testsrc2=duration=5:size=1920x1080:rate=30 -c:v h264_nvenc -preset fast -t 5 /tmp/test.mp4" + echo "" + echo "πŸ“‹ Container Management:" + echo " View logs: podman logs ${CONTAINER_NAME}" + echo " Stop: podman stop ${CONTAINER_NAME}" + echo " Remove: podman rm ${CONTAINER_NAME}" +else + echo "❌ Failed to start container" + echo "πŸ“‹ Checking logs..." + podman logs "${CONTAINER_NAME}" --tail 10 + exit 1 +fi \ No newline at end of file diff --git a/tdarr/examples/tdarr-server-setup/README.md b/tdarr/examples/tdarr-server-setup/README.md new file mode 100644 index 0000000..d7f4a4d --- /dev/null +++ b/tdarr/examples/tdarr-server-setup/README.md @@ -0,0 +1,69 @@ +# Tdarr Server Setup Example + +## Directory Structure +``` +~/container-data/tdarr/ +β”œβ”€β”€ docker-compose.yml +β”œβ”€β”€ stonefish-tdarr-plugins/ # Custom plugins +β”œβ”€β”€ tdarr/ +β”‚ β”œβ”€β”€ server/ # Local storage +β”‚ β”œβ”€β”€ configs/ +β”‚ └── logs/ +└── temp/ # Local temp if needed +``` + +## Storage Strategy + +### Local Storage (Fast Access) +- **Database**: SQLite requires local filesystem for WAL mode +- **Configs**: Frequently accessed during startup +- **Logs**: Regular writes during operation + +### Network Storage (Capacity) +- **Backups**: Infrequent access, large files +- **Media**: Read-only during transcoding +- **Cache**: Temporary transcoding files + +## Upgrade Process + +### Major Version Upgrades +1. **Backup current state** + ```bash + docker-compose down + cp docker-compose.yml docker-compose.yml.backup + ``` + +2. **For clean start** (recommended for major versions): + ```bash + # Remove old database + sudo rm -rf ./tdarr/server + mkdir -p ./tdarr/server + + # Pull latest image + docker-compose pull + + # Start fresh + docker-compose up -d + ``` + +3. 
**Monitor initialization** + ```bash + docker-compose logs -f + ``` + +## Common Issues + +### Disk Space +- Monitor local database growth +- Regular cleanup of old backups +- Use network storage for large static data + +### Permissions +- Container runs as PUID/PGID (usually 0/0) +- Ensure proper ownership of mounted directories +- Use `sudo rm -rf` for root-owned container files + +### Network Filesystem Issues +- SQLite incompatible with NFS/SMB for database +- Keep database local, only backups on network +- Monitor transcoding cache disk usage \ No newline at end of file diff --git a/tdarr/examples/tdarr-server-setup/docker-compose.yml b/tdarr/examples/tdarr-server-setup/docker-compose.yml new file mode 100644 index 0000000..4291d43 --- /dev/null +++ b/tdarr/examples/tdarr-server-setup/docker-compose.yml @@ -0,0 +1,37 @@ +version: "3.4" +services: + tdarr: + container_name: tdarr + image: ghcr.io/haveagitgat/tdarr:latest + restart: unless-stopped + network_mode: bridge + ports: + - 8265:8265 # webUI port + - 8266:8266 # server port + environment: + - TZ=America/Chicago + - PUID=0 + - PGID=0 + - UMASK_SET=002 + - serverIP=0.0.0.0 + - serverPort=8266 + - webUIPort=8265 + - internalNode=false # Disable for distributed setup + - inContainer=true + - ffmpegVersion=6 + - nodeName=docker-server + volumes: + # Plugin mounts (stonefish example) + - ./stonefish-tdarr-plugins/FlowPlugins/:/app/server/Tdarr/Plugins/FlowPlugins/ + - ./stonefish-tdarr-plugins/FlowPluginsTs/:/app/server/Tdarr/Plugins/FlowPluginsTs/ + - ./stonefish-tdarr-plugins/Community/:/app/server/Tdarr/Plugins/Community/ + + # Hybrid storage strategy + - ./tdarr/server:/app/server # Local: Database, configs, logs + - ./tdarr/configs:/app/configs + - ./tdarr/logs:/app/logs + - /mnt/truenas-share/tdarr/tdarr-server/Backups:/app/server/Tdarr/Backups # Network: Backups + + # Media and cache + - /mnt/truenas-share:/media + - /mnt/truenas-share/tdarr/tdarr-cache:/temp \ No newline at end of file diff --git a/tdarr/scripts/CONTEXT.md b/tdarr/scripts/CONTEXT.md new file mode 100644 index 0000000..864128b --- /dev/null +++ b/tdarr/scripts/CONTEXT.md @@ -0,0 +1,212 @@ +# Tdarr Scripts - Operational Context + +## Script Overview +This directory contains active operational scripts for Tdarr transcoding automation, gaming-aware scheduling, and system management. 
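+
+For a feel of the time-window logic these scripts implement, here is a minimal sketch (illustrative only; the production logic lives in `tdarr-cron-check-configurable.sh`) of testing whether the current hour falls inside an overnight block such as `22-07`:
+
+```bash
+# Illustrative sketch -- not the production script
+HOUR=$((10#$(date +%H)))   # force base-10 so "08"/"09" parse correctly
+START=22; END=07; IN_WINDOW=0
+if [ "$START" -le "$END" ]; then
+  # Same-day window, e.g. 09-17
+  [ "$HOUR" -ge "$START" ] && [ "$HOUR" -lt "$END" ] && IN_WINDOW=1
+else
+  # Overnight window wraps past midnight, e.g. 22-07
+  { [ "$HOUR" -ge "$START" ] || [ "$HOUR" -lt "$END" ]; } && IN_WINDOW=1
+fi
+echo "in window: $IN_WINDOW"
+```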
+ +## Core Scripts + +### Gaming-Aware Scheduler +**Primary Script**: `tdarr-schedule-manager.sh` +**Purpose**: Comprehensive management interface for gaming-aware Tdarr scheduling + +**Key Functions**: +- **Preset Management**: Quick schedule templates (night-only, work-safe, weekend-heavy, gaming-only) +- **Installation**: Automated cron job setup and configuration +- **Status Monitoring**: Real-time status and logging +- **Configuration**: Interactive schedule editing and validation + +**Usage Patterns**: +```bash +# Quick setup +./tdarr-schedule-manager.sh preset work-safe +./tdarr-schedule-manager.sh install + +# Monitoring +./tdarr-schedule-manager.sh status +./tdarr-schedule-manager.sh logs + +# Testing +./tdarr-schedule-manager.sh test +``` + +### Container Management +**Start Script**: `start-tdarr-gpu-podman-clean.sh` +**Purpose**: Launch unmapped Tdarr node with optimized configuration + +**Key Features**: +- **Unmapped Node Configuration**: Local cache for optimal performance +- **GPU Support**: Full NVIDIA device passthrough +- **Resource Optimization**: Direct NVMe cache mapping +- **Clean Architecture**: No media volume dependencies + +**Stop Script**: `stop-tdarr-gpu-podman.sh` +**Purpose**: Graceful container shutdown with cleanup + +### Scheduling Engine +**Core Engine**: `tdarr-cron-check-configurable.sh` +**Purpose**: Minute-by-minute decision engine for Tdarr state management + +**Decision Logic**: +1. **Gaming Detection**: Check for active gaming processes +2. **GPU Monitoring**: Verify GPU usage below threshold (15%) +3. **Time Window Validation**: Ensure current time within allowed schedule +4. **State Management**: Start/stop Tdarr based on conditions + +**Gaming Process Detection**: +- Steam, Lutris, Heroic Games Launcher +- Wine, Bottles (Windows compatibility layers) +- GameMode, MangoHUD (gaming utilities) +- GPU usage monitoring via nvidia-smi + +### Configuration Management +**Config File**: `tdarr-schedule.conf` +**Purpose**: Centralized configuration for scheduler behavior + +**Configuration Structure**: +```bash +# Time blocks: "HOUR_START-HOUR_END:DAYS" +SCHEDULE_BLOCKS="22-07:daily 09-17:1-5" + +# Gaming detection settings +GPU_THRESHOLD=15 +GAMING_PROCESSES="steam lutris heroic wine bottles gamemode mangohud" + +# Operational settings +LOG_FILE="/tmp/tdarr-scheduler.log" +CONTAINER_NAME="tdarr-node-gpu" +``` + +## Operational Patterns + +### Automated Maintenance +**Cron Integration**: Two automated systems running simultaneously +1. **Scheduler** (every minute): `tdarr-cron-check-configurable.sh` +2. 
**Cleanup** (every 6 hours): Temporary directory maintenance + +**Cleanup Automation**: +```bash +# Removes abandoned transcoding directories +0 */6 * * * find /tmp -name "tdarr-workDir2-*" -type d -mmin +360 -exec rm -rf {} \; 2>/dev/null || true +``` + +### Logging Strategy +**Log Location**: `/tmp/tdarr-scheduler.log` +**Log Format**: Timestamped entries with decision reasoning +**Log Rotation**: Manual cleanup, focused on recent activity + +**Log Examples**: +``` +[2025-08-13 14:30:01] Gaming detected (steam), stopping Tdarr +[2025-08-13 14:35:01] Gaming ended, but outside allowed hours (14:35 not in 22-07:daily) +[2025-08-13 22:00:01] Starting Tdarr (no gaming, within schedule) +``` + +### System Integration +**Gaming Detection**: Real-time process monitoring +**GPU Monitoring**: nvidia-smi integration for usage thresholds +**Container Management**: Podman-based lifecycle management +**Cron Integration**: Standard system scheduler for automation + +## Configuration Presets + +### Preset Profiles +**night-only**: `"22-07:daily"` - Overnight transcoding only +**work-safe**: `"22-07:daily 09-17:1-5"` - Nights + work hours +**weekend-heavy**: `"22-07:daily 09-17:1-5 08-20:6-7"` - Maximum time +**gaming-only**: No time limits, gaming detection only + +### Schedule Format Specification +**Format**: `"HOUR_START-HOUR_END:DAYS"` +**Examples**: +- `"22-07:daily"` - 10PM to 7AM every day (overnight) +- `"09-17:1-5"` - 9AM to 5PM Monday-Friday +- `"14-16:6,7"` - 2PM to 4PM Saturday and Sunday +- `"08-20:6-7"` - 8AM to 8PM weekends only + +## Container Architecture + +### Unmapped Node Configuration +**Architecture Choice**: Local cache with API-based transfers +**Benefits**: 3-5x performance improvement, reduced network dependency + +**Container Environment**: +```bash +-e nodeType=unmapped +-e unmappedNodeCache=/cache +-e enableGpu=true +-e TZ=America/New_York +``` + +**Volume Configuration**: +```bash +# Local high-speed cache (NVMe) +-v "/mnt/NV2/tdarr-cache:/cache" + +# Configuration persistence +-v "/mnt/NV2/tdarr-cache-clean:/temp" + +# No media volumes (unmapped mode uses API) +``` + +### Resource Management +**GPU Access**: Full NVIDIA device passthrough +**Memory**: Controlled by container limits +**CPU**: Shared with host system +**Storage**: Local NVMe for optimal I/O performance + +## Troubleshooting Context + +### Common Issues +1. **Gaming Not Detected**: Check process names in configuration +2. **Time Window Issues**: Verify schedule block format +3. **Container Start Failures**: Check GPU device access +4. 
**Log File Growth**: Manual cleanup of scheduler logs + +### Diagnostic Commands +```bash +# Test current conditions +./tdarr-schedule-manager.sh test + +# View real-time logs +./tdarr-schedule-manager.sh logs + +# Check container status +podman ps | grep tdarr + +# Verify GPU access +podman exec tdarr-node-gpu nvidia-smi +``` + +### Recovery Procedures +```bash +# Reset to defaults +./tdarr-schedule-manager.sh preset work-safe + +# Reinstall scheduler +./tdarr-schedule-manager.sh install + +# Manual container restart +./stop-tdarr-gpu-podman.sh +./start-tdarr-gpu-podman-clean.sh +``` + +## Integration Points + +### External Dependencies +- **Podman**: Container runtime for node management +- **nvidia-smi**: GPU monitoring and device access +- **cron**: System scheduler for automation +- **SSH**: Remote server access (monitoring scripts) + +### File System Dependencies +- **Cache Directory**: `/mnt/NV2/tdarr-cache` (local NVMe) +- **Temp Directory**: `/mnt/NV2/tdarr-cache-clean` (processing space) +- **Log Files**: `/tmp/tdarr-scheduler.log` (operational logs) +- **Configuration**: Local `tdarr-schedule.conf` file + +### Network Dependencies +- **Tdarr Server**: API communication for unmapped node operation +- **Discord Webhooks**: Optional notification integration (via monitoring) +- **NAS Access**: For final file storage (post-processing only) + +This operational context provides comprehensive guidance for managing active Tdarr automation scripts in production environments. \ No newline at end of file diff --git a/scripts/tdarr/README.md b/tdarr/scripts/README.md similarity index 100% rename from scripts/tdarr/README.md rename to tdarr/scripts/README.md diff --git a/scripts/tdarr/start-tdarr-gpu-podman-clean.sh b/tdarr/scripts/start-tdarr-gpu-podman-clean.sh similarity index 100% rename from scripts/tdarr/start-tdarr-gpu-podman-clean.sh rename to tdarr/scripts/start-tdarr-gpu-podman-clean.sh diff --git a/scripts/tdarr/stop-tdarr-gpu-podman.sh b/tdarr/scripts/stop-tdarr-gpu-podman.sh similarity index 100% rename from scripts/tdarr/stop-tdarr-gpu-podman.sh rename to tdarr/scripts/stop-tdarr-gpu-podman.sh diff --git a/scripts/tdarr/tdarr-cron-check-configurable.sh b/tdarr/scripts/tdarr-cron-check-configurable.sh similarity index 100% rename from scripts/tdarr/tdarr-cron-check-configurable.sh rename to tdarr/scripts/tdarr-cron-check-configurable.sh diff --git a/scripts/tdarr/tdarr-schedule-manager.sh b/tdarr/scripts/tdarr-schedule-manager.sh similarity index 100% rename from scripts/tdarr/tdarr-schedule-manager.sh rename to tdarr/scripts/tdarr-schedule-manager.sh diff --git a/scripts/tdarr/tdarr-schedule.conf b/tdarr/scripts/tdarr-schedule.conf similarity index 100% rename from scripts/tdarr/tdarr-schedule.conf rename to tdarr/scripts/tdarr-schedule.conf diff --git a/tdarr/troubleshooting.md b/tdarr/troubleshooting.md new file mode 100644 index 0000000..5b08c1e --- /dev/null +++ b/tdarr/troubleshooting.md @@ -0,0 +1,272 @@ +# Tdarr Troubleshooting Guide + +## forEach Error Resolution + +### Problem: TypeError: Cannot read properties of undefined (reading 'forEach') +**Symptoms**: Scanning phase fails at "Tagging video res" step, preventing all transcodes +**Root Cause**: Custom plugin mounts override community plugins with incompatible versions + +### Solution: Clean Plugin Installation +1. **Remove custom plugin mounts** from docker-compose.yml +2. **Force plugin regeneration**: + ```bash + ssh tdarr "docker restart tdarr" + podman restart tdarr-node-gpu + ``` +3. 
**Verify clean plugins**: Check for null-safety fixes `(streams || []).forEach()` + +### Plugin Safety Patterns +```javascript +// ❌ Unsafe - causes forEach errors +args.variables.ffmpegCommand.streams.forEach() + +// βœ… Safe - null-safe forEach +(args.variables.ffmpegCommand.streams || []).forEach() +``` + +## Staging Section Timeout Issues + +### Problem: Files removed from staging after 300 seconds +**Symptoms**: +- `.tmp` files stuck in work directories +- ENOTEMPTY errors during cleanup +- Subsequent jobs blocked + +### Solution: Automated Monitoring System +**Monitor Script**: `/mnt/NV2/Development/claude-home/scripts/monitoring/tdarr-timeout-monitor.sh` + +**Automatic Actions**: +- Detects staging timeouts every 20 minutes +- Removes stuck work directories +- Sends Discord notifications +- Logs all cleanup activities + +### Manual Cleanup Commands +```bash +# Check staging section +ssh tdarr "docker logs tdarr | tail -50" + +# Find stuck work directories +find /mnt/NV2/tdarr-cache -name "tdarr-workDir*" -type d + +# Force cleanup stuck directory +rm -rf /mnt/NV2/tdarr-cache/tdarr-workDir-[ID] +``` + +## System Stability Issues + +### Problem: Kernel crashes during intensive transcoding +**Root Cause**: CIFS network issues during large file streaming (mapped nodes) + +### Solution: Convert to Unmapped Node Architecture +1. **Enable unmapped nodes** in server Options +2. **Update node configuration**: + ```bash + # Add to container environment + -e nodeType=unmapped + -e unmappedNodeCache=/cache + + # Use local cache volume + -v "/mnt/NV2/tdarr-cache:/cache" + + # Remove media volume (no longer needed) + ``` +3. **Benefits**: Eliminates CIFS streaming, prevents kernel crashes + +### Container Resource Limits +```yaml +# Prevent memory exhaustion +deploy: + resources: + limits: + memory: 8G + cpus: '6' +``` + +## Gaming Detection Issues + +### Problem: Tdarr doesn't stop during gaming +**Check gaming detection**: +```bash +# Test current gaming detection +./tdarr-schedule-manager.sh test + +# View scheduler logs +tail -f /tmp/tdarr-scheduler.log + +# Verify GPU usage detection +nvidia-smi +``` + +### Gaming Process Detection +**Monitored Processes**: +- Steam, Lutris, Heroic Games Launcher +- Wine, Bottles (Windows compatibility) +- GameMode, MangoHUD (utilities) +- **GPU usage >15%** (configurable threshold) + +### Configuration Adjustments +```bash +# Edit gaming detection threshold +./tdarr-schedule-manager.sh edit + +# Apply preset configurations +./tdarr-schedule-manager.sh preset gaming-only # No time limits +./tdarr-schedule-manager.sh preset night-only # 10PM-7AM only +``` + +## Network and Access Issues + +### Server Connection Problems +**Server Access Commands**: +```bash +# SSH to Tdarr server +ssh tdarr + +# Check server status +ssh tdarr "docker ps | grep tdarr" + +# View server logs +ssh tdarr "docker logs tdarr" + +# Access server container +ssh tdarr "docker exec -it tdarr /bin/bash" +``` + +### Node Registration Issues +```bash +# Check node logs +podman logs tdarr-node-gpu + +# Verify node registration +# Look for "Node registered" in server logs +ssh tdarr "docker logs tdarr | grep -i node" + +# Test node connectivity +curl http://10.10.0.43:8265/api/v2/status +``` + +## Performance Issues + +### Slow Transcoding Performance +**Diagnosis**: +1. **Check cache location**: Should be local NVMe, not network +2. **Verify unmapped mode**: `nodeType=unmapped` in container +3. 
+ +**Expected Performance**: +- **Mapped nodes**: Constant SMB streaming (~100MB/s) +- **Unmapped nodes**: Download once β†’ Process locally β†’ Upload once + +### GPU Utilization Problems +```bash +# Monitor GPU usage during transcoding +watch nvidia-smi + +# Check GPU device access in container +podman exec tdarr-node-gpu nvidia-smi + +# Verify NVENC encoder availability +podman exec tdarr-node-gpu ffmpeg -encoders | grep nvenc +``` + +## Plugin System Issues + +### Plugin Loading Failures +**Troubleshooting Steps**: +1. **Check plugin directory**: Ensure no custom mounts override community plugins +2. **Verify dependencies**: FlowHelper files (`metadataUtils.js`, `letterboxUtils.js`) +3. **Test plugin syntax**: + ```bash + # Test plugin in Node.js + node -e "require('./path/to/plugin.js')" + ``` + +### Custom Plugin Integration +**Safe Integration Pattern**: +1. **Selective mounting**: Mount only specific required plugins +2. **Dependency verification**: Include all FlowHelper dependencies +3. **Version compatibility**: Ensure plugins match Tdarr version +4. **Null-safety checks**: Add `|| []` to forEach operations + +## Monitoring and Logging + +### Log Locations +```bash +# Scheduler logs +tail -f /tmp/tdarr-scheduler.log + +# Monitor logs +tail -f /tmp/tdarr-monitor/monitor.log + +# Server logs +ssh tdarr "docker logs tdarr" + +# Node logs +podman logs tdarr-node-gpu +``` + +### Discord Notification Issues +**Check webhook configuration**: +```bash +# Test Discord webhook +curl -X POST [WEBHOOK_URL] \ + -H "Content-Type: application/json" \ + -d '{"content": "Test message"}' +``` + +**Common Issues**: +- JSON escaping in message content +- Markdown formatting in Discord +- User ping placement (outside code blocks) + +## Emergency Recovery + +### Complete System Reset +```bash +# Stop all containers +podman stop tdarr-node-gpu +ssh tdarr "docker stop tdarr" + +# Clean cache directories +rm -rf /mnt/NV2/tdarr-cache/tdarr-workDir* + +# Remove scheduler +crontab -e  # Delete tdarr lines + +# Restart with clean configuration +./start-tdarr-gpu-podman-clean.sh +./tdarr-schedule-manager.sh preset work-safe +./tdarr-schedule-manager.sh install +``` + +### Data Recovery +**Important**: Tdarr processes files in place; original files remain untouched +- **Queue data**: Stored in server configuration (`/app/configs`) +- **Progress data**: Lost on container restart (unmapped nodes) +- **Cache files**: Safe to delete, will re-download + +## Common Error Patterns + +### "Copy failed" in Staging Section +**Cause**: Network timeout during file transfer to unmapped node +**Solution**: The monitoring system retries the transfer automatically + +### "ENOTEMPTY" Directory Cleanup Errors +**Cause**: Partial downloads leave files in work directories +**Solution**: Force-remove the directories; the monitoring system handles this automatically + +### Node Disconnection During Processing +**Cause**: Gaming detection or manual stop during active job +**Result**: File returns to queue automatically, safe to restart + +## Prevention Best Practices + +1. **Use unmapped node architecture** for stability +2. **Implement monitoring system** for automatic cleanup +3. **Configure gaming-aware scheduling** for desktop systems (see the sketch after this list) +4. **Set container resource limits** to prevent crashes +5. **Use clean plugin installation** to avoid forEach errors +6. **Monitor system resources** during intensive operations
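+ +A minimal sketch of the GPU-usage side of practice 3, checking utilization against the configurable 15% threshold (illustrative only, not the schedule manager's actual implementation): +```bash +# Treat the desktop as "gaming" when GPU utilization exceeds the threshold +util=$(nvidia-smi --query-gpu=utilization.gpu --format=csv,noheader,nounits | head -n1) +if [ "${util:-0}" -gt 15 ]; then +  echo "GPU at ${util}% - pausing Tdarr node" +fi +```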
+ +This troubleshooting guide covers the most common issues and their resolutions for production Tdarr deployments. \ No newline at end of file diff --git a/vm-management/CONTEXT.md b/vm-management/CONTEXT.md new file mode 100644 index 0000000..5be2f1e --- /dev/null +++ b/vm-management/CONTEXT.md @@ -0,0 +1,296 @@ +# Virtual Machine Management - Technology Context + +## Overview +Virtual machine management for home lab environments with a focus on automated provisioning, infrastructure as code, and security-first configuration. This context covers VM lifecycle management, Proxmox integration, and standardized deployment patterns. + +## Architecture Patterns + +### Infrastructure as Code (IaC) Approach +**Pattern**: Declarative VM configuration with repeatable deployments +```yaml +#cloud-config +# Cloud-init template pattern (the #cloud-config header must stay on the first line) +users: + - name: cal + groups: [sudo, docker] + ssh_authorized_keys: + - ssh-rsa AAAAB3... primary-key + - ssh-rsa AAAAB3... emergency-key +packages: + - docker.io + - docker-compose +runcmd: + - systemctl enable docker + - usermod -aG docker cal +``` + +### Template-Based Deployment Strategy +**Pattern**: Standardized VM templates with cloud-init automation +- **Base Templates**: Ubuntu Server with cloud-init support +- **Resource Allocation**: Standardized sizing (2CPU/4GB/20GB baseline) +- **Network Configuration**: Predefined VLAN assignments (10.10.0.x internal) +- **Security Hardening**: SSH keys only, password auth disabled + +## Provisioning Strategies + +### Cloud-Init Deployment (Recommended for New VMs) +**Purpose**: Fully automated VM provisioning from first boot +**Implementation**: +1. Create VM in Proxmox with cloud-init support +2. Apply standardized cloud-init template +3. VM configures itself automatically on first boot +4. No manual intervention required
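+ +On Proxmox, steps 1-2 can be scripted with `qm` (a sketch; the VMID, storage name, and cloud-image filename are assumptions, while `cloud-init-user-data.yaml` is this repo's template placed in the storage's snippets directory): +```bash +# Create the VM shell and import an Ubuntu cloud image as its disk +qm create 9001 --name ubuntu-ci --memory 4096 --cores 2 --net0 virtio,bridge=vmbr0 +qm importdisk 9001 jammy-server-cloudimg-amd64.img local-lvm +qm set 9001 --scsihw virtio-scsi-pci --scsi0 local-lvm:vm-9001-disk-0 + +# Attach the cloud-init drive and point it at the standardized template +qm set 9001 --ide2 local-lvm:cloudinit --boot order=scsi0 +qm set 9001 --cicustom "user=local:snippets/cloud-init-user-data.yaml" +qm start 9001 +```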
+ +**Benefits**: +- Zero-touch deployment +- Consistent configuration +- Security hardening from first boot +- Immediate productivity + +### Post-Install Scripting (Existing VMs) +**Purpose**: Standardize existing VM configurations +**Implementation**: +```bash +./vm-post-install.sh <vm-ip> [username] +# Automated: updates, SSH keys, Docker, hardening +``` + +**Use Cases**: +- Legacy VM standardization +- Imported VM configuration +- Recovery and remediation +- Incremental improvements + +## Security Architecture + +### SSH Key-Based Authentication +**Pattern**: Dual-key deployment for security and redundancy + +```bash +# Primary access key +~/.ssh/homelab_rsa           # Daily operations + +# Emergency access key +~/.ssh/emergency_homelab_rsa # Backup/recovery access +``` + +**Security Controls**: +- Password authentication completely disabled +- Root login prohibited +- SSH keys managed centrally +- Automatic key deployment + +### User Privilege Management +**Pattern**: Least privilege with sudo elevation +```yaml +# User configuration +username: cal +groups: [sudo, docker]        # Minimal required groups +shell: /bin/bash +sudo: ALL=(ALL) NOPASSWD:ALL  # Operational convenience +``` + +**Access Controls**: +- Non-root user accounts only +- Sudo required for administrative tasks +- Docker group for container management +- SSH key authentication mandatory + +### Network Security +**Pattern**: Network segmentation and access control +- **Internal Network**: 10.10.0.x/24 for VM communication +- **Management Access**: SSH (port 22) only +- **Service Isolation**: Application-specific port exposure +- **Firewall Ready**: iptables/ufw configuration prepared + +## Lifecycle Management Patterns + +### VM Creation Workflow +1. **Template Selection**: Choose appropriate base image +2. **Resource Allocation**: Size based on workload requirements +3. **Network Assignment**: VLAN and IP address planning +4. **Cloud-Init Configuration**: Apply standardized template +5. **Automated Provisioning**: Zero-touch deployment +6. **Verification**: Automated connectivity and configuration tests
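+ +Step 6 can be a one-liner once SSH keys are in place (a sketch; `<vm-ip>` is a placeholder): +```bash +# Wait for cloud-init to finish, then confirm Docker and passwordless sudo +ssh -o BatchMode=yes cal@<vm-ip> 'cloud-init status --wait && docker --version && sudo -n true && echo VERIFIED' +```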
+ +### Configuration Management +**Pattern**: Standardized system configuration +```yaml +# Essential packages +packages: [ +  "curl", "wget", "git", "vim", "htop", "unzip", +  "docker.io", "docker-compose-plugin" +] + +# System services +runcmd: + - systemctl enable docker + - systemctl enable ssh + - systemctl enable unattended-upgrades +``` + +### Maintenance Automation +**Pattern**: Automated updates and maintenance +- **Security Updates**: Automatic installation enabled +- **Package Management**: Standardized package selection +- **Service Management**: Consistent service configuration +- **Log Management**: Centralized logging ready + +## Resource Management + +### Sizing Standards +**Pattern**: Standardized VM resource allocation + +```yaml +# Basic workload (web services, small databases) +vcpus: 2 +memory: 4096   # 4GB +disk: 20       # 20GB + +# Medium workload (application servers, medium databases) +vcpus: 4 +memory: 8192   # 8GB +disk: 40       # 40GB + +# Heavy workload (transcoding, large databases) +vcpus: 6 +memory: 16384  # 16GB +disk: 100      # 100GB +``` + +### Storage Strategy +**Pattern**: Application-appropriate storage allocation +- **System Disk**: OS and applications (20-40GB) +- **Data Volumes**: Application data (variable) +- **Backup Storage**: Network-attached for persistence +- **Cache Storage**: Local fast storage for performance + +### Network Planning +**Pattern**: Structured network addressing +```yaml +# Network segments +management: 10.10.0.x/24   # VM management and SSH access +services: 10.10.1.x/24     # Application services +storage: 10.10.2.x/24      # Storage and backup traffic +dmz: 10.10.10.x/24         # External-facing services +``` + +## Monitoring and Operations + +### Health Monitoring +**Pattern**: Automated system health checks +```yaml +# Resource monitoring +cpu_usage: <80% +memory_usage: <90% +disk_usage: <85% +network_connectivity: verified + +# Service monitoring +ssh_service: active +docker_service: active +unattended_upgrades: active +``` + +### Backup Strategies +**Pattern**: Multi-tier backup approach +- **VM Snapshots**: Point-in-time recovery (Proxmox) +- **Application Data**: Specific application backup procedures +- **Configuration Backup**: Cloud-init templates and scripts +- **SSH Keys**: Centralized key management backup + +### Performance Tuning +**Pattern**: Workload-optimized configuration +```yaml +# CPU optimization +cpu_type: host          # Performance over compatibility +numa: enabled           # NUMA awareness for multi-socket + +# Memory optimization +ballooning: enabled     # Dynamic memory allocation +hugepages: disabled     # Unless specifically needed + +# Storage optimization +cache: writethrough     # Balance performance and safety +io_thread: enabled      # Improve I/O performance +``` + +## Integration Patterns + +### Container Platform Integration +**Pattern**: Docker-ready VM deployment +```yaml +# Automated Docker setup +- docker.io installation +- docker-compose plugin +- User added to docker group +- Service auto-start enabled +- Container runtime verified +``` + +### SSH Infrastructure Integration +**Pattern**: Centralized SSH key management (key push sketch below) +```yaml +# Key deployment automation +primary_key: ~/.ssh/homelab_rsa.pub +emergency_key: ~/.ssh/emergency_homelab_rsa.pub +backup_system: automated +rotation_policy: annual +``` + +### Network Services Integration +**Pattern**: Ready for service deployment +- **Reverse Proxy**: Nginx/Traefik ready configuration +- **DNS**: Local DNS registration prepared +- **Certificates**: Let's Encrypt integration ready +- **Monitoring**: Prometheus/Grafana agent ready
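+ +A minimal way to push both homelab keys to an existing VM (a sketch; `<vm-ip>` is a placeholder): +```bash +# Deploy primary and emergency public keys in one pass +ssh-copy-id -i ~/.ssh/homelab_rsa.pub cal@<vm-ip> +ssh-copy-id -i ~/.ssh/emergency_homelab_rsa.pub cal@<vm-ip> +```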
+ +## Common Implementation Workflows + +### New VM Deployment +1. **Create VM** in Proxmox with cloud-init support +2. **Configure resources** based on workload requirements +3. **Apply cloud-init template** with standardized configuration +4. **Start VM** and wait for automated provisioning +5. **Verify deployment** via SSH key authentication +6. **Deploy applications** using container or package management + +### Existing VM Standardization +1. **Assess current configuration** and identify gaps +2. **Run post-install script** for automated updates +3. **Verify SSH key deployment** and confirm password authentication is disabled +4. **Test Docker installation** and user permissions +5. **Update documentation** with new configuration +6. **Schedule regular maintenance** and monitoring + +### VM Migration and Recovery +1. **Create VM snapshot** before changes +2. **Export VM configuration** and cloud-init template +3. **Test recovery procedure** in staging environment +4. **Document recovery steps** and verification procedures +5. **Implement backup automation** for critical VMs + +## Best Practices + +### Security Hardening +1. **SSH Keys Only**: Disable password authentication completely +2. **Emergency Access**: Deploy backup SSH keys for recovery +3. **User Separation**: Non-root users with sudo privileges +4. **Automatic Updates**: Enable security update automation +5. **Network Isolation**: Use VLANs and firewall rules + +### Operational Excellence +1. **Infrastructure as Code**: Use cloud-init for reproducible deployments +2. **Standardization**: Consistent VM sizing and configuration +3. **Automation**: Minimize manual configuration steps +4. **Documentation**: Maintain deployment templates and procedures +5. **Testing**: Verify deployments before production use + +### Performance Optimization +1. **Resource Right-Sizing**: Match resources to workload requirements +2. **Storage Strategy**: Use appropriate storage tiers +3. **Network Optimization**: Plan network topology for performance +4. **Monitoring**: Implement resource usage monitoring +5. **Capacity Planning**: Plan for growth and scaling + +This technology context provides comprehensive guidance for implementing virtual machine management in home lab and production environments using modern IaC principles and security best practices.
\ No newline at end of file diff --git a/examples/vm-management/proxmox-automation.md b/vm-management/examples/proxmox-automation.md similarity index 100% rename from examples/vm-management/proxmox-automation.md rename to vm-management/examples/proxmox-automation.md diff --git a/reference/vm-management/troubleshooting.md b/vm-management/examples/troubleshooting.md similarity index 100% rename from reference/vm-management/troubleshooting.md rename to vm-management/examples/troubleshooting.md diff --git a/scripts/vm-management/README.md b/vm-management/scripts/README.md similarity index 100% rename from scripts/vm-management/README.md rename to vm-management/scripts/README.md diff --git a/scripts/vm-management/cloud-init-user-data.yaml b/vm-management/scripts/cloud-init-user-data.yaml similarity index 100% rename from scripts/vm-management/cloud-init-user-data.yaml rename to vm-management/scripts/cloud-init-user-data.yaml diff --git a/scripts/vm-management/vm-post-install.sh b/vm-management/scripts/vm-post-install.sh similarity index 100% rename from scripts/vm-management/vm-post-install.sh rename to vm-management/scripts/vm-post-install.sh diff --git a/vm-management/troubleshooting.md b/vm-management/troubleshooting.md new file mode 100644 index 0000000..8fa173c --- /dev/null +++ b/vm-management/troubleshooting.md @@ -0,0 +1,652 @@ +# Virtual Machine Management Troubleshooting Guide + +## VM Provisioning Issues + +### Cloud-Init Configuration Problems + +#### Cloud-Init Not Executing +**Symptoms**: +- VM starts but user accounts not created +- SSH keys not deployed +- Packages not installed +- Configuration not applied + +**Diagnosis**: +```bash +# Check cloud-init status and logs +ssh root@<vm-ip> 'cloud-init status --long' +ssh root@<vm-ip> 'cat /var/log/cloud-init.log' +ssh root@<vm-ip> 'cat /var/log/cloud-init-output.log' + +# Verify cloud-init configuration +ssh root@<vm-ip> 'cloud-init query userdata' + +# Check for YAML syntax errors +ssh root@<vm-ip> 'cloud-init devel schema --config-file /var/lib/cloud/instance/user-data.txt' +``` + +**Solutions**: +```bash +# Re-run cloud-init (CAUTION: may overwrite changes) +ssh root@<vm-ip> 'cloud-init clean --logs' +ssh root@<vm-ip> 'cloud-init init --local' +ssh root@<vm-ip> 'cloud-init init' +ssh root@<vm-ip> 'cloud-init modules --mode=config' +ssh root@<vm-ip> 'cloud-init modules --mode=final' + +# Manual user creation if cloud-init fails +ssh root@<vm-ip> 'useradd -m -s /bin/bash -G sudo,docker cal' +ssh root@<vm-ip> 'mkdir -p /home/cal/.ssh' +ssh root@<vm-ip> 'chown cal:cal /home/cal/.ssh' +ssh root@<vm-ip> 'chmod 700 /home/cal/.ssh' +``` + +#### Invalid Cloud-Init YAML +**Symptoms**: +- Cloud-init fails with syntax errors +- Parser errors in cloud-init logs +- Partial configuration application + +**Common YAML Issues**: +```yaml +# ❌ Incorrect indentation +users: +- name: cal +groups: [sudo, docker]  # Wrong indentation + +# βœ… Correct indentation +users: + - name: cal +   groups: [sudo, docker]  # Proper indentation + +# ❌ Missing quotes for special characters +ssh_authorized_keys: + - ssh-rsa AAAAB3NzaC1... user@host  # May fail with special chars + +# βœ… Quoted strings +ssh_authorized_keys: + - "ssh-rsa AAAAB3NzaC1... user@host" +```
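+ +Linting the user-data file before attaching it to a VM catches these errors early (a sketch; the subcommand location depends on the cloud-init version): +```bash +# Validate user-data against the cloud-init schema +cloud-init schema --config-file cloud-init-user-data.yaml        # newer releases +cloud-init devel schema --config-file cloud-init-user-data.yaml  # older releases +```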
user@host" +``` + +### VM Boot and Startup Issues + +#### VM Won't Start +**Symptoms**: +- VM fails to boot from Proxmox +- Kernel panic messages +- Boot loop or hanging + +**Diagnosis**: +```bash +# Check VM configuration +pvesh get /nodes/pve/qemu//config + +# Check resource allocation +pvesh get /nodes/pve/qemu//status/current + +# Review VM logs via Proxmox console +# Use Proxmox web interface -> VM -> Console + +# Check Proxmox host resources +pvesh get /nodes/pve/status +``` + +**Solutions**: +```bash +# Increase memory allocation +pvesh set /nodes/pve/qemu//config -memory 4096 + +# Reset CPU configuration +pvesh set /nodes/pve/qemu//config -cpu host -cores 2 + +# Check and repair disk +# Stop VM, then: +pvesh get /nodes/pve/qemu//config | grep scsi0 +# Use fsck on the disk image if needed +``` + +#### Resource Constraints +**Symptoms**: +- VM extremely slow performance +- Out-of-memory kills +- Disk I/O bottlenecks + +**Diagnosis**: +```bash +# Inside VM resource check +free -h +df -h +iostat 1 5 +vmstat 1 5 + +# Proxmox host resource check +pvesh get /nodes/pve/status +cat /proc/meminfo +df -h /var/lib/vz +``` + +**Solutions**: +```bash +# Increase VM resources via Proxmox +pvesh set /nodes/pve/qemu//config -memory 8192 +pvesh set /nodes/pve/qemu//config -cores 4 + +# Resize VM disk +# Proxmox GUI: Hardware -> Hard Disk -> Resize +# Then extend filesystem: +sudo growpart /dev/sda 1 +sudo resize2fs /dev/sda1 +``` + +## SSH Access Issues + +### SSH Connection Failures + +#### Cannot Connect to VM +**Symptoms**: +- Connection timeout +- Connection refused +- Host unreachable + +**Diagnosis**: +```bash +# Network connectivity tests +ping +traceroute + +# SSH service tests +nc -zv 22 +nmap -p 22 + +# From Proxmox console, check SSH service +systemctl status sshd +ss -tlnp | grep :22 +``` + +**Solutions**: +```bash +# Via Proxmox console - restart SSH +systemctl start sshd +systemctl enable sshd + +# Check and configure firewall +ufw status +# If blocking SSH: +ufw allow ssh +ufw allow 22/tcp + +# Network configuration reset +ip addr show +dhclient # For DHCP +systemctl restart networking +``` + +#### SSH Key Authentication Failures +**Symptoms**: +- Password prompts despite key installation +- "Permission denied (publickey)" +- "No more authentication methods" + +**Diagnosis**: +```bash +# Verbose SSH debugging +ssh -vvv cal@ + +# Check key files locally +ls -la ~/.ssh/homelab_rsa* +ls -la ~/.ssh/emergency_homelab_rsa* + +# Via console or password auth, check VM +ls -la ~/.ssh/ +cat ~/.ssh/authorized_keys +``` + +**Solutions**: +```bash +# Fix SSH directory permissions +chmod 700 ~/.ssh +chmod 600 ~/.ssh/authorized_keys +chown -R cal:cal ~/.ssh + +# Re-deploy SSH keys +cat > ~/.ssh/authorized_keys << 'EOF' +ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABgQC... # primary key +ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABgQD... 
# emergency key +EOF + +# Verify SSH server configuration +sudo sshd -T | grep -E "(passwordauth|pubkeyauth|permitroot)" +``` + +#### SSH Security Configuration Issues +**Symptoms**: +- Password authentication still enabled +- Root login allowed +- Insecure SSH settings + +**Diagnosis**: +```bash +# Check effective SSH configuration +sudo sshd -T | grep -E "(passwordauth|pubkeyauth|permitroot|allowusers)" + +# Review SSH config files +cat /etc/ssh/sshd_config +ls /etc/ssh/sshd_config.d/ +``` + +**Solutions**: +```bash +# Apply security hardening +sudo tee /etc/ssh/sshd_config.d/99-homelab-security.conf << 'EOF' +PasswordAuthentication no +PubkeyAuthentication yes +PermitRootLogin no +AllowUsers cal +Protocol 2 +ClientAliveInterval 300 +ClientAliveCountMax 2 +MaxAuthTries 3 +X11Forwarding no +EOF + +sudo systemctl restart sshd +``` + +## Docker Installation and Configuration Issues + +### Docker Installation Failures + +#### Package Installation Fails +**Symptoms**: +- Docker packages not found +- GPG key verification errors +- Repository access failures + +**Diagnosis**: +```bash +# Test internet connectivity +ping google.com +curl -I https://download.docker.com + +# Check repository configuration +cat /etc/apt/sources.list.d/docker.list +apt-cache policy docker-ce + +# Check for package conflicts +dpkg -l | grep docker +``` + +**Solutions**: +```bash +# Remove conflicting packages +sudo apt remove -y docker docker-engine docker.io containerd runc + +# Reinstall Docker repository +sudo mkdir -p /etc/apt/keyrings +curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o /etc/apt/keyrings/docker.gpg + +echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" | sudo tee /etc/apt/sources.list.d/docker.list + +# Install Docker +sudo apt update +sudo apt install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin +``` + +#### Docker Service Issues +**Symptoms**: +- Docker daemon won't start +- Socket connection errors +- Service failure on boot + +**Diagnosis**: +```bash +# Check service status +systemctl status docker +journalctl -u docker.service -f + +# Check system resources +df -h +free -h + +# Test daemon manually +sudo dockerd --debug +``` + +**Solutions**: +```bash +# Restart Docker service +sudo systemctl stop docker +sudo systemctl start docker +sudo systemctl enable docker + +# Clear corrupted Docker data +sudo systemctl stop docker +sudo rm -rf /var/lib/docker/tmp/* +sudo systemctl start docker + +# Reset Docker configuration +sudo mv /etc/docker/daemon.json /etc/docker/daemon.json.bak 2>/dev/null || true +sudo systemctl restart docker +``` + +### Docker Permission and Access Issues + +#### Permission Denied Errors +**Symptoms**: +- Must use sudo for Docker commands +- "Permission denied" when accessing Docker socket +- User not in docker group + +**Diagnosis**: +```bash +# Check user groups +groups +groups cal +getent group docker + +# Check Docker socket permissions +ls -la /var/run/docker.sock + +# Verify Docker service is running +systemctl status docker +``` + +**Solutions**: +```bash +# Add user to docker group +sudo usermod -aG docker cal + +# Create docker group if missing +sudo groupadd docker 2>/dev/null || true +sudo usermod -aG docker cal + +# Apply group membership (requires logout/login or): +newgrp docker + +# Fix socket permissions +sudo chown root:docker /var/run/docker.sock +sudo chmod 664 /var/run/docker.sock +``` + 
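+ +After the group fix takes effect (fresh login shell or `newgrp docker`), a quick verification (a sketch): +```bash +# Confirm membership and socket access without sudo +id -nG | grep -qw docker && echo "docker group: OK" +docker run --rm hello-world +```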
+## Network Configuration Problems + +### IP Address and Connectivity Issues + +#### Incorrect IP Configuration +**Symptoms**: +- VM has wrong IP address +- No network connectivity +- Cannot reach default gateway + +**Diagnosis**: +```bash +# Check network configuration +ip addr show +ip route show +cat /etc/netplan/*.yaml + +# Test connectivity +ping $(ip route | grep default | awk '{print $3}')  # Gateway +ping 8.8.8.8  # External connectivity +``` + +**Solutions**: +```bash +# Fix netplan configuration +sudo tee /etc/netplan/00-installer-config.yaml << 'EOF' +network: +  version: 2 +  ethernets: +    ens18: +      dhcp4: false +      addresses: [10.10.0.200/24] +      gateway4: 10.10.0.1 +      nameservers: +        addresses: [10.10.0.16, 8.8.8.8] +EOF + +# Apply network configuration +sudo netplan apply +``` + +#### DNS Resolution Problems +**Symptoms**: +- Cannot resolve domain names +- Package downloads fail +- Host lookup failures + +**Diagnosis**: +```bash +# Check DNS configuration +cat /etc/resolv.conf +systemd-resolve --status + +# Test DNS resolution +nslookup google.com +dig google.com @8.8.8.8 +``` + +**Solutions**: +```bash +# Fix DNS in netplan (see above example) +sudo netplan apply + +# Temporary DNS fix +echo "nameserver 8.8.8.8" | sudo tee /etc/resolv.conf + +# Restart DNS services +sudo systemctl restart systemd-resolved +sudo systemctl restart networking +``` + +## System Maintenance Issues + +### Package Management Problems + +#### Update Failures +**Symptoms**: +- apt update fails +- Repository signature errors +- Dependency conflicts + +**Diagnosis**: +```bash +# Check repository status +sudo apt update +apt-cache policy + +# Check disk space +df -h / +df -h /var + +# Check for held packages +apt-mark showhold +``` + +**Solutions**: +```bash +# Fix broken packages +sudo apt --fix-broken install +sudo dpkg --configure -a + +# Clean package cache +sudo apt clean +sudo apt autoclean +sudo apt autoremove + +# Reset problematic repositories +sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-keys <key-id> +sudo apt update +``` + +### Storage and Disk Space Issues + +#### Disk Space Exhaustion +**Symptoms**: +- Cannot install packages +- Docker operations fail +- System becomes unresponsive + +**Diagnosis**: +```bash +# Check disk usage +df -h +du -sh /home/* /var/* /opt/* 2>/dev/null + +# Find large files +find / -size +100M 2>/dev/null | head -20 +``` + +**Solutions**: +```bash +# Clean system files +sudo apt clean +sudo apt autoremove +sudo journalctl --vacuum-time=7d + +# Clean Docker data +docker system prune -a -f +docker volume prune -f + +# Extend disk (Proxmox GUI: Hardware -> Resize) +# Then extend filesystem: +sudo growpart /dev/sda 1 +sudo resize2fs /dev/sda1 +``` + +## Emergency Recovery Procedures + +### SSH Access Recovery + +#### Complete SSH Lockout +**Recovery Steps**: +1. **Use Proxmox console** for direct VM access +2. **Reset SSH configuration**: +   ```bash +   # Via console +   sudo cp /etc/ssh/sshd_config.backup /etc/ssh/sshd_config 2>/dev/null || true +   sudo systemctl restart sshd +   ``` +3. **Re-enable emergency access**: +   ```bash +   # Temporary password access for recovery +   sudo passwd cal +   sudo sed -i 's/PasswordAuthentication no/PasswordAuthentication yes/' /etc/ssh/sshd_config +   sudo systemctl restart sshd +   ```
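+ +Once key-based access works again, the temporary change above should be reverted (a sketch mirroring the hardening drop-in earlier in this guide): +```bash +# Re-disable password authentication and validate before restarting +sudo sed -i 's/PasswordAuthentication yes/PasswordAuthentication no/' /etc/ssh/sshd_config +sudo sshd -t && sudo systemctl restart sshd +```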
+ +#### Emergency SSH Key Deployment +**If primary keys fail**: +```bash +# Use emergency key +ssh -i ~/.ssh/emergency_homelab_rsa cal@<vm-ip> + +# Or deploy keys via console +mkdir -p ~/.ssh +chmod 700 ~/.ssh +cat > ~/.ssh/authorized_keys << 'EOF' +ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABgQC... # primary key +ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABgQD... # emergency key +EOF +chmod 600 ~/.ssh/authorized_keys +``` + +### VM Recovery and Rebuild + +#### Corrupt VM Recovery +**Steps**: +1. **Create VM snapshot** before attempting recovery +2. **Export VM data**: +   ```bash +   # Backup important data +   rsync -av cal@<vm-ip>:/home/cal/ ./vm-backup/ +   ``` +3. **Restore from template**: +   ```bash +   # Delete corrupt VM +   pvesh delete /nodes/pve/qemu/<vmid> + +   # Clone from template +   pvesh create /nodes/pve/qemu/<template-id>/clone -newid <new-vmid> -name <vm-name> +   ``` + +#### Post-Install Script Recovery +**If automation fails**: +```bash +# Run in debug mode +bash -x ./scripts/vm-management/vm-post-install.sh <vm-ip> + +# Manual step execution +ssh cal@<vm-ip> 'sudo apt update && sudo apt upgrade -y' +ssh cal@<vm-ip> 'curl -fsSL https://get.docker.com | sh' +ssh cal@<vm-ip> 'sudo usermod -aG docker cal' +``` + +## Prevention and Monitoring + +### Pre-Deployment Validation +```bash +# Verify prerequisites +ls -la ~/.ssh/homelab_rsa* +ls -la ~/.ssh/emergency_homelab_rsa* +ping 10.10.0.1 + +# Test cloud-init YAML +python3 -c "import yaml; yaml.safe_load(open('cloud-init-user-data.yaml'))" +``` + +### Health Monitoring Script +```bash +#!/bin/bash +# vm-health-check.sh +VM_IPS="10.10.0.200 10.10.0.201 10.10.0.202" + +for ip in $VM_IPS; do +  if ssh -o ConnectTimeout=5 -o BatchMode=yes cal@$ip 'uptime' >/dev/null 2>&1; then +    echo "βœ… $ip: SSH OK" +    # Check Docker +    if ssh cal@$ip 'docker info >/dev/null 2>&1'; then +      echo "βœ… $ip: Docker OK" +    else +      echo "❌ $ip: Docker FAILED" +    fi +  else +    echo "❌ $ip: SSH FAILED" +  fi +done +``` + +### Automated Backup +```bash +# Schedule in crontab: 0 2 * * * /path/to/vm-backup.sh +#!/bin/bash +for vm_ip in 10.10.0.{200..210}; do +  if ping -c1 $vm_ip >/dev/null 2>&1; then +    rsync -av --exclude='.cache' cal@$vm_ip:/home/cal/ ./backups/$vm_ip/ +  fi +done +``` + +## Quick Reference Commands + +### Essential VM Management +```bash +# VM control via Proxmox +pvesh get /nodes/pve/qemu/<vmid>/status/current +pvesh create /nodes/pve/qemu/<vmid>/status/start +pvesh create /nodes/pve/qemu/<vmid>/status/stop + +# SSH with alternative keys +ssh -i ~/.ssh/emergency_homelab_rsa cal@<vm-ip> + +# System health checks +free -h && df -h && systemctl status docker +docker system info && docker system df +``` + +### Recovery Resources +- **SSH Keys Backup**: `/mnt/NV2/ssh-keys/backup-*/` +- **Proxmox Console**: Direct VM access when SSH fails +- **Emergency Contact**: Use Discord notifications for critical issues + +This troubleshooting guide covers comprehensive recovery procedures for VM management issues in home lab environments. \ No newline at end of file