diff --git a/examples/vm-management/proxmox-automation.md b/examples/vm-management/proxmox-automation.md new file mode 100644 index 0000000..8e2731b --- /dev/null +++ b/examples/vm-management/proxmox-automation.md @@ -0,0 +1,347 @@ +# Proxmox VM Automation - Complete Examples + +Complete working examples for automated VM provisioning in Proxmox environments using cloud-init and post-installation scripts. + +## Overview + +This guide provides real-world examples for automating Ubuntu Server VM deployment with: +- SSH key-based authentication +- Docker environment setup +- Security hardening +- System updates and maintenance + +## Example 1: Cloud-Init VM Creation + +### Proxmox VM Setup +```bash +# Create VM with cloud-init support in Proxmox +# VM ID: 200, Name: homelab-docker-01, IP: 10.10.0.200 + +# 1. Create VM through Proxmox web interface or CLI +pvesh create /nodes/pve/qemu -vmid 200 -name homelab-docker-01 \ + -memory 2048 -cores 2 -sockets 1 -cpu cputype=host \ + -net0 virtio,bridge=vmbr0 \ + -ide2 local:cloudinit \ + -boot c -bootdisk scsi0 \ + -scsi0 local-lvm:10 \ + -ostype l26 +``` + +### Cloud-Init Configuration +Use the complete `cloud-init-user-data.yaml` template: + +```yaml +#cloud-config +hostname: homelab-docker-01 +timezone: America/New_York + +users: + - name: cal + groups: [sudo, docker] + shell: /bin/bash + sudo: ALL=(ALL) NOPASSWD:ALL + ssh_authorized_keys: + - ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAACAQDVVK02rOeeIw1e... homelab-cal@nobara-pc + - ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAACAQCPzqHDgdK9TlN4uzumBZKEOGt... emergency-homelab-cal@nobara-pc + +ssh_pwauth: false +disable_root: true +package_update: true +package_upgrade: true + +# Docker installation and configuration... +# (Full template available in scripts/vm-management/cloud-init-user-data.yaml) +``` + +### Deployment Process +1. **Create VM** with cloud-init drive attached +2. **Paste cloud-init config** into User Data field +3. 
**Set network** configuration (IP: 10.10.0.200/24, Gateway: 10.10.0.1) +4. **Start VM** - automatic provisioning begins +5. **Wait 3-5 minutes** for first boot provisioning +6. **Test connectivity**: `ssh cal@10.10.0.200` + +## Example 2: Post-Installation Script + +### Scenario: Existing VM Needs Provisioning +```bash +# VM already exists at 10.10.0.150 with password authentication +# Need to configure SSH keys, Docker, and security hardening + +# Run the post-installation script +cd /mnt/NV2/Development/claude-home +./scripts/vm-management/vm-post-install.sh 10.10.0.150 cal +``` + +### Script Execution Flow +``` +🔐 Starting VM provisioning for 10.10.0.150 as user cal +✅ System updates completed +✅ SSH key authentication successful +✅ SSH hardened (password authentication disabled) +✅ Docker and Docker Compose installed successfully +✅ User environment configured +🎉 VM provisioning completed successfully! + +=== PROVISIONING SUMMARY === +VM IP: 10.10.0.150 +User: cal +Hostname: homelab-vm-03 +Docker: Docker version 24.0.7, build afdd53b +``` + +## Example 3: Bulk VM Provisioning + +### Multiple VM Deployment Script +```bash +#!/bin/bash +# Bulk VM provisioning for home lab expansion +# Creates and configures multiple VMs with sequential IPs + +VM_BASE_ID=300 +VM_BASE_IP="10.10.0" +VM_COUNT=3 + +for i in $(seq 1 $VM_COUNT); do + VM_ID=$((VM_BASE_ID + i)) + VM_IP="${VM_BASE_IP}.$((200 + i))" + VM_NAME="homelab-worker-$(printf "%02d" $i)" + + echo "Creating VM: $VM_NAME (ID: $VM_ID, IP: $VM_IP)" + + # Create VM in Proxmox (adapt to your environment) + pvesh create /nodes/pve/qemu -vmid $VM_ID -name $VM_NAME \ + -memory 2048 -cores 2 \ + -net0 virtio,bridge=vmbr0 \ + -ide2 local:cloudinit \ + -scsi0 local-lvm:20 + + # Set cloud-init configuration + pvesh set /nodes/pve/qemu/$VM_ID/config \ + -ipconfig0 ip=$VM_IP/24,gw=10.10.0.1 \ + -nameserver 10.10.0.16 \ + -user cal \ + -sshkeys /path/to/encoded/ssh-keys + + # Start VM + pvesh create 
/nodes/pve/qemu/$VM_ID/status/start + + echo "VM $VM_NAME created and starting..." + sleep 30 # Wait for boot +done + +echo "Bulk provisioning completed. VMs available at:" +for i in $(seq 1 $VM_COUNT); do + echo " homelab-worker-$(printf "%02d" $i): ${VM_BASE_IP}.$((200 + i))" +done +``` + +## Example 4: Docker Service Deployment + +### Post-Provisioning Service Setup +After VM provisioning, deploy services using Docker Compose: + +```bash +# Connect to newly provisioned VM +ssh cal@10.10.0.200 + +# Create service directory +mkdir -p ~/services/nginx-proxy +cd ~/services/nginx-proxy + +# Create docker-compose.yml +cat > docker-compose.yml << 'EOF' +version: '3.8' +services: + nginx: + image: nginx:alpine + ports: + - "80:80" + - "443:443" + volumes: + - ./config:/etc/nginx/conf.d:ro + - ./ssl:/etc/nginx/ssl:ro + restart: unless-stopped + + portainer: + image: portainer/portainer-ce:latest + ports: + - "9000:9000" + volumes: + - /var/run/docker.sock:/var/run/docker.sock + - portainer_data:/data + restart: unless-stopped + +volumes: + portainer_data: +EOF + +# Deploy services +docker compose up -d +docker compose logs -f +``` + +## Example 5: VM Template Creation + +### Creating Reusable VM Templates +```bash +# 1. Create and fully configure a base VM +./scripts/vm-management/vm-post-install.sh 10.10.0.199 cal + +# 2. Clean up the VM for template use +ssh cal@10.10.0.199 ' + # Clear bash history + history -c && history -w + + # Clear logs + sudo truncate -s 0 /var/log/*log + sudo truncate -s 0 /var/log/syslog + + # Clear SSH host keys (regenerated on first boot) + sudo rm -f /etc/ssh/ssh_host_* + + # Clear machine ID + sudo truncate -s 0 /etc/machine-id + + # Clear network configuration + sudo rm -f /etc/udev/rules.d/70-persistent-net.rules + + # Shutdown + sudo shutdown -h now +' + +# 3. Convert VM to template in Proxmox +pvesh create /nodes/pve/qemu/199/template + +# 4. 
Clone template to create new VMs +pvesh create /nodes/pve/qemu/199/clone -newid 201 -name homelab-from-template +``` + +## Example 6: Monitoring and Verification + +### Post-Deployment Health Checks +```bash +#!/bin/bash +# VM health check script +# Usage: ./vm-health-check.sh + +VM_IP="$1" + +echo "🔍 VM Health Check: $VM_IP" + +# Test SSH connectivity +if ssh -o ConnectTimeout=5 cal@$VM_IP 'echo "SSH OK"' >/dev/null 2>&1; then + echo "✅ SSH: Connected" +else + echo "❌ SSH: Failed" + exit 1 +fi + +# Check system info +INFO=$(ssh cal@$VM_IP ' + echo "Hostname: $(hostname)" + echo "Uptime: $(uptime -p)" + echo "Memory: $(free -h | grep Mem | awk "{print \$3\"/\"\$2}")" + echo "Disk: $(df -h / | tail -1 | awk "{print \$3\"/\"\$2\" (\"\$5\" used)\"}")" +') + +echo "$INFO" + +# Test Docker +if ssh cal@$VM_IP 'docker --version && docker compose version' >/dev/null 2>&1; then + echo "✅ Docker: Installed and working" +else + echo "❌ Docker: Not working" +fi + +# Test Docker permissions +if ssh cal@$VM_IP 'docker run --rm hello-world' >/dev/null 2>&1; then + echo "✅ Docker: User permissions OK" +else + echo "❌ Docker: User permissions issue" +fi + +echo "🎉 Health check completed" +``` + +## Common Workflows + +### 1. New Development Environment +```bash +# Create VM with cloud-init +# Wait for provisioning +# Deploy development stack +ssh cal@10.10.0.201 ' + mkdir -p ~/dev + cd ~/dev + + # Clone your project + git clone https://github.com/your-org/project.git + cd project + + # Start development environment + docker compose -f docker-compose.dev.yml up -d +' +``` + +### 2. Service Migration +```bash +# Provision new VM +./scripts/vm-management/vm-post-install.sh 10.10.0.202 cal + +# Copy service configuration from existing server +scp -r cal@10.10.0.124:~/services/app-name cal@10.10.0.202:~/services/ + +# Start service on new VM +ssh cal@10.10.0.202 'cd ~/services/app-name && docker compose up -d' +``` + +### 3. 
Testing Environment +```bash +# Quick test environment with cloud-init +# Use minimal resources: 1GB RAM, 1 CPU, 10GB disk +# Deploy test configuration +# Run automated tests +# Destroy when done +``` + +## Troubleshooting Examples + +### VM Won't Start After Cloud-Init +```bash +# Check cloud-init logs +ssh cal@<vm-ip> 'sudo cloud-init status --long' +ssh cal@<vm-ip> 'sudo cat /var/log/cloud-init-output.log' + +# Manual cloud-init re-run if needed +ssh cal@<vm-ip> 'sudo cloud-init clean && sudo cloud-init init' +``` + +### SSH Key Issues +```bash +# Verify keys are properly installed +ssh cal@<vm-ip> 'cat ~/.ssh/authorized_keys | wc -l' # Should show 2 keys + +# Test specific key +ssh -i ~/.ssh/homelab_rsa cal@<vm-ip> 'echo "Primary key works"' +ssh -i ~/.ssh/emergency_homelab_rsa cal@<vm-ip> 'echo "Emergency key works"' +``` + +### Docker Permission Problems +```bash +# Check user groups +ssh cal@<vm-ip> 'groups' # Should include 'docker' + +# Re-login to apply group membership +ssh cal@<vm-ip> 'newgrp docker' + +# Test Docker access +ssh cal@<vm-ip> 'docker ps' +``` + +## Related Documentation + +- **Scripts**: `scripts/vm-management/README.md` - Implementation details +- **Patterns**: `patterns/vm-management/README.md` - Architectural guidance +- **SSH Setup**: `examples/networking/ssh-homelab-setup.md` - Key management +- **Docker**: `examples/docker/` - Container deployment patterns \ No newline at end of file diff --git a/patterns/vm-management/README.md b/patterns/vm-management/README.md index fcd1995..884cd02 100644 --- a/patterns/vm-management/README.md +++ b/patterns/vm-management/README.md @@ -1,14 +1,28 @@ # Virtual Machine Management Patterns -## VM Provisioning -- **Template-based deployment** for consistency +## Automated Provisioning +- **Cloud-init deployment** - Fully automated VM provisioning from first boot +- **Post-install scripts** - Standardized configuration for existing VMs +- **SSH key management** - Automated key deployment with emergency backup +- **Security hardening** - Password auth disabled, 
firewall configured + +## VM Provisioning Strategies + +### Template-Based Deployment +- **Ubuntu Server templates** optimized for home lab environments - **Resource allocation** sizing and planning -- **Network configuration** and VLAN assignment +- **Network configuration** and VLAN assignment (10.10.0.x networks) - **Storage provisioning** and disk management +### Infrastructure as Code +- **Cloud-init templates** for repeatable VM creation +- **Bash provisioning scripts** for existing infrastructure +- **SSH key integration** with existing homelab key management +- **Docker environment** setup with user permissions + ## Lifecycle Management - **Automated provisioning** with infrastructure as code -- **Configuration management** with Ansible/Puppet +- **Configuration management** with standardized scripts - **Snapshot management** and rollback strategies - **Scaling policies** for resource optimization @@ -24,9 +38,29 @@ - **High availability** configurations - **Migration strategies** between hosts +## Implementation Workflows + +### New VM Creation (Recommended) +1. **Create VM in Proxmox** with cloud-init support +2. **Apply cloud-init template** (`scripts/vm-management/cloud-init-user-data.yaml`) +3. **Start VM** - fully automated provisioning +4. **Verify setup** via SSH key authentication + +### Existing VM Configuration +1. **Run post-install script** (`scripts/vm-management/vm-post-install.sh <vm-ip> <user>`) +2. **Automated provisioning** handles updates, SSH keys, Docker +3. **Security hardening** applied automatically +4. 
**Test connectivity** and verify Docker installation + +## Security Architecture +- **SSH key-based authentication** only (passwords disabled) +- **Emergency key backup** for failover access +- **User privilege separation** (sudo required, docker group) +- **Automatic security updates** configured +- **Network isolation** ready (10.10.0.x internal network) + ## Related Documentation -- Examples: `/examples/vm-management/proxmox-automation.md` -- Examples: `/examples/vm-management/ansible-provisioning.md` -- Examples: `/examples/vm-management/backup-strategies.md` -- Reference: `/reference/vm-management/troubleshooting.md` -- Reference: `/reference/vm-management/performance.md` \ No newline at end of file +- **Implementation**: `scripts/vm-management/README.md` - Complete setup guides +- **SSH Keys**: `patterns/networking/ssh-key-management.md` - Key lifecycle management +- **Examples**: `examples/networking/ssh-homelab-setup.md` - SSH integration patterns +- **Reference**: `reference/vm-management/troubleshooting.md` - Common issues and solutions \ No newline at end of file diff --git a/reference/docker/tdarr-monitoring-configuration.md b/reference/docker/tdarr-monitoring-configuration.md index 647da73..c276812 100644 --- a/reference/docker/tdarr-monitoring-configuration.md +++ b/reference/docker/tdarr-monitoring-configuration.md @@ -262,21 +262,131 @@ find /tmp/tdarr-monitor/ -name "*.log" -mtime +7 -delete - **Webhook Rotation**: Update webhook URL if Discord server changes - **Threshold Tuning**: Adjust 15-minute interval based on operational experience +## API-Based Monitoring Enhancement + +### Tdarr API Monitoring Script +**Location**: `tdarr_monitor.py` - Comprehensive API-based monitoring client +**Server**: http://10.10.0.43:8265 (main Tdarr server) +**Dependencies**: Python 3 with `requests` library + +#### Key Features +- **Server Health Monitoring**: Version, uptime, connectivity status +- **Queue Management**: Processing statistics, queue depth, item 
details +- **Node Status Tracking**: Online/offline nodes, worker counts, active jobs +- **Library Scan Progress**: File counts, scan status, completion percentage +- **Overall Statistics**: Transcoding metrics, space saved, processing speeds +- **Comprehensive Health Checks**: Multi-component status assessment + +#### API Endpoints Monitored +- `/api/v2/get-server-info` - Server version, system info, uptime +- `/api/v2/get-queue` - Current queue status and processing items +- `/api/v2/get-nodes` - Connected nodes and their status +- `/api/v2/get-libraries` - Library scan progress and file counts +- `/api/v2/get-stats` - Overall transcoding statistics and metrics + +#### Usage Examples +```bash +# Quick health check +python3 tdarr_monitor.py --server http://10.10.0.43:8265 --check health + +# Queue status monitoring +python3 tdarr_monitor.py --server http://10.10.0.43:8265 --check queue --output json + +# Node performance check +python3 tdarr_monitor.py --server http://10.10.0.43:8265 --check nodes --verbose + +# Complete system status +python3 tdarr_monitor.py --server http://10.10.0.43:8265 --check all +``` + +#### Integration with Existing Discord Monitoring +```bash +#!/bin/bash +# Enhanced health monitoring with API integration +HEALTH_OUTPUT=$(python3 tdarr_monitor.py --server http://10.10.0.43:8265 --check health --output json) +STATUS=$(echo "$HEALTH_OUTPUT" | jq -r '.overall_status') + +if [ "$STATUS" != "healthy" ]; then + # Extract specific issues + SERVER_HEALTHY=$(echo "$HEALTH_OUTPUT" | jq -r '.checks.server.healthy') + NODE_COUNT=$(echo "$HEALTH_OUTPUT" | jq -r '.checks.nodes.online_count') + QUEUE_HEALTHY=$(echo "$HEALTH_OUTPUT" | jq -r '.checks.queue.healthy') + + MESSAGE="🎬 **Tdarr API Health Alert**\n" + MESSAGE+="Overall Status: **$STATUS**\n\n" + + if [ "$SERVER_HEALTHY" != "true" ]; then + MESSAGE+="❌ Server: Offline or unreachable\n" + fi + + if [ "$NODE_COUNT" == "0" ]; then + MESSAGE+="❌ Nodes: No online transcoding nodes\n" + fi + + if [ 
"$QUEUE_HEALTHY" != "true" ]; then + MESSAGE+="❌ Queue: Unable to access queue data\n" + fi + + MESSAGE+="\nCheck server status and node connectivity." + + # Send via existing Discord webhook + send_discord_message "$MESSAGE" +fi +``` + +#### Cron-based API Monitoring +```bash +# API health check every 5 minutes (complement log-based monitoring) +*/5 * * * * /path/to/tdarr_monitor.py --server http://10.10.0.43:8265 --check health >> /tmp/tdarr-api-health.log 2>&1 + +# Full status check hourly for detailed metrics +0 * * * * /path/to/tdarr_monitor.py --server http://10.10.0.43:8265 --check all --output json > /tmp/tdarr-status.json +``` + +#### Gaming Scheduler Integration +```bash +# Before starting transcoding, verify server health via API +if python3 tdarr_monitor.py --server http://10.10.0.43:8265 --check health; then + echo "Tdarr server healthy, proceeding with scheduled start" + ./start-tdarr-gpu-podman-clean.sh +else + echo "Tdarr server unhealthy, skipping scheduled start" + # Send alert via existing Discord system + send_discord_message "🎮 **Gaming Scheduler Alert**: Tdarr server unhealthy, skipping transcoding session" +fi +``` + +#### Monitoring Metrics Available +- **Server Metrics**: Uptime, version, system info, connectivity +- **Queue Metrics**: Total items, processing count, queued count, completed count +- **Node Metrics**: Online/offline status, worker counts, active jobs, heartbeat +- **Library Metrics**: Total files, scan progress, library count, scan status +- **Performance Metrics**: Total transcodes, space saved, processing speed, error rates + ## Integration with Other Systems +### Current Monitoring Stack +- **Log-based Monitoring**: Timeout and worker stall detection (15-minute polling) +- **API-based Monitoring**: Real-time health and performance metrics (5-minute polling) +- **Discord Integration**: Unified alerting for both monitoring methods +- **Gaming Scheduler**: API health checks before transcoding sessions + ### Future 
Enhancements -- **Grafana Integration**: Add metrics collection for dashboard visualization -- **Prometheus Metrics**: Export timing and error rate metrics -- **Home Assistant**: Integrate with home automation for additional alerting -- **Email Backup**: Secondary notification method for critical alerts +- **Grafana Integration**: Visualize API metrics (queue depth, processing rates, node performance) +- **Prometheus Metrics**: Export API data for time-series analysis +- **Home Assistant**: Integrate server status with home automation +- **Email Backup**: Secondary notification method for critical API alerts +- **Metric Correlation**: Combine log-based alerts with API performance data ### Related Documentation - [NAS Mount Configuration](../networking/nas-mount-configuration.md) - SMB optimization context -- [Tdarr Troubleshooting](tdarr-troubleshooting.md) - Worker timeout background +- [Tdarr Troubleshooting](tdarr-troubleshooting.md) - Worker timeout background - [SSH Key Management](../networking/ssh-key-management.md) - Server access setup +- [Tdarr Gaming Scheduler](../../scripts/tdarr/README.md) - Gaming-aware automation --- -**Status**: ✅ Active and Configured -**Last Updated**: August 10, 2025 -**Next Review**: September 10, 2025 -**Discord Channel**: Homelab monitoring alerts configured and tested \ No newline at end of file +**Status**: ✅ Log-based monitoring active, API monitoring script created +**Last Updated**: August 12, 2025 +**Next Review**: September 12, 2025 +**Discord Channel**: Homelab monitoring alerts configured and tested +**API Endpoint**: http://10.10.0.43:8265 (verified accessible) \ No newline at end of file diff --git a/reference/vm-management/troubleshooting.md b/reference/vm-management/troubleshooting.md new file mode 100644 index 0000000..c5a6356 --- /dev/null +++ b/reference/vm-management/troubleshooting.md @@ -0,0 +1,530 @@ +# VM Management Troubleshooting Guide + +Complete troubleshooting guide for Proxmox VM provisioning, SSH 
connectivity, Docker installation, and common configuration issues. + +## Common Issues and Solutions + +### 1. VM Provisioning Failures + +#### Cloud-Init Not Working +**Symptoms:** +- VM starts but cloud-init configuration not applied +- User account not created +- SSH keys not installed + +**Diagnosis:** +```bash +# Check cloud-init status +ssh root@ 'cloud-init status --long' + +# View cloud-init logs +ssh root@ 'cat /var/log/cloud-init.log' +ssh root@ 'cat /var/log/cloud-init-output.log' + +# Check cloud-init configuration +ssh root@ 'cloud-init query userdata' +``` + +**Solutions:** +```bash +# Re-run cloud-init (if safe to do so) +ssh root@ 'cloud-init clean --logs' +ssh root@ 'cloud-init init --local' +ssh root@ 'cloud-init init' +ssh root@ 'cloud-init modules --mode=config' +ssh root@ 'cloud-init modules --mode=final' + +# Force user creation if missing +ssh root@ 'useradd -m -s /bin/bash -G sudo,docker cal' + +# Fix YAML syntax in cloud-init if needed +# Common issues: incorrect indentation, missing quotes +``` + +#### VM Won't Start +**Symptoms:** +- VM fails to boot +- Kernel panic or boot errors +- Hangs during startup + +**Diagnosis:** +```bash +# Check VM configuration in Proxmox +pvesh get /nodes/pve/qemu//config + +# View console output +# Use Proxmox web interface Console tab + +# Check VM resource allocation +pvesh get /nodes/pve/qemu//status/current +``` + +**Solutions:** +```bash +# Increase memory if low +pvesh set /nodes/pve/qemu//config -memory 2048 + +# Check disk space and format +pvesh get /nodes/pve/storage + +# Reset to safe configuration +pvesh set /nodes/pve/qemu//config -cpu host -cores 2 +``` + +### 2. 
SSH Connection Issues + +#### Cannot Connect to VM +**Symptoms:** +- Connection timeout +- Connection refused +- Host unreachable + +**Diagnosis:** +```bash +# Test network connectivity +ping + +# Check SSH port +nc -zv 22 +nmap -p 22 + +# Check from Proxmox console +# Use Proxmox web interface -> VM -> Console +systemctl status sshd +netstat -tlnp | grep :22 +``` + +**Solutions:** +```bash +# Start SSH service (via console) +systemctl start sshd +systemctl enable sshd + +# Check firewall (via console) +ufw status +# If active and blocking SSH: +ufw allow ssh + +# Reset network configuration +ip addr show +dhclient # If using DHCP +systemctl restart networking +``` + +#### SSH Key Authentication Fails +**Symptoms:** +- Password prompts despite keys being installed +- Permission denied (publickey) +- "No more authentication methods to try" + +**Diagnosis:** +```bash +# Verbose SSH connection +ssh -vvv cal@ + +# Check authorized_keys file (via console or password auth) +ls -la ~/.ssh/ +cat ~/.ssh/authorized_keys +``` + +**Solutions:** +```bash +# Fix file permissions +chmod 700 ~/.ssh +chmod 600 ~/.ssh/authorized_keys + +# Verify key content +cat ~/.ssh/authorized_keys | wc -l # Should show 2 keys + +# Re-deploy keys manually +cat ~/.ssh/homelab_rsa.pub >> ~/.ssh/authorized_keys +cat ~/.ssh/emergency_homelab_rsa.pub >> ~/.ssh/authorized_keys + +# Check SSH configuration +sudo grep -E "(PubkeyAuth|PasswordAuth)" /etc/ssh/sshd_config +sudo systemctl restart sshd +``` + +#### SSH Configuration Problems +**Symptoms:** +- SSH works but with wrong settings +- Root access when it should be disabled +- Password authentication enabled + +**Diagnosis:** +```bash +# Check effective SSH configuration +sudo sshd -T | grep -E "(passwordauth|pubkeyauth|permitroot)" + +# View SSH configuration files +cat /etc/ssh/sshd_config +ls /etc/ssh/sshd_config.d/ +``` + +**Solutions:** +```bash +# Apply security hardening manually +sudo tee /etc/ssh/sshd_config.d/99-homelab-security.conf << 
'EOF' +PasswordAuthentication no +PubkeyAuthentication yes +PermitRootLogin no +AllowUsers cal +Protocol 2 +ClientAliveInterval 300 +ClientAliveCountMax 2 +MaxAuthTries 3 +EOF + +sudo systemctl restart sshd +``` + +### 3. Docker Installation Issues + +#### Docker Installation Fails +**Symptoms:** +- Docker packages not found +- GPG key verification fails +- Permission denied errors + +**Diagnosis:** +```bash +# Check internet connectivity +ping google.com +curl -I https://download.docker.com + +# Check repository configuration +cat /etc/apt/sources.list.d/docker.list +apt-cache policy docker-ce + +# Check for conflicting packages +dpkg -l | grep docker +``` + +**Solutions:** +```bash +# Remove conflicting packages +sudo apt remove -y docker docker-engine docker.io containerd runc + +# Re-add Docker repository +sudo mkdir -p /etc/apt/keyrings +curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o /etc/apt/keyrings/docker.gpg +echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" | sudo tee /etc/apt/sources.list.d/docker.list + +# Install Docker +sudo apt update +sudo apt install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin +``` + +#### Docker Permission Issues +**Symptoms:** +- "Permission denied" when running docker commands +- Must use sudo for docker commands +- User not in docker group + +**Diagnosis:** +```bash +# Check user groups +groups +groups cal + +# Check docker group exists +getent group docker + +# Check docker service +systemctl status docker +``` + +**Solutions:** +```bash +# Add user to docker group +sudo usermod -aG docker cal + +# Create docker group if missing +sudo groupadd docker +sudo usermod -aG docker cal + +# Apply group membership (logout/login or) +newgrp docker + +# Fix socket permissions +sudo chown root:docker /var/run/docker.sock +sudo chmod 664 /var/run/docker.sock +``` + 
+#### Docker Service Won't Start +**Symptoms:** +- Docker daemon not running +- Socket connection errors +- systemctl shows failed status + +**Diagnosis:** +```bash +# Check service status +systemctl status docker +journalctl -u docker.service -f + +# Check daemon logs +sudo dockerd --debug + +# Check system resources +df -h +free -h +``` + +**Solutions:** +```bash +# Restart Docker service +sudo systemctl restart docker +sudo systemctl enable docker + +# Clear Docker data if corrupted +sudo systemctl stop docker +sudo rm -rf /var/lib/docker/tmp/* +sudo systemctl start docker + +# Reset Docker configuration +sudo systemctl stop docker +sudo mv /etc/docker/daemon.json /etc/docker/daemon.json.bak +sudo systemctl start docker +``` + +### 4. System Update Issues + +#### Package Update Failures +**Symptoms:** +- apt update fails +- Repository errors +- Dependency conflicts + +**Diagnosis:** +```bash +# Check repository status +sudo apt update +cat /etc/apt/sources.list +ls /etc/apt/sources.list.d/ + +# Check disk space +df -h / +df -h /var +``` + +**Solutions:** +```bash +# Fix broken packages +sudo apt --fix-broken install +sudo dpkg --configure -a + +# Clean package cache +sudo apt clean +sudo apt autoclean +sudo apt autoremove + +# Reset sources if needed +sudo cp /etc/apt/sources.list /etc/apt/sources.list.backup +# Manually edit to use main Ubuntu repositories +``` + +### 5. 
Network Configuration Problems + +#### IP Configuration Issues +**Symptoms:** +- VM has wrong IP address +- No network connectivity +- DNS resolution fails + +**Diagnosis:** +```bash +# Check network configuration +ip addr show +ip route show +cat /etc/netplan/*.yaml + +# Test connectivity +ping 10.10.0.1 # Gateway +ping 8.8.8.8 # External DNS +nslookup google.com +``` + +**Solutions:** +```bash +# Fix netplan configuration +sudo nano /etc/netplan/00-installer-config.yaml + +# Example correct configuration: +network: + version: 2 + ethernets: + ens18: + dhcp4: false + addresses: [10.10.0.200/24] + gateway4: 10.10.0.1 + nameservers: + addresses: [10.10.0.16, 8.8.8.8] + +# Apply configuration +sudo netplan apply +``` + +#### DNS Resolution Problems +**Symptoms:** +- Cannot resolve domain names +- Package installation fails +- Hostname lookups fail + +**Diagnosis:** +```bash +# Check DNS configuration +cat /etc/resolv.conf +systemd-resolve --status + +# Test DNS +nslookup google.com +dig google.com +``` + +**Solutions:** +```bash +# Fix DNS in netplan +sudo nano /etc/netplan/00-installer-config.yaml +# Add nameservers section as shown above + +# Temporary DNS fix +echo "nameserver 8.8.8.8" | sudo tee /etc/resolv.conf + +# Restart networking +sudo netplan apply +sudo systemctl restart systemd-resolved +``` + +### 6. 
Storage and Disk Issues + +#### Disk Space Problems +**Symptoms:** +- VM runs out of disk space +- Cannot install packages +- Docker images won't download + +**Diagnosis:** +```bash +# Check disk usage +df -h +du -sh /home/* +du -sh /var/* + +# Check for large files +find / -size +100M 2>/dev/null | head -10 +``` + +**Solutions:** +```bash +# Clean system +sudo apt clean +sudo apt autoremove +docker system prune -a + +# Extend disk in Proxmox (if needed) +# Use Proxmox web interface: VM -> Hardware -> Hard Disk -> Resize + +# Extend filesystem after disk resize +sudo growpart /dev/sda 1 +sudo resize2fs /dev/sda1 +``` + +## Advanced Troubleshooting + +### Post-Install Script Debug Mode +```bash +# Run script with debug output +bash -x ./scripts/vm-management/vm-post-install.sh <vm-ip> <user> + +# Check specific steps manually +ssh cal@<vm-ip> 'docker --version' +ssh cal@<vm-ip> 'sudo systemctl status sshd' +ssh cal@<vm-ip> 'cat ~/.ssh/authorized_keys | wc -l' +``` + +### Recovery Procedures + +#### Emergency SSH Access +```bash +# If primary SSH key fails, use emergency key +ssh -i ~/.ssh/emergency_homelab_rsa cal@<vm-ip> + +# If all SSH access fails, use Proxmox console +# VM -> Console in Proxmox web interface + +# Reset SSH configuration +sudo cp /etc/ssh/sshd_config.backup /etc/ssh/sshd_config +sudo systemctl restart sshd +``` + +#### Complete VM Reset +```bash +# If VM is completely broken, restore from template +pvesh delete /nodes/pve/qemu/<vmid> +pvesh create /nodes/pve/qemu/<template-id>/clone -newid <new-vmid> -name <vm-name> + +# Or re-run cloud-init provisioning +# Delete VM and recreate with same cloud-init configuration +``` + +## Prevention Best Practices + +### Pre-Deployment Checks +```bash +# Verify SSH keys exist +ls -la ~/.ssh/homelab_rsa* +ls -la ~/.ssh/emergency_homelab_rsa* + +# Test network connectivity to target subnet +ping 10.10.0.1 + +# Verify Proxmox storage space +pvesh get /nodes/pve/storage +``` + +### Monitoring and Alerts +```bash +# Create health check script +#!/bin/bash +# vm-health-monitor.sh +for ip in 
10.10.0.{200..210}; do + if ssh -o ConnectTimeout=5 cal@$ip 'uptime' >/dev/null 2>&1; then + echo "✅ $ip: OK" + else + echo "❌ $ip: FAILED" + fi +done + +# Schedule regular checks +# Add to crontab: */15 * * * * /path/to/vm-health-monitor.sh +``` + +## Emergency Contacts and Resources + +### Documentation Links +- **Proxmox Documentation**: https://pve.proxmox.com/wiki/ +- **Cloud-Init Documentation**: https://cloud-init.readthedocs.io/ +- **Docker Installation Guide**: https://docs.docker.com/engine/install/ubuntu/ + +### Recovery Information +- **SSH Keys Location**: `/mnt/NV2/ssh-keys/backup-*/` +- **Emergency Access**: Use Proxmox console for direct VM access +- **Backup Strategy**: VM snapshots before major changes + +### Quick Reference Commands +```bash +# VM Status +pvesh get /nodes/pve/qemu/<vmid>/status/current + +# Start/Stop VM +pvesh create /nodes/pve/qemu/<vmid>/status/start +pvesh create /nodes/pve/qemu/<vmid>/status/stop + +# SSH with different key +ssh -i ~/.ssh/emergency_homelab_rsa cal@<vm-ip> + +# Docker system info +docker system info +docker system df +``` \ No newline at end of file diff --git a/scripts/monitoring/tdarr-timeout-monitor.sh b/scripts/monitoring/tdarr-timeout-monitor.sh index b99bfba..dffda3c 100755 --- a/scripts/monitoring/tdarr-timeout-monitor.sh +++ b/scripts/monitoring/tdarr-timeout-monitor.sh @@ -430,10 +430,10 @@ main() { log_error "Stuck work directory check failed" fi - # Optional: Check for successes (comment out if too noisy) - # if ! check_completions; then - # log_error "Completion check failed" - # fi + # Optional: Check for successes (comment out if too noisy) + if ! 
check_completions; then + log_error "Completion check failed" + fi # Update timestamp echo "$CURRENT_TIME" > "$LAST_CHECK_FILE" diff --git a/scripts/tdarr/start-tdarr-gpu-podman-clean.sh b/scripts/tdarr/start-tdarr-gpu-podman-clean.sh index 0ebdbf5..a45fcc4 100755 --- a/scripts/tdarr/start-tdarr-gpu-podman-clean.sh +++ b/scripts/tdarr/start-tdarr-gpu-podman-clean.sh @@ -1,15 +1,47 @@ #!/bin/bash -# Tdarr Mapped Node with GPU Support - NVMe Cache Optimization -# This script starts a mapped Tdarr node with local NVMe cache +# Tdarr Unmapped Node with GPU Support - System Stability Optimized +# This script starts an unmapped Tdarr node with resource limits and local NVMe cache +# Updated 2025-08-11: Added container security measures to prevent kernel crashes +# Updated 2025-08-11: Fixed GPU parameter to use Podman CDI standard (--device nvidia.com/gpu=all) set -e -CONTAINER_NAME="tdarr-node-gpu-mapped" +CONTAINER_NAME="tdarr-node-gpu-unmapped" SERVER_IP="10.10.0.43" SERVER_PORT="8266" # Standard server port -NODE_NAME="nobara-pc-gpu-mapped" +NODE_NAME="nobara-pc-gpu-unmapped" -echo "🚀 Starting MAPPED Tdarr Node with GPU support using Podman..." +echo "🚀 Starting UNMAPPED Tdarr Node with GPU support and resource limits..." + +# Check for root privileges (required for memlock and other resource limits) +if [ "$EUID" -ne 0 ]; then + echo "" + echo "❌ This script requires root privileges for secure container resource limits." + echo "" + echo "🔒 Root privileges needed for:" + echo " - Memory lock limits (512MB) - prevent GPU memory exhaustion" + echo " - System-level resource limits - protect against container resource abuse" + echo " - GPU device access - privileged container operations" + echo " - Memory/CPU/I/O constraints - full cgroups resource control" + echo "" + echo "🚀 Please run with sudo:" + echo " sudo $0" + echo "" + exit 1 +fi + +# Check system requirements +echo "🔍 Checking system requirements..." +if ! 
command -v nvidia-smi &> /dev/null; then + echo "⚠️ Warning: nvidia-smi not found. GPU access may not work." +fi + +if [ ! -f "/etc/cdi/nvidia.yaml" ]; then + echo "⚠️ Warning: NVIDIA CDI configuration not found at /etc/cdi/nvidia.yaml" + echo " Run: nvidia-ctk cdi generate --output=/etc/cdi/nvidia.yaml" +fi + +echo "✅ Running with root privileges - full resource limits enabled" # Stop and remove existing container if it exists if podman ps -a --format "{{.Names}}" | grep -q "^${CONTAINER_NAME}$"; then @@ -18,11 +50,19 @@ if podman ps -a --format "{{.Names}}" | grep -q "^${CONTAINER_NAME}$"; then podman rm "${CONTAINER_NAME}" 2>/dev/null || true fi -# Start Tdarr node with GPU support - MAPPED VERSION -echo "🎬 Starting Mapped Tdarr Node container..." +# Start Tdarr node with GPU support - UNMAPPED VERSION with Resource Limits +echo "🎬 Starting Unmapped Tdarr Node container with resource limits..." podman run -d --name "${CONTAINER_NAME}" \ - --gpus all \ + --device nvidia.com/gpu=all \ --restart unless-stopped \ + --memory=32g \ + --memory-swap=40g \ + --cpus="14" \ + --pids-limit=1000 \ + --ulimit nofile=65536:65536 \ + --ulimit memlock=536870912:536870912 \ + --device-read-bps /dev/nvme0n1:1g \ + --device-write-bps /dev/nvme0n1:1g \ -e TZ=America/Chicago \ -e UMASK_SET=002 \ -e nodeName="${NODE_NAME}" \ @@ -33,10 +73,9 @@ podman run -d --name "${CONTAINER_NAME}" \ -e logLevel=DEBUG \ -e NVIDIA_DRIVER_CAPABILITIES=all \ -e NVIDIA_VISIBLE_DEVICES=all \ + -e nodeType=unmapped \ + -e unmappedNodeCache=/cache \ -v "/mnt/NV2/tdarr-cache:/cache" \ - -v "/mnt/media/TV:/media/TV" \ - -v "/mnt/media/Movies:/media/Movies" \ - -v "/mnt/media/tdarr/tdarr-cache-clean:/temp" \ ghcr.io/haveagitgat/tdarr_node:latest echo "⏳ Waiting for container to initialize..." @@ -44,27 +83,46 @@ sleep 5 # Check container status if podman ps --format "{{.Names}}" | grep -q "^${CONTAINER_NAME}$"; then - echo "✅ Mapped Tdarr Node is running successfully!" 
+ echo "✅ Unmapped Tdarr Node is running successfully!" echo "" echo "📊 Container Status:" podman ps --filter "name=${CONTAINER_NAME}" --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}" echo "" - echo "🔍 Testing GPU Access:" + echo "🔍 Testing GPU Access (using Podman CDI standard):" if podman exec "${CONTAINER_NAME}" nvidia-smi --query-gpu=name --format=csv,noheader,nounits 2>/dev/null; then echo "🎉 GPU is accessible in container!" else echo "⚠️ GPU test failed, but container is running" + echo " Check: sudo nvidia-ctk cdi generate --output=/etc/cdi/nvidia.yaml" fi echo "" + echo "🛡️ Resource Limits Applied:" + echo " Memory: 32GB limit + 8GB swap" + echo " CPUs: 14 cores (2 reserved for system)" + echo " PIDs: 1000 limit" + echo " NVMe I/O: 1GB/s read/write limit" + echo "" + echo "🏗️ Architecture: Unmapped Node" + echo " - No direct media volume mounts" + echo " - Downloads files to local NVMe cache" + echo " - Prevents CIFS streaming during transcoding" + echo " - Eliminates kernel memory corruption risk" + echo "" echo "🌐 Connection Details:" echo " Server: ${SERVER_IP}:${SERVER_PORT}" echo " Node Name: ${NODE_NAME}" + echo " Node Type: Unmapped" echo " Web UI: http://${SERVER_IP}:8265" echo "" echo "📋 Container Management:" echo " View logs: podman logs ${CONTAINER_NAME}" echo " Stop: podman stop ${CONTAINER_NAME}" echo " Remove: podman rm ${CONTAINER_NAME}" + echo "" + echo "⚠️ Important Configuration Requirements:" + echo " - Server must have 'Allow unmapped Nodes' enabled" + echo " - NVIDIA CDI configuration required for GPU access" + echo " - cgroups V2 recommended for full resource limit support" else echo "❌ Failed to start container" echo "📋 Checking logs..." 
diff --git a/scripts/tdarr/tdarr-schedule-manager.sh b/scripts/tdarr/tdarr-schedule-manager.sh index aa9968c..4ca0785 100755 --- a/scripts/tdarr/tdarr-schedule-manager.sh +++ b/scripts/tdarr/tdarr-schedule-manager.sh @@ -188,7 +188,7 @@ show_status() { echo "=====================" # Container status - if podman ps --format "{{.Names}}\t{{.Status}}" | grep -q "tdarr-node-gpu"; then + if podman ps --format "{{.Names}}\t{{.Status}}" | grep -q "tdarr-node-gpu-unmapped"; then echo "🟢 Tdarr Container: RUNNING" podman ps --format "{{.Names}}\t{{.Status}}\t{{.CreatedAt}}" | grep tdarr else diff --git a/scripts/vm-management/README.md b/scripts/vm-management/README.md new file mode 100644 index 0000000..87d6fd7 --- /dev/null +++ b/scripts/vm-management/README.md @@ -0,0 +1,158 @@ +# VM Management and Provisioning + +Automated VM provisioning scripts for Proxmox environments with SSH key deployment, system updates, and Docker installation. + +## Files + +### `vm-post-install.sh` +Post-installation provisioning script for existing VMs. + +**Usage:** +```bash +./vm-post-install.sh <vm-ip> [ssh-user] +``` + +**Example:** +```bash +./vm-post-install.sh 10.10.0.100 cal +``` + +**Features:** +- ✅ System updates and essential packages +- ✅ SSH key deployment (primary + emergency keys) +- ✅ SSH security hardening (disable password auth) +- ✅ Docker and Docker Compose installation +- ✅ User environment setup with aliases +- ✅ Automatic security updates configuration + +**Requirements:** +- Target VM must have SSH access enabled initially +- Homelab SSH keys must exist: `~/.ssh/homelab_rsa` and `~/.ssh/emergency_homelab_rsa` +- Initial connection may require password authentication + +### `cloud-init-user-data.yaml` +Cloud-init configuration for fully automated VM provisioning in Proxmox. + +**Usage:** +1. Copy contents of this file +2. In Proxmox, create VM with cloud-init support +3. Paste the YAML content into the "User Data" field +4. 
Start the VM + +**Features:** +- ✅ User creation with sudo privileges +- ✅ SSH keys pre-installed (no password auth needed) +- ✅ Automatic package updates +- ✅ Docker and Docker Compose installation +- ✅ Security hardening from first boot +- ✅ Useful bash aliases and environment setup +- ✅ Welcome message with system status + +## Quick Start + +### Option 1: Post-Installation Script (Existing VMs) +```bash +# Make script executable +chmod +x scripts/vm-management/vm-post-install.sh + +# Provision an existing VM +./scripts/vm-management/vm-post-install.sh 10.10.0.100 cal +``` + +### Option 2: Cloud-Init (New VMs in Proxmox) +1. Create new VM in Proxmox with cloud-init support +2. Go to Cloud-Init tab +3. Copy contents of `cloud-init-user-data.yaml` +4. Paste into "User Data" field +5. Start VM - it will be fully provisioned automatically + +## SSH Key Management Integration + +Both provisioning methods integrate with the existing homelab SSH key management: + +- **Primary Key**: `~/.ssh/homelab_rsa` - Daily use authentication +- **Emergency Key**: `~/.ssh/emergency_homelab_rsa` - Backup access +- **Security**: Password authentication disabled after key deployment +- **Backup**: Keys are managed by existing SSH backup system + +## Post-Provisioning Verification + +After provisioning, verify the setup: + +```bash +# Test SSH access with key +ssh cal@<vm-ip> + +# Verify Docker installation +docker --version +docker compose version +docker run --rm hello-world + +# Check user groups +groups cal +# Should include: cal sudo docker + +# Verify SSH security +sudo sshd -T | grep -E "(passwordauth|pubkeyauth|permitroot)" +# Should show: +# passwordauthentication no +# pubkeyauthentication yes +# permitrootlogin no +``` + +## Customization + +### Modifying SSH Keys +Edit the SSH public keys in `cloud-init-user-data.yaml` or ensure your local SSH keys match the expected paths for the post-install script. 
+ +### Changing Default User +Update the username in both scripts (default: `cal`): +- In `vm-post-install.sh`: Change `SSH_USER="${2:-cal}"` +- In `cloud-init-user-data.yaml`: Change the user configuration section + +### Additional Packages +Add packages to: +- **Post-install script**: Add to the `apt install` command +- **Cloud-init**: Add to the `packages:` section + +### Custom Aliases +Modify bash aliases in: +- **Post-install script**: Update the aliases added to `~/.bashrc` +- **Cloud-init**: Edit the `.bash_aliases` file content + +## Troubleshooting + +### Script Fails to Connect +- Verify VM is accessible: `ping <vm-ip>` +- Check SSH service: `nc -z <vm-ip> 22` +- Ensure initial password/key authentication works + +### Docker Installation Issues +- Check internet connectivity on VM +- Verify Docker GPG key download succeeded +- Review Docker service status: `systemctl status docker` + +### SSH Key Authentication Problems +- Verify key file permissions (600 for private, 644 for public) +- Check authorized_keys file on target VM +- Test manual key-based connection + +### Cloud-Init Not Working +- Check Proxmox cloud-init support is enabled for VM +- Verify YAML syntax is valid +- Review cloud-init logs: `sudo cloud-init status --long` + +## Security Notes + +- Password authentication is completely disabled after provisioning +- Only key-based SSH access allowed +- Emergency keys provide backup access +- Automatic security updates enabled +- User has sudo privileges but requires proper SSH key authentication +- Docker group membership allows container management without sudo + +## Related Documentation + +- SSH Key Management: `patterns/networking/ssh-key-management.md` +- SSH Setup Examples: `examples/networking/ssh-homelab-setup.md` +- Docker Patterns: `patterns/docker/` \ No newline at end of file diff --git a/scripts/vm-management/cloud-init-user-data.yaml b/scripts/vm-management/cloud-init-user-data.yaml new file mode 100644 index 0000000..23d1ffe --- /dev/null +++ 
b/scripts/vm-management/cloud-init-user-data.yaml
@@ -0,0 +1,159 @@
+#cloud-config
+#
+# Proxmox Cloud-Init User Data Template
+# Use this as the user data for VM provisioning in Proxmox
+# This automates: updates, SSH keys, Docker installation, security hardening
+#
+
+# System configuration
+hostname: homelab-vm
+timezone: America/New_York
+locale: en_US.UTF-8
+
+# User configuration
+users:
+  - name: cal
+    groups: [sudo, docker]
+    shell: /bin/bash
+    sudo: ALL=(ALL) NOPASSWD:ALL
+    ssh_authorized_keys:
+      - ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAACAQDVVK02rOeeIw1e7BkGlUtSCVrO66dcYM6wGC6snL/lDSxRWi3ABinOj7a74SJ12uuBjpj1Ui6fGbYLvLHCzrMQaijOKpno4BQmulVLT2hKw6xBszQlA7Z2NpP/CiieUsCYBj+vfn743grLRkb6jgVEw2E3TCkNOord0zfmPp6EIre7clUuPUlP75IZMpttbqXhOTjn8Kfq+8+a/iHln4LLDBJbaz/J31Wxnk+eI5tdVz9nO0LHxMxfTbS0HTAAQaP4RRdU+7f455p1xlSIFhZJ0EEXPPTTy7GU07QonzdppS99G+f95kw2mgkX6RmK4j3y7tw01eNV9eWj/hoSvad4xKpMfQbeKXT9EuqsLgzyoUrxaDhEuGH2bZhxMXiQn7AL5kdKfZAaChoy17B+tcSQdZBc1FyHMVYZoJyPYH5kDQKq1qVkHoyQ/H9szKsAIGXzTGwqWMa/5R5FzgrppwpP+dQMqP84DmlY9EeRISegFEKNz2dTKerB/sG4S1bp4EmNUYfdWZUW7ROdS5KOcjExouBlQmksucZpn3sJ6TLDzBN47dw8SCAPuuZzyAXWVAP7GLu7is+cb+jQMh+Twv6LPwVl6SpnpEJxJAG2ijlFgTL9DlHnrcZB7Rilumb3oO5+uKMae+3EYI7jsvbqZlyCIYF6nOB+La1eEIrQyncqdQ== homelab-cal@nobara-pc
+      - ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAACAQCPzqHDgdK9TlN4uzumBZKEOGtxHaQPK2/m7vy/A9bNLRprG7BO0fXsdRi3wRBzPdta/fMERw/jDTsr+Nvofl8y7eaIAemZ3dJ7DsMVdlNCROrnEwAcBS0JnOKfx0YLqqGvTuXGFQuX9PCzGhGSfjOgn6+o2+WdrFW21fvFxMc2mV6Ds1/m8VO9rqPjAqRvuUl3LJPv+/jYHpt7otv4zJOg+guWO7gxUe0lVF7fBW54Cl3e7fMic8MjAdGvkaruTIC4uOqNMYKJjGQEYrlWB4vhI0ZAkLi6vxTm36bwy6jeTy9Mcl9pNM2ATyQzo8rA/OdG5ifHBSd1jrzT5Uj00mntDku73hQptMhakRj2PUUjRXmk1SPYi+Ts6iFg1xc0oeIXmd/lLXfod1K4avAKzuiOyubSVVp1K5RYYPT4tdaSA8yraEA9F23CaD2Cw+AjAGXKcaOuHZvFY8r8I/vb/Y7c+JQr6uAhM1aJnXGFKKwo6YFTJmv93gB5/yTBpmXAkRlwplcxnxvvjA3i/PC3H22XDqJyjmU29F9PjVMga82I4iU1cyJ+3zMbu4e1biJK+jSPYIr+qXdzvvLrrSG/ajg+9flc8HHhEs/A5EAm2naRR739xvKOIQboRsV67UgZ4TH1aJZNzQEkA70qGSGsZNPtVsbzS/4YlHH9CPawnv3ICQ== emergency-homelab-cal@nobara-pc
+
+# SSH configuration
+ssh_pwauth: false
+disable_root: true
+# Host key types to generate on first boot. This must not be an empty list:
+# with no types listed no host keys are generated and sshd cannot start.
+ssh_genkeytypes: [ed25519, ecdsa, rsa]
+
+# Package management
+package_update: true
+package_upgrade: true
+package_reboot_if_required: true
+
+packages:
+  - curl
+  - wget
+  - git
+  - vim
+  - htop
+  - unzip
+  - software-properties-common
+  - apt-transport-https
+  - ca-certificates
+  - gnupg
+  - lsb-release
+  - unattended-upgrades
+
+# Docker installation via runcmd
+runcmd:
+  # Configure automatic security updates
+  - dpkg-reconfigure -plow unattended-upgrades
+
+  # Remove any old Docker installations
+  - apt-get remove -y docker docker-engine docker.io containerd runc || true
+
+  # Add Docker GPG key and repository
+  - mkdir -p /etc/apt/keyrings
+  - curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /etc/apt/keyrings/docker.gpg
+  - echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null
+
+  # Install Docker
+  - apt-get update
+  - apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin
+
+  # Enable Docker service
+  - systemctl enable docker
+  - systemctl start docker
+
+  # Add cal user to docker group (will take effect after next login)
+  - usermod -aG docker cal
+
+  # Test Docker installation
+  - docker run --rm hello-world
+
+# Write configuration files
+write_files:
+  # SSH hardening configuration
+  - path: /etc/ssh/sshd_config.d/99-homelab-security.conf
+    content: |
+      PasswordAuthentication no
+      PubkeyAuthentication yes
+      PermitRootLogin no
+      AllowUsers cal
+      Protocol 2
+      ClientAliveInterval 300
+      ClientAliveCountMax 2
+      MaxAuthTries 3
+    permissions: '0644'
+
+  # Useful bash aliases for the cal user.
+  # defer: true postpones the write until the final stage, after user "cal"
+  # exists; otherwise the cal:cal ownership cannot be applied.
+  - path: /home/cal/.bash_aliases
+    content: |
+      # Docker aliases
+      alias dps='docker ps'
+      alias dlog='docker logs'
+      alias dexec='docker exec -it'
+      alias dstop='docker stop $(docker ps -q)'
+      alias dprune='docker system prune -f'
+
+      # Docker Compose aliases
+      alias dc='docker compose'
+      alias dcup='docker compose up -d'
+      alias dcdown='docker compose down'
+      alias dclogs='docker compose logs -f'
+
+      # System aliases
+      alias ll='ls -alF'
+      alias la='ls -A'
+      alias l='ls -CF'
+      alias ..='cd ..'
+      alias ...='cd ../..'
+      alias grep='grep --color=auto'
+      alias fgrep='fgrep --color=auto'
+      alias egrep='egrep --color=auto'
+    owner: cal:cal
+    permissions: '0644'
+    defer: true
+
+  # Welcome message with system info
+  - path: /etc/motd
+    content: |
+
+      ╔══════════════════════════════════════════════════════╗
+      ║                 Home Lab VM - Ready                  ║
+      ║                                                      ║
+      ║  Docker & Docker Compose: Installed ✓                ║
+      ║  SSH Keys: Deployed ✓                                ║
+      ║  Security: Hardened ✓                                ║
+      ║  Updates: Automatic ✓                                ║
+      ║                                                      ║
+      ║  Quick Commands:                                     ║
+      ║    docker --version                                  ║
+      ║    docker compose version                            ║
+      ║    docker run --rm hello-world                       ║
+      ║                                                      ║
+      ╚══════════════════════════════════════════════════════╝
+
+    permissions: '0644'
+
+# Message logged once cloud-init has finished
+final_message: |
+  VM provisioning completed successfully!
+
+  Installed software:
+  - Docker & Docker Compose
+  - Essential system packages
+  - Automatic security updates
+
+  Security configuration:
+  - SSH key-based authentication only
+  - Password authentication disabled
+  - User 'cal' added to docker group
+
+  Connect via SSH: ssh cal@<vm-ip>
+
+  System is ready for Docker container deployment!
\ No newline at end of file
diff --git a/scripts/vm-management/vm-post-install.sh b/scripts/vm-management/vm-post-install.sh
new file mode 100755
index 0000000..7f92c30
--- /dev/null
+++ b/scripts/vm-management/vm-post-install.sh
@@ -0,0 +1,306 @@
+#!/bin/bash
+#
+# VM Post-Installation Provisioning Script
+# Handles: System updates, SSH key deployment, Docker installation
+#
+# Usage: ./vm-post-install.sh <vm-ip> [ssh-user]
+# Example: ./vm-post-install.sh 10.10.0.100 cal
+#
+# Requires both homelab key pairs (private key plus matching .pub file) in
+# ~/.ssh. Steps 1-2 connect without forcing a key, so the VM must still accept
+# password (or agent/default-key) logins when the script starts.
+#
+
+set -e
+
+# Configuration
+VM_IP="$1"
+SSH_USER="${2:-cal}"
+HOMELAB_KEY="$HOME/.ssh/homelab_rsa"
+EMERGENCY_KEY="$HOME/.ssh/emergency_homelab_rsa"
+
+# Colors for output
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+BLUE='\033[0;34m'
+NC='\033[0m' # No Color
+
+# Print a timestamped progress message to stdout.
+log() {
+    echo -e "${GREEN}[$(date '+%H:%M:%S')]${NC} $1"
+}
+
+# Print a timestamped warning to stderr; execution continues.
+warn() {
+    echo -e "${YELLOW}[$(date '+%H:%M:%S')] WARNING:${NC} $1" >&2
+}
+
+# Print a timestamped error to stderr and abort with exit status 1.
+error() {
+    echo -e "${RED}[$(date '+%H:%M:%S')] ERROR:${NC} $1" >&2
+    exit 1
+}
+
+# Validation
+if [ -z "$VM_IP" ]; then
+    error "Usage: $0 <vm-ip> [ssh-user]"
+fi
+
+if [ ! -f "$HOMELAB_KEY" ]; then
+    error "Homelab SSH key not found at $HOMELAB_KEY"
+fi
+
+if [ ! -f "$EMERGENCY_KEY" ]; then
+    error "Emergency SSH key not found at $EMERGENCY_KEY"
+fi
+
+# The public halves are what gets installed on the VM; check for them up front
+# rather than failing halfway through provisioning.
+for pubkey in "$HOMELAB_KEY.pub" "$EMERGENCY_KEY.pub"; do
+    if [ ! -f "$pubkey" ]; then
+        error "SSH public key not found at $pubkey"
+    fi
+done
+
+log "Starting VM provisioning for $VM_IP as user $SSH_USER"
+
+# Test initial connectivity (assume password auth is still enabled)
+log "Testing initial connectivity to $VM_IP..."
+# nc is invoked directly so the address is passed as a single argument
+# instead of being re-parsed by a nested shell.
+if ! timeout 10 nc -z "$VM_IP" 22 >/dev/null 2>&1; then
+    error "Cannot reach $VM_IP on port 22"
+fi
+
+# Run a command on the remote VM; stdin is passed through to it.
+#   $1 - command string executed by the remote login shell
+#   $2 - "true" to authenticate with the homelab key only (default: false)
+run_remote() {
+    local cmd="$1"
+    local use_key="${2:-false}"
+
+    if [ "$use_key" = "true" ]; then
+        # Public-key auth only: a broken key must fail here rather than
+        # silently fall back to a password prompt.
+        ssh -i "$HOMELAB_KEY" -o PreferredAuthentications=publickey \
+            -o StrictHostKeyChecking=accept-new "$SSH_USER@$VM_IP" "$cmd"
+    else
+        # Initial connection might need password
+        ssh -o StrictHostKeyChecking=accept-new "$SSH_USER@$VM_IP" "$cmd"
+    fi
+}
+
+# Copy a local file to the remote VM.
+#   $1 - local source path
+#   $2 - destination path on the VM
+#   $3 - "true" to authenticate with the homelab key only (default: false)
+copy_to_remote() {
+    local src="$1"
+    local dest="$2"
+    local use_key="${3:-false}"
+
+    if [ "$use_key" = "true" ]; then
+        scp -i "$HOMELAB_KEY" -o PreferredAuthentications=publickey \
+            -o StrictHostKeyChecking=accept-new "$src" "$SSH_USER@$VM_IP:$dest"
+    else
+        scp -o StrictHostKeyChecking=accept-new "$src" "$SSH_USER@$VM_IP:$dest"
+    fi
+}
+
+# Append a public key to the remote authorized_keys unless it is already there,
+# so re-running the script does not pile up duplicate entries.
+#   $1 - path to the local public key file (sent over stdin)
+install_pubkey() {
+    local pubkey_file="$1"
+
+    run_remote '
+        key=$(cat)
+        touch ~/.ssh/authorized_keys
+        grep -qxF -- "$key" ~/.ssh/authorized_keys || printf "%s\n" "$key" >> ~/.ssh/authorized_keys
+    ' < "$pubkey_file"
+}
+
+# Step 1: System Updates
+# Multi-line remote scripts start with "set -e": the local set -e does not
+# reach the remote shell, which would otherwise report only the last status.
+log "Step 1: Running system updates..."
+run_remote "
+    set -e
+    echo 'Updating package lists...'
+    sudo apt update
+    echo 'Upgrading packages...'
+    sudo apt upgrade -y
+    echo 'Installing essential packages...'
+    sudo apt install -y curl wget git vim htop unzip software-properties-common apt-transport-https ca-certificates gnupg lsb-release
+    echo 'Cleaning up...'
+    sudo apt autoremove -y
+    sudo apt autoclean
+"
+log "✅ System updates completed"
+
+# Step 2: SSH Key Installation
+log "Step 2: Installing SSH keys..."
+
+# Create .ssh directory if it doesn't exist
+run_remote "mkdir -p ~/.ssh && chmod 700 ~/.ssh"
+
+# Deploy primary homelab key
+log "Installing primary homelab SSH key..."
+install_pubkey "$HOMELAB_KEY.pub"
+
+# Deploy emergency key
+log "Installing emergency SSH key..."
+install_pubkey "$EMERGENCY_KEY.pub"
+
+# Set proper permissions
+run_remote "chmod 600 ~/.ssh/authorized_keys"
+
+# Test key-based authentication
+log "Testing key-based authentication..."
+if run_remote "echo 'SSH key authentication working'" true; then
+    log "✅ SSH key authentication successful"
+else
+    error "SSH key authentication failed"
+fi
+
+# Step 3: Secure SSH Configuration
+log "Step 3: Securing SSH configuration..."
+run_remote "
+    set -e
+    sudo cp /etc/ssh/sshd_config /etc/ssh/sshd_config.backup
+    sudo sed -i 's/#PasswordAuthentication yes/PasswordAuthentication no/' /etc/ssh/sshd_config
+    sudo sed -i 's/PasswordAuthentication yes/PasswordAuthentication no/' /etc/ssh/sshd_config
+    sudo sed -i 's/#PubkeyAuthentication yes/PubkeyAuthentication yes/' /etc/ssh/sshd_config
+    sudo sed -i 's/PubkeyAuthentication no/PubkeyAuthentication yes/' /etc/ssh/sshd_config
+    # Validate first so a broken config cannot take sshd down
+    sudo sshd -t
+    # The unit is ssh.service on Debian/Ubuntu and sshd.service elsewhere
+    sudo systemctl restart ssh 2>/dev/null || sudo systemctl restart sshd
+" true
+
+# Confirm the effective setting: a drop-in under /etc/ssh/sshd_config.d/ can
+# override the edits made to the main file.
+if run_remote "sudo sshd -T | grep -qi '^passwordauthentication no'" true; then
+    log "✅ SSH hardened (password authentication disabled)"
+else
+    error "Password authentication is still enabled on $VM_IP; check /etc/ssh/sshd_config.d/"
+fi
+
+# Step 4: Docker Installation
+log "Step 4: Installing Docker and Docker Compose..."
+run_remote "
+    set -e
+
+    # Remove any old Docker installations
+    sudo apt remove -y docker docker-engine docker.io containerd runc 2>/dev/null || true
+
+    # Add Docker GPG key (--yes overwrites an existing keyring on re-runs)
+    curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --yes --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg
+
+    # Add Docker repository
+    echo \"deb [arch=\$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/ubuntu \$(lsb_release -cs) stable\" | sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
+
+    # Update package list
+    sudo apt update
+
+    # Install Docker
+    sudo apt install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin
+
+    # Add the provisioned user to the docker group. SSH_USER is expanded
+    # locally on purpose; a bare USER here would name the local operator.
+    sudo usermod -aG docker $SSH_USER
+
+    # Enable Docker service
+    sudo systemctl enable docker
+    sudo systemctl start docker
+" true
+
+# Verify Docker installation
+log "Verifying Docker installation..."
+run_remote "
+    set -e
+    docker --version
+    docker compose version
+    sudo docker run --rm hello-world
+" true
+log "✅ Docker and Docker Compose installed successfully"
+
+# Step 5: System Configuration
+log "Step 5: Additional system configuration..."
+run_remote "
+    set -e
+
+    # Configure automatic security updates
+    sudo apt install -y unattended-upgrades
+    sudo dpkg-reconfigure -plow unattended-upgrades
+
+    # Set timezone (adjust as needed)
+    sudo timedatectl set-timezone America/New_York
+
+    # Configure system hostname if needed
+    echo 'System configuration completed'
+" true
+
+# Step 6: Create useful aliases and environment
+log "Step 6: Setting up user environment..."
+run_remote "
+    set -e
+
+    # Append the aliases only once so re-runs do not duplicate them
+    if ! grep -q '^# Docker aliases' ~/.bashrc 2>/dev/null; then
+        cat >> ~/.bashrc << 'EOF'
+
+# Docker aliases
+alias dps='docker ps'
+alias dlog='docker logs'
+alias dexec='docker exec -it'
+alias dstop='docker stop \$(docker ps -q)'
+alias dprune='docker system prune -f'
+
+# System aliases
+alias ll='ls -alF'
+alias la='ls -A'
+alias l='ls -CF'
+alias ..='cd ..'
+alias ...='cd ../..'
+
+# Docker Compose aliases
+alias dc='docker compose'
+alias dcup='docker compose up -d'
+alias dcdown='docker compose down'
+alias dclogs='docker compose logs -f'
+EOF
+    fi
+" true
+
+log "✅ User environment configured"
+
+# Step 7: Final verification
+log "Step 7: Final system verification..."
+# REMOTE_HOSTNAME, not HOSTNAME: bash sets HOSTNAME itself for the local machine
+REMOTE_HOSTNAME=$(run_remote "hostname" true)
+KERNEL=$(run_remote "uname -r" true)
+DOCKER_VERSION=$(run_remote "docker --version" true)
+UPTIME=$(run_remote "uptime" true)
+
+log "🎉 VM provisioning completed successfully!"
+echo ""
+echo -e "${BLUE}=== PROVISIONING SUMMARY ===${NC}"
+echo -e "${GREEN}VM IP:${NC} $VM_IP"
+echo -e "${GREEN}User:${NC} $SSH_USER"
+echo -e "${GREEN}Hostname:${NC} $REMOTE_HOSTNAME"
+echo -e "${GREEN}Kernel:${NC} $KERNEL"
+echo -e "${GREEN}Docker:${NC} $DOCKER_VERSION"
+echo -e "${GREEN}Uptime:${NC} $UPTIME"
+echo ""
+echo -e "${YELLOW}Next Steps:${NC}"
+echo "1. SSH access: ssh $SSH_USER@$VM_IP"
+echo "2. Test Docker: ssh $SSH_USER@$VM_IP 'docker run --rm hello-world'"
+echo "3. Deploy applications using Docker Compose"
+echo ""
+echo -e "${YELLOW}Security Notes:${NC}"
+echo "- Password authentication is DISABLED"
+echo "- Only key-based SSH access is allowed"
+echo "- Both primary and emergency SSH keys are installed"
+echo "- Automatic security updates are enabled"
\ No newline at end of file