From aed1f54d9193f153bef8115cedde100910c986ee Mon Sep 17 00:00:00 2001 From: Cal Corum Date: Tue, 12 Aug 2025 12:15:41 -0500 Subject: [PATCH] CLAUDE: Add comprehensive Tdarr API monitoring with dataclass-based status tracking MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add tdarr_monitor.py: Python-based API monitoring client with type-safe dataclasses - ServerStatus, QueueStatus, NodeStatus, LibraryStatus, StatisticsStatus, HealthStatus - Support for health checks, queue monitoring, node status, library scans - JSON and pretty-print output formats with proper exit codes - Integration with existing Discord monitoring system - Create scripts/monitoring/README.md: Complete monitoring documentation - Comprehensive usage examples and command-line options - Integration patterns with gaming-aware scheduling - Best practices for automated health monitoring - Update CLAUDE.md: Enhanced Tdarr keyword triggers and documentation structure - Add "monitoring" and "api" keywords to automatically load monitoring docs - Reference new tdarr_monitor.py with dataclass-based status tracking - Update documentation structure to show monitoring script location 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude --- CLAUDE.md | 5 +- scripts/monitoring/README.md | 117 +++++++ scripts/monitoring/tdarr_monitor.py | 498 ++++++++++++++++++++++++++++ 3 files changed, 619 insertions(+), 1 deletion(-) create mode 100644 scripts/monitoring/README.md create mode 100755 scripts/monitoring/tdarr_monitor.py diff --git a/CLAUDE.md b/CLAUDE.md index f638860..a9f52a2 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -120,11 +120,13 @@ When user mentions specific terms, automatically load relevant docs: - Load: `examples/vm-management/` **Tdarr Keywords** -- "tdarr", "transcode", "ffmpeg", "gpu transcoding", "nvenc", "forEach error", "gaming detection", "scheduler" +- "tdarr", "transcode", "ffmpeg", "gpu transcoding", "nvenc", "forEach 
error", "gaming detection", "scheduler", "monitoring", "api" - Load: `reference/docker/tdarr-troubleshooting.md` - Load: `patterns/docker/distributed-transcoding.md` - Load: `scripts/tdarr/README.md` (for automation and scheduling) + - Load: `scripts/monitoring/README.md` (for monitoring and health checks) - Note: Gaming-aware scheduling system with configurable time windows available + - Note: Comprehensive API monitoring available via `tdarr_monitor.py` with dataclass-based status tracking **Windows Monitoring Keywords** - "windows reboot", "discord notification", "system monitor", "windows desktop", "power outage", "windows update" @@ -154,6 +156,7 @@ When user mentions specific terms, automatically load relevant docs: /scripts/ # Active scripts and utilities for home lab operations ├── tdarr/ # Tdarr automation with gaming-aware scheduling ├── monitoring/ # System monitoring and alerting + │ ├── tdarr_monitor.py # Comprehensive Tdarr API monitoring with dataclasses │ └── windows-desktop/ # Windows reboot monitoring with Discord notifications └── / # Other organized automation subsystems ``` diff --git a/scripts/monitoring/README.md b/scripts/monitoring/README.md new file mode 100644 index 0000000..e244041 --- /dev/null +++ b/scripts/monitoring/README.md @@ -0,0 +1,117 @@ +# Monitoring Scripts + +This directory contains various monitoring scripts and tools for the home lab infrastructure. + +## Available Scripts + +### Tdarr Monitoring + +#### tdarr_monitor.py +A comprehensive Python-based monitoring tool for Tdarr media transcoding servers. Features dataclass-based return types for improved type safety and IDE support. 
+ +**Features:** +- Server status and health monitoring +- Queue status and statistics tracking +- Node connectivity and performance monitoring +- Library scan progress monitoring +- Worker activity tracking +- Comprehensive health checks +- JSON and pretty-print output formats +- Configurable timeouts and logging + +**Usage:** +```bash +# Basic health check +./tdarr_monitor.py --server http://10.10.0.43:8265 --check health + +# Monitor queue status +./tdarr_monitor.py --server http://10.10.0.43:8265 --check queue + +# Get all status information +./tdarr_monitor.py --server http://10.10.0.43:8265 --check all --output json + +# Monitor nodes with verbose logging +./tdarr_monitor.py --server http://10.10.0.43:8265 --check nodes --verbose +``` + +**Available Checks:** +- `health` - Comprehensive health check (default) +- `status` - Server status and configuration +- `queue` - Transcoding queue statistics +- `nodes` - Connected nodes status +- `libraries` - Library scan progress +- `stats` - Overall transcoding statistics +- `all` - All checks combined + +**Output Formats:** +- `pretty` - Human-readable format (default) +- `json` - Structured JSON output + +**Exit Codes:** +- `0` - Success, all systems healthy +- `1` - Error or unhealthy status detected + +**Requirements:** +- Python 3.7+ +- `requests` library +- Access to Tdarr server API endpoints + +#### tdarr-timeout-monitor.sh +Shell script for monitoring Tdarr timeouts and system status. + +**Usage:** +```bash +./tdarr-timeout-monitor.sh +``` + +### System Monitoring + +#### Windows Desktop Monitoring +Complete Windows desktop monitoring system with Discord notifications for reboots and system events. 
+ +Location: `windows-desktop/` +- Full setup instructions in `windows-desktop/README.md` +- PowerShell monitoring scripts +- Windows Task Scheduler integration +- Discord webhook notifications + +**Features:** +- Automatic reboot detection +- System startup/shutdown monitoring +- Discord notifications with timestamps +- Configurable monitoring intervals +- Windows Task Scheduler integration + +### Setup and Configuration + +#### Discord Integration +See `setup-discord-monitoring.md` for Discord webhook setup instructions. + +## Integration with Home Lab + +### Tdarr Keywords Trigger +When working with Tdarr-related tasks, the following documentation is automatically loaded: +- `reference/docker/tdarr-troubleshooting.md` +- `patterns/docker/distributed-transcoding.md` +- `scripts/tdarr/README.md` + +### Gaming-Aware Scheduling +The monitoring scripts integrate with the gaming-aware Tdarr scheduling system that provides: +- Configurable time windows for transcoding +- Gaming session detection +- Automated resource management +- Smart scheduling to avoid performance conflicts + +## Best Practices + +1. **Regular Monitoring**: Set up cron jobs or scheduled tasks for regular status checks +2. **Health Checks**: Use the health check endpoints for automated monitoring +3. **Logging**: Enable verbose logging for troubleshooting +4. **Timeout Configuration**: Adjust timeouts based on network conditions +5. 
**Error Handling**: Monitor exit codes for automated alerting + +## Related Documentation + +- `/patterns/docker/distributed-transcoding.md` - Tdarr architecture patterns +- `/reference/docker/tdarr-troubleshooting.md` - Troubleshooting guide +- `/scripts/tdarr/README.md` - Tdarr management scripts \ No newline at end of file diff --git a/scripts/monitoring/tdarr_monitor.py b/scripts/monitoring/tdarr_monitor.py new file mode 100755 index 0000000..2390976 --- /dev/null +++ b/scripts/monitoring/tdarr_monitor.py @@ -0,0 +1,498 @@ +#!/usr/bin/env python3 +""" +Tdarr API Monitoring Script + +Monitors Tdarr server via its web API endpoints: +- Server status and health +- Queue status and statistics +- Node status and performance +- Library scan progress +- Worker activity + +Usage: + python3 tdarr_monitor.py --server http://10.10.0.43:8265 --check all + python3 tdarr_monitor.py --server http://10.10.0.43:8265 --check queue + python3 tdarr_monitor.py --server http://10.10.0.43:8265 --check nodes +""" + +import argparse +import json +import logging +import sys +from dataclasses import dataclass, asdict +from datetime import datetime +from typing import Dict, List, Optional, Any +import requests +from urllib.parse import urljoin + + +@dataclass +class ServerStatus: + timestamp: str + server_url: str + status: str + error: Optional[str] = None + version: Optional[str] = None + server_id: Optional[str] = None + uptime: Optional[str] = None + system_info: Optional[Dict[str, Any]] = None + + +@dataclass +class QueueStats: + total_files: int + queued: int + processing: int + completed: int + queue_items: List[Dict[str, Any]] + + +@dataclass +class QueueStatus: + timestamp: str + queue_stats: Optional[QueueStats] = None + error: Optional[str] = None + + +@dataclass +class NodeInfo: + id: Optional[str] + nodeName: Optional[str] + status: str + lastSeen: Optional[int] + version: Optional[str] + platform: Optional[str] + workers: Dict[str, int] + processing: List[Dict[str, Any]] + + 
@dataclass
class NodeSummary:
    """Roll-up of the parsed nodes, split by their computed status.

    ``online_details`` and ``offline_details`` partition the parsed nodes;
    the three counters are the sizes of the whole set and of each partition.
    """

    total_nodes: int
    online_nodes: int
    offline_nodes: int
    online_details: List[NodeInfo]
    offline_details: List[NodeInfo]


@dataclass
class NodeStatus:
    """Result of ``TdarrMonitor.get_node_status``.

    ``nodes`` holds the raw node entries from the response; ``node_summary``
    is populated on success and ``error`` on failure.
    """

    timestamp: str
    nodes: List[Dict[str, Any]]
    node_summary: Optional[NodeSummary] = None
    error: Optional[str] = None


@dataclass
class LibraryInfo:
    """One library entry as parsed by ``TdarrMonitor.get_library_status``."""

    name: Optional[str]
    path: Optional[str]
    file_count: int
    scan_progress: int
    last_scan: Optional[str]
    is_scanning: bool


@dataclass
class ScanStatus:
    """Totals computed across all parsed libraries."""

    total_libraries: int
    total_files: int
    scanning_libraries: int


@dataclass
class LibraryStatus:
    """Result of ``TdarrMonitor.get_library_status``.

    ``scan_status`` is populated on success and ``error`` on failure.
    """

    timestamp: str
    libraries: List[LibraryInfo]
    scan_status: Optional[ScanStatus] = None
    error: Optional[str] = None


@dataclass
class Statistics:
    """Counters copied from the ``stats`` object of the statistics response."""

    total_transcodes: int
    space_saved: int
    total_files_processed: int
    failed_transcodes: int
    processing_speed: int
    eta: Optional[str]


@dataclass
class StatisticsStatus:
    """Result of ``TdarrMonitor.get_statistics``.

    ``statistics`` is populated on success and ``error`` on failure.
    """

    timestamp: str
    statistics: Optional[Statistics] = None
    error: Optional[str] = None


@dataclass
class HealthCheck:
    """Outcome of a single probe inside ``TdarrMonitor.health_check``.

    ``online_count``/``total_count`` are filled by the nodes probe and
    ``accessible``/``total_items`` by the queue probe; the server probe
    leaves all four at ``None``.
    """

    status: str
    healthy: bool
    online_count: Optional[int] = None
    total_count: Optional[int] = None
    accessible: Optional[bool] = None
    total_items: Optional[int] = None


@dataclass
class HealthStatus:
    """Result of ``TdarrMonitor.health_check``.

    ``overall_status`` is ``'healthy'`` only when every entry in ``checks``
    is healthy, otherwise ``'unhealthy'``.
    """

    timestamp: str
    overall_status: str
    checks: Dict[str, HealthCheck]


# Shared by TdarrMonitor.__init__ and main() so both configure identical output.
_LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'


class TdarrMonitor:
    """Read-only client that polls a Tdarr server's HTTP API.

    Every ``get_*`` method issues one GET request and returns a dataclass.
    On failure the dataclass carries an ``error`` message instead of raising.
    The instance owns a ``requests.Session``; call ``close()`` or use the
    instance as a context manager to release it.
    """

    def __init__(self, server_url: str, timeout: int = 30):
        """Initialize Tdarr monitor with server URL.

        Args:
            server_url: Base URL of the server; a trailing ``/`` is stripped.
            timeout: Per-request timeout in seconds.
        """
        self.server_url = server_url.rstrip('/')
        self.timeout = timeout
        self.session = requests.Session()

        # basicConfig() does nothing when the root logger already has
        # handlers, so a caller that configured logging first (main() does)
        # keeps its own level and format.
        logging.basicConfig(level=logging.INFO, format=_LOG_FORMAT)
        self.logger = logging.getLogger(__name__)

    def __enter__(self) -> 'TdarrMonitor':
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        self.close()

    def close(self) -> None:
        """Close the underlying HTTP session."""
        self.session.close()

    def _make_request(self, endpoint: str) -> Optional[Dict[str, Any]]:
        """Make HTTP request to Tdarr API endpoint.

        Args:
            endpoint: API path, with or without a leading ``/``.

        Returns:
            The decoded JSON object, or ``None`` when the request fails, the
            status code signals an error, the body is not valid JSON, or the
            JSON is not an object. Failures are logged, never raised.
        """
        # Plain concatenation keeps any path prefix in server_url (e.g. a
        # reverse proxy at http://host/tdarr); urljoin() would drop it for
        # an endpoint that starts with '/'.
        url = f"{self.server_url}/{endpoint.lstrip('/')}"
        self.logger.debug("GET %s", url)

        try:
            response = self.session.get(url, timeout=self.timeout)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            self.logger.error("Request failed for %s: %s", url, e)
            return None

        # Decoded separately so that JSON problems are not reported as
        # transport failures. Every JSON decode error requests can raise is
        # a ValueError subclass.
        try:
            payload = response.json()
        except ValueError as e:
            self.logger.error("JSON decode failed for %s: %s", url, e)
            return None

        if not isinstance(payload, dict):
            # Callers use .get() on the result, so anything else is unusable.
            self.logger.error(
                "Unexpected JSON payload type for %s: %s",
                url, type(payload).__name__
            )
            return None
        return payload

    def get_server_status(self) -> ServerStatus:
        """Get overall server status and configuration.

        Returns:
            A ``ServerStatus`` with ``status='online'`` and the fields read
            from the response, or ``status='offline'`` plus ``error`` when the
            request failed.
        """
        timestamp = datetime.now().isoformat()

        data = self._make_request('/api/v2/get-server-info')
        # Compare against None: an empty JSON object is still a reply from a
        # reachable server.
        if data is None:
            return ServerStatus(
                timestamp=timestamp,
                server_url=self.server_url,
                status='offline',
                error='Unable to connect to Tdarr server'
            )

        return ServerStatus(
            timestamp=timestamp,
            server_url=self.server_url,
            status='online',
            version=data.get('version'),
            server_id=data.get('serverId'),
            uptime=data.get('uptime'),
            system_info=data.get('systemInfo', {})
        )

    def get_queue_status(self) -> QueueStatus:
        """Get transcoding queue status and statistics.

        Returns:
            A ``QueueStatus`` whose ``queue_stats`` counts items by their
            ``status`` value and keeps the first ten items, or one carrying
            ``error`` when the request failed.
        """
        timestamp = datetime.now().isoformat()

        data = self._make_request('/api/v2/get-queue')
        if data is None:
            return QueueStatus(
                timestamp=timestamp,
                error='Unable to fetch queue data'
            )

        # 'or []' also covers a key that is present with a null value.
        queue_data = data.get('queue') or []

        def count(state: str) -> int:
            return sum(1 for item in queue_data if item.get('status') == state)

        queue_stats = QueueStats(
            total_files=len(queue_data),
            queued=count('Queued'),
            processing=count('Processing'),
            completed=count('Completed'),
            queue_items=queue_data[:10]  # First 10 items for details
        )
        return QueueStatus(timestamp=timestamp, queue_stats=queue_stats)

    @staticmethod
    def _parse_node(node: Dict[str, Any]) -> NodeInfo:
        """Convert one raw node entry into a ``NodeInfo``."""
        last_seen = node.get('lastSeen')
        # null values for 'workers' / 'currentJobs' are treated as empty.
        workers = node.get('workers') or {}

        # NOTE(review): any positive lastSeen counts as online; no staleness
        # window is applied. Confirm this matches the server's semantics.
        is_online = isinstance(last_seen, (int, float)) and last_seen > 0

        return NodeInfo(
            id=node.get('_id'),
            nodeName=node.get('nodeName'),
            status='online' if is_online else 'offline',
            lastSeen=last_seen,
            version=node.get('version'),
            platform=node.get('platform'),
            workers={
                'cpu': workers.get('CPU', 0),
                'gpu': workers.get('GPU', 0)
            },
            processing=node.get('currentJobs') or []
        )

    def get_node_status(self) -> NodeStatus:
        """Get status of all connected nodes.

        Returns:
            A ``NodeStatus`` with the raw node entries and a ``NodeSummary``
            of online/offline nodes, or one with an empty node list and
            ``error`` when the request failed.
        """
        timestamp = datetime.now().isoformat()

        data = self._make_request('/api/v2/get-nodes')
        if data is None:
            return NodeStatus(
                timestamp=timestamp,
                nodes=[],
                error='Unable to fetch node data'
            )

        nodes = data.get('nodes') or []
        parsed = [self._parse_node(node) for node in nodes]
        online_nodes = [info for info in parsed if info.status == 'online']
        offline_nodes = [info for info in parsed if info.status != 'online']

        node_summary = NodeSummary(
            total_nodes=len(nodes),
            online_nodes=len(online_nodes),
            offline_nodes=len(offline_nodes),
            online_details=online_nodes,
            offline_details=offline_nodes
        )
        return NodeStatus(
            timestamp=timestamp,
            nodes=nodes,
            node_summary=node_summary
        )

    def get_library_status(self) -> LibraryStatus:
        """Get library scan status and file statistics.

        Returns:
            A ``LibraryStatus`` with one ``LibraryInfo`` per library and a
            ``ScanStatus`` of totals, or one with an empty library list and
            ``error`` when the request failed.
        """
        timestamp = datetime.now().isoformat()

        data = self._make_request('/api/v2/get-libraries')
        if data is None:
            return LibraryStatus(
                timestamp=timestamp,
                libraries=[],
                error='Unable to fetch library data'
            )

        libraries = data.get('libraries') or []
        library_stats = [
            LibraryInfo(
                name=lib.get('name'),
                path=lib.get('path'),
                # 'or 0' keeps the totals summable when a value is null.
                file_count=lib.get('totalFiles') or 0,
                scan_progress=lib.get('scanProgress') or 0,
                last_scan=lib.get('lastScan'),
                is_scanning=bool(lib.get('isScanning', False))
            )
            for lib in libraries
        ]

        scan_status = ScanStatus(
            total_libraries=len(libraries),
            total_files=sum(info.file_count for info in library_stats),
            scanning_libraries=sum(1 for info in library_stats if info.is_scanning)
        )
        return LibraryStatus(
            timestamp=timestamp,
            libraries=library_stats,
            scan_status=scan_status
        )

    def get_statistics(self) -> StatisticsStatus:
        """Get overall Tdarr statistics and health metrics.

        Returns:
            A ``StatisticsStatus`` with the counters read from the ``stats``
            object (missing counters default to 0), or one carrying ``error``
            when the request failed.
        """
        timestamp = datetime.now().isoformat()

        data = self._make_request('/api/v2/get-stats')
        if data is None:
            return StatisticsStatus(
                timestamp=timestamp,
                error='Unable to fetch statistics'
            )

        stats = data.get('stats') or {}
        statistics = Statistics(
            total_transcodes=stats.get('totalTranscodes', 0),
            space_saved=stats.get('spaceSaved', 0),
            total_files_processed=stats.get('totalFilesProcessed', 0),
            failed_transcodes=stats.get('failedTranscodes', 0),
            processing_speed=stats.get('processingSpeed', 0),
            eta=stats.get('eta')
        )
        return StatisticsStatus(timestamp=timestamp, statistics=statistics)

    def health_check(self) -> HealthStatus:
        """Perform comprehensive health check.

        Runs the server, node and queue probes (three requests) and combines
        them.

        Returns:
            A ``HealthStatus`` whose ``checks`` has the keys ``'server'``,
            ``'nodes'`` and ``'queue'``. The nodes probe is healthy only when
            at least one node is online; the queue probe is healthy when the
            queue could be fetched.
        """
        timestamp = datetime.now().isoformat()

        # Server connectivity
        server_status = self.get_server_status()
        server_check = HealthCheck(
            status=server_status.status,
            healthy=server_status.status == 'online'
        )

        # Node connectivity
        node_status = self.get_node_status()
        summary = node_status.node_summary
        online_count = summary.online_nodes if summary else 0
        total_count = summary.total_nodes if summary else 0
        nodes_healthy = online_count > 0 and not node_status.error
        nodes_check = HealthCheck(
            status='online' if nodes_healthy else 'offline',
            healthy=nodes_healthy,
            online_count=online_count,
            total_count=total_count
        )

        # Queue status
        queue_status = self.get_queue_status()
        queue_healthy = not queue_status.error
        queue_stats = queue_status.queue_stats
        queue_check = HealthCheck(
            status='accessible' if queue_healthy else 'error',
            healthy=queue_healthy,
            accessible=queue_healthy,
            total_items=queue_stats.total_files if queue_stats else 0
        )

        checks = {
            'server': server_check,
            'nodes': nodes_check,
            'queue': queue_check
        }
        all_checks_healthy = all(check.healthy for check in checks.values())

        return HealthStatus(
            timestamp=timestamp,
            overall_status='healthy' if all_checks_healthy else 'unhealthy',
            checks=checks
        )


# Status classes that expose an ``error`` attribute (HealthStatus does not).
_ERROR_BEARING_TYPES = (
    ServerStatus, QueueStatus, NodeStatus, LibraryStatus, StatisticsStatus
)


def _run_check(monitor: TdarrMonitor, check: str) -> Any:
    """Run the check named by ``--check`` and return its result object(s)."""
    if check == 'all':
        return {
            'server_status': monitor.get_server_status(),
            'queue_status': monitor.get_queue_status(),
            'node_status': monitor.get_node_status(),
            'library_status': monitor.get_library_status(),
            'statistics': monitor.get_statistics()
        }

    single_checks = {
        'status': monitor.get_server_status,
        'queue': monitor.get_queue_status,
        'nodes': monitor.get_node_status,
        'libraries': monitor.get_library_status,
        'stats': monitor.get_statistics,
        'health': monitor.health_check
    }
    return single_checks[check]()


def _print_json(result: Any) -> None:
    """Print a result (or the dict of results from 'all') as indented JSON."""
    if isinstance(result, dict):
        payload = {key: asdict(value) for key, value in result.items()}
    else:
        payload = asdict(result)
    print(json.dumps(payload, indent=2))


def _print_pretty(result: Any) -> None:
    """Print a result in the human-readable format."""
    print(f"=== Tdarr Monitor Results - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} ===")

    if isinstance(result, HealthStatus):
        print(f"Overall Status: {result.overall_status.upper()}")
        if result.checks:
            print("\nHealth Checks:")
            for check_name, check_data in result.checks.items():
                status_icon = "✓" if check_data.healthy else "✗"
                print(f"  {status_icon} {check_name.title()}: {asdict(check_data)}")
    elif isinstance(result, dict):
        for section, data in result.items():
            print(f"\n=== {section.replace('_', ' ').title()} ===")
            print(json.dumps(asdict(data), indent=2))
    else:
        print(json.dumps(asdict(result), indent=2))


def _exit_code(result: Any) -> int:
    """Return 1 when the result reports a failure, otherwise 0."""
    if isinstance(result, HealthStatus):
        return 1 if result.overall_status == 'unhealthy' else 0

    statuses = result.values() if isinstance(result, dict) else [result]
    failed = any(
        isinstance(status, _ERROR_BEARING_TYPES) and status.error
        for status in statuses
    )
    return 1 if failed else 0


def main():
    """Command-line entry point.

    Parses the arguments, runs the requested check, prints the result as
    JSON or in the pretty format, and exits with status 0 on success or 1
    when the result reports an error or an unhealthy state.
    """
    parser = argparse.ArgumentParser(description='Monitor Tdarr server via API')
    parser.add_argument('--server', required=True, help='Tdarr server URL (e.g., http://10.10.0.43:8265)')
    parser.add_argument('--check', choices=['all', 'status', 'queue', 'nodes', 'libraries', 'stats', 'health'],
                        default='health', help='Type of check to perform')
    parser.add_argument('--timeout', type=int, default=30, help='Request timeout in seconds')
    parser.add_argument('--output', choices=['json', 'pretty'], default='pretty', help='Output format')
    parser.add_argument('--verbose', action='store_true', help='Enable verbose logging')

    args = parser.parse_args()

    # Configure logging before the monitor is built. If the level were set
    # first and basicConfig() ran afterwards on a handler-less root logger,
    # it would reset the level to INFO and --verbose would do nothing.
    logging.basicConfig(level=logging.INFO, format=_LOG_FORMAT)
    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    monitor = TdarrMonitor(args.server, args.timeout)
    try:
        result = _run_check(monitor, args.check)
    finally:
        monitor.close()

    if args.output == 'json':
        _print_json(result)
    else:
        _print_pretty(result)

    sys.exit(_exit_code(result))


if __name__ == '__main__':
    main()