claude-home/monitoring/scripts/nvidia_update_checker.py
Cal Corum d0dbe86fba Add NVIDIA update checker and monitoring scripts documentation
Add nvidia_update_checker.py for weekly driver update monitoring with
Discord alerts. Add scripts CONTEXT.md and update README.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-07 22:21:00 -06:00

301 lines
9.9 KiB
Python
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
NVIDIA Driver Update Checker
Monitors for available updates to held NVIDIA packages and sends
Discord notifications when new versions are available.
This allows manual, planned updates during maintenance windows
rather than surprise auto-updates causing downtime.
Usage:
    # Check for updates (with Discord alert)
    python3 nvidia_update_checker.py --check --discord-alerts

    # Check silently (cron job logging)
    python3 nvidia_update_checker.py --check

    # Test Discord integration
    python3 nvidia_update_checker.py --discord-test
"""
import argparse
import json
import logging
import shlex
import subprocess
import sys
from dataclasses import dataclass, asdict
from datetime import datetime
from typing import List, Optional

import requests
@dataclass
class PackageUpdate:
    """A single package for which a newer version is available."""

    # Package name, without any "/release" suffix.
    name: str
    # Version that is installed right now.
    current_version: str
    # Version the package manager offers as an upgrade.
    available_version: str
    # True when the package is on hold and therefore will not auto-update.
    held: bool
@dataclass
class UpdateCheckResult:
    """Summary produced by one full update check."""

    # ISO-8601 string recording when the check was started.
    timestamp: str
    # True when at least one *held* package has a newer version available.
    updates_available: bool
    # Upgradable packages that are currently held.
    held_packages: List[PackageUpdate]
    # Upgradable packages that are not held.
    other_packages: List[PackageUpdate]
    # Number of upgradable packages found in total (held + other).
    total_updates: int
class DiscordNotifier:
    """Post embed notifications to a Discord webhook."""

    # Discord rejects embeds whose field value exceeds this many characters.
    MAX_FIELD_VALUE_LENGTH = 1024

    def __init__(self, webhook_url: str, timeout: int = 10):
        """Remember the webhook settings.

        Args:
            webhook_url: Discord webhook URL that alerts are POSTed to.
            timeout: Seconds to wait for the HTTP request to complete.
        """
        self.webhook_url = webhook_url
        self.timeout = timeout
        self.logger = logging.getLogger(f"{__name__}.DiscordNotifier")

    def send_alert(self, title: str, description: str, color: int = 0xffa500,
                   fields: Optional[list] = None) -> bool:
        """Send embed alert to Discord.

        Args:
            title: Embed title.
            description: Embed body text.
            color: Embed accent colour as a 0xRRGGBB integer (default orange).
            fields: Optional list of embed field dicts
                (``name`` / ``value`` / ``inline``).

        Returns:
            True when the webhook accepted the request, False on any failure.
            Failures are logged, never raised: notification is best-effort.
        """
        embed = {
            "title": title,
            "description": description,
            "color": color,
            # Attach the local UTC offset; a naive ISO string carries no zone
            # and would be read as UTC, shifting the displayed time.
            "timestamp": datetime.now().astimezone().isoformat(),
            "fields": fields or [],
        }
        payload = {
            "username": "NVIDIA Update Monitor",
            "embeds": [embed],
        }
        try:
            response = requests.post(
                self.webhook_url,
                json=payload,
                timeout=self.timeout,
            )
            response.raise_for_status()
        except Exception as e:
            # Deliberately broad: a failed alert must not abort the check.
            self.logger.error(f"Failed to send Discord notification: {e}")
            return False
        self.logger.info("Discord notification sent successfully")
        return True

    def send_update_available_alert(self, updates: List[PackageUpdate]) -> bool:
        """Send alert when NVIDIA driver updates are available.

        Args:
            updates: Packages to list in the alert.

        Returns:
            True when the alert was delivered; False when delivery failed or
            ``updates`` is empty (nothing is sent in that case).
        """
        if not updates:
            # An empty field value would be rejected by Discord anyway.
            self.logger.debug("No updates to report; skipping Discord alert")
            return False

        version_list = "\n".join(
            f"• **{pkg.name}**: {pkg.current_version} → {pkg.available_version}"
            for pkg in updates
        )
        if len(version_list) > self.MAX_FIELD_VALUE_LENGTH:
            # Truncate rather than have the whole alert rejected.
            version_list = version_list[:self.MAX_FIELD_VALUE_LENGTH - 1] + "…"

        fields = [
            {
                "name": "Available Updates",
                "value": version_list,
                "inline": False,
            },
            {
                "name": "⚠️ Action Required",
                "value": (
                    "These packages are held and will NOT auto-update.\n"
                    "Plan a maintenance window to update manually:\n"
                    "```bash\n"
                    "sudo apt-mark unhold nvidia-driver-570\n"
                    "sudo apt update && sudo apt upgrade\n"
                    "sudo reboot\n"
                    "```"
                ),
                "inline": False,
            },
        ]
        return self.send_alert(
            title="🔔 NVIDIA Driver Update Available",
            description=f"New NVIDIA driver version(s) available for ubuntu-manticore ({len(updates)} package(s))",
            color=0xffa500,  # Orange
            fields=fields,
        )
class NvidiaUpdateChecker:
    """Check a local or remote host for pending NVIDIA package updates."""

    def __init__(self, ssh_host: Optional[str] = None,
                 discord_webhook: Optional[str] = None,
                 enable_discord: bool = False):
        """Configure the checker.

        Args:
            ssh_host: ``user@host`` to run commands on via SSH; when empty or
                None, commands run on the local machine.
            discord_webhook: Webhook URL used for alerts.
            enable_discord: Alerts are sent only when this is True and a
                webhook URL is supplied.
        """
        self.ssh_host = ssh_host
        self.logger = logging.getLogger(__name__)
        self.discord = None
        if enable_discord and discord_webhook:
            self.discord = DiscordNotifier(discord_webhook)

    def _run_command(self, cmd: list, timeout: int = 30) -> tuple:
        """Run command locally or via SSH.

        Args:
            cmd: Command as an argv list.
            timeout: Seconds to wait before giving up.

        Returns:
            ``(returncode, stdout, stderr)`` with both streams stripped.
            A timeout or launch failure yields ``(-1, "", <message>)``.
        """
        if self.ssh_host:
            # ssh hands the remote side a single command line, so quote each
            # argument to preserve the argv boundaries.
            remote_cmd = " ".join(shlex.quote(part) for part in cmd)
            cmd = ["ssh", self.ssh_host, remote_cmd]
        try:
            # Always pass a list with shell=False. With shell=True on POSIX
            # only the first list item is executed and the rest become
            # arguments to the shell, which ran a bare `ssh`.
            result = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                timeout=timeout,
            )
        except subprocess.TimeoutExpired:
            return -1, "", "Command timed out"
        except Exception as e:
            return -1, "", str(e)
        return result.returncode, result.stdout.strip(), result.stderr.strip()

    def get_held_packages(self) -> List[str]:
        """Get list of held packages.

        Returns:
            Package names printed by ``apt-mark showhold``; an empty list when
            the command fails (the failure is logged).
        """
        code, stdout, stderr = self._run_command(["apt-mark", "showhold"])
        if code != 0:
            self.logger.error(f"Failed to get held packages: {stderr}")
            return []
        return [line.strip() for line in stdout.split("\n") if line.strip()]

    def check_package_updates(self) -> List[PackageUpdate]:
        """Check for available updates.

        Refreshes the package cache, lists upgradable packages and keeps the
        ones whose name contains "nvidia".

        Returns:
            One ``PackageUpdate`` per upgradable NVIDIA package; an empty list
            when the upgradable listing cannot be obtained.
        """
        # Refresh the package cache. A failure here is not fatal, but the
        # listing below may be stale, so make it visible.
        code, _, stderr = self._run_command(["apt-get", "update", "-qq"])
        if code != 0:
            self.logger.warning(
                f"Package cache refresh failed; results may be stale: {stderr}"
            )

        code, stdout, stderr = self._run_command(["apt", "list", "--upgradable"])
        if code != 0:
            self.logger.error(f"Failed to check updates: {stderr}")
            return []

        # Set for O(1) membership tests inside the loop.
        held_packages = set(self.get_held_packages())
        updates = []
        for line in stdout.split("\n"):
            if "/" not in line or "[upgradable" not in line:
                continue
            # Parse: package/release version arch [upgradable from: old_version]
            parts = line.split()
            if len(parts) < 6:
                continue
            package_name = parts[0].split("/")[0]
            # Filter for NVIDIA packages
            if "nvidia" not in package_name.lower():
                continue
            updates.append(PackageUpdate(
                name=package_name,
                current_version=parts[5].rstrip("]"),
                available_version=parts[1],
                held=package_name in held_packages,
            ))
        return updates

    def check_updates(self) -> UpdateCheckResult:
        """Perform full update check.

        Returns:
            An ``UpdateCheckResult``. ``updates_available`` is True only when
            a *held* package has an update. In that case, and when Discord is
            enabled, an alert listing the held packages is sent.
        """
        timestamp = datetime.now().isoformat()
        updates = self.check_package_updates()
        held_updates = [u for u in updates if u.held]
        other_updates = [u for u in updates if not u.held]
        result = UpdateCheckResult(
            timestamp=timestamp,
            updates_available=len(held_updates) > 0,
            held_packages=held_updates,
            other_packages=other_updates,
            total_updates=len(updates),
        )
        # Send Discord alert for held packages with updates
        if result.updates_available and self.discord:
            self.discord.send_update_available_alert(held_updates)
        return result
def main():
    """Command-line entry point.

    ``--discord-test`` sends a test embed and exits 0 on success, 1 on
    failure. ``--check`` runs the update check, prints the result as JSON or
    human-readable text, and exits 1 when held packages have updates, else 0.
    With neither flag the help text is printed.
    """
    parser = argparse.ArgumentParser(
        description='Monitor NVIDIA driver updates on held packages'
    )
    parser.add_argument('--check', action='store_true', help='Check for updates')
    # NOTE(review): this default embeds a live webhook secret in source
    # control. Rotate it and supply the URL via this flag or the environment.
    parser.add_argument('--discord-webhook',
                        default='https://discord.com/api/webhooks/1404105821549498398/y2Ud1RK9rzFjv58xbypUfQNe3jrL7ZUq1FkQHa4_dfOHm2ylp93z0f4tY0O8Z-vQgKhD',
                        help='Discord webhook URL')
    parser.add_argument('--discord-alerts', action='store_true',
                        help='Enable Discord alerts')
    parser.add_argument('--discord-test', action='store_true',
                        help='Test Discord integration')
    parser.add_argument('--ssh-host', default='cal@10.10.0.226',
                        help='SSH host for remote monitoring')
    parser.add_argument('--output', choices=['json', 'pretty'], default='pretty')
    parser.add_argument('--verbose', action='store_true', help='Verbose logging')
    args = parser.parse_args()

    # Configure logging
    level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(
        level=level,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    )

    # Discord test
    if args.discord_test:
        notifier = DiscordNotifier(args.discord_webhook)
        success = notifier.send_alert(
            title="NVIDIA Update Monitor Test",
            description="Discord integration is working correctly.",
            color=0x00ff00,
            fields=[
                {"name": "Host", "value": args.ssh_host, "inline": True},
                {"name": "Status", "value": "Test successful", "inline": True}
            ]
        )
        sys.exit(0 if success else 1)

    # Update check
    if args.check:
        checker = NvidiaUpdateChecker(
            ssh_host=args.ssh_host,
            discord_webhook=args.discord_webhook,
            enable_discord=args.discord_alerts
        )
        result = checker.check_updates()
        if args.output == 'json':
            print(json.dumps(asdict(result), indent=2))
        else:
            print(f"=== NVIDIA Update Check - {result.timestamp} ===")
            if result.updates_available:
                print(f"\n⚠️ {len(result.held_packages)} held package(s) have updates:")
                for pkg in result.held_packages:
                    # Separate old and new versions so they do not run together.
                    print(f"{pkg.name}: {pkg.current_version} → {pkg.available_version}")
                print("\nThese packages will NOT auto-update (held)")
                print("Plan a maintenance window to update manually")
            else:
                print("\n✅ All held NVIDIA packages are up to date")
            if result.other_packages:
                print(f"\n {len(result.other_packages)} other NVIDIA package(s) have updates:")
                for pkg in result.other_packages:
                    print(f"{pkg.name}: {pkg.current_version} → {pkg.available_version}")
        # Non-zero exit signals "held packages have updates" to callers.
        sys.exit(0 if not result.updates_available else 1)

    parser.print_help()


if __name__ == '__main__':
    main()