Add nvidia_update_checker.py for weekly driver update monitoring with Discord alerts. Add scripts CONTEXT.md and update README. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
301 lines
9.9 KiB
Python
301 lines
9.9 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
NVIDIA Driver Update Checker
|
||
|
||
Monitors for available updates to held NVIDIA packages and sends
|
||
Discord notifications when new versions are available.
|
||
|
||
This allows manual, planned updates during maintenance windows
|
||
rather than surprise auto-updates causing downtime.
|
||
|
||
Usage:
|
||
# Check for updates (with Discord alert)
|
||
python3 nvidia_update_checker.py --check --discord-alerts
|
||
|
||
# Check silently (cron job logging)
|
||
python3 nvidia_update_checker.py --check
|
||
|
||
# Test Discord integration
|
||
python3 nvidia_update_checker.py --discord-test
|
||
"""
|
||
|
||
import argparse
import json
import logging
import os
import shlex
import subprocess
import sys
from dataclasses import dataclass, asdict
from datetime import datetime
from typing import List, Optional

import requests
|
||
|
||
|
||
@dataclass
class PackageUpdate:
    """One upgradable package parsed from ``apt list --upgradable`` output."""

    # Package name with the "/release" suffix stripped.
    name: str
    # Installed version (the value after "upgradable from:").
    current_version: str
    # Candidate version apt offers as the upgrade.
    available_version: str
    # True when the package name is listed by ``apt-mark showhold``.
    held: bool
|
||
|
||
|
||
@dataclass
class UpdateCheckResult:
    """Outcome of one NVIDIA update check, suitable for ``dataclasses.asdict``."""

    # ISO-8601 string recorded when the check started.
    timestamp: str
    # True only when at least one *held* NVIDIA package has an update.
    updates_available: bool
    # NVIDIA packages with updates that are on hold.
    held_packages: List["PackageUpdate"]
    # NVIDIA packages with updates that are not on hold.
    other_packages: List["PackageUpdate"]
    # Count of all NVIDIA packages with updates (held + other).
    total_updates: int
|
||
|
||
|
||
class DiscordNotifier:
    """Send embed notifications to a Discord webhook."""

    # Discord rejects embed field values longer than 1024 characters.
    _MAX_FIELD_LEN = 1024

    def __init__(self, webhook_url: str, timeout: int = 10):
        """Store the webhook URL and the HTTP timeout (seconds)."""
        self.webhook_url = webhook_url
        self.timeout = timeout
        self.logger = logging.getLogger(f"{__name__}.DiscordNotifier")

    def send_alert(self, title: str, description: str, color: int = 0xffa500,
                   fields: Optional[list] = None) -> bool:
        """Send a single embed to Discord.

        Args:
            title: Embed title.
            description: Embed body text.
            color: Embed accent colour as an RGB integer.
            fields: Optional list of embed field dicts
                (``name`` / ``value`` / ``inline``).

        Returns:
            True when the webhook accepted the request, False on any failure
            (the error is logged, never raised).
        """
        embed = {
            "title": title,
            "description": description,
            "color": color,
            # Attach the local UTC offset; a naive ISO string carries no
            # offset, so the receiver cannot tell which zone it is in.
            "timestamp": datetime.now().astimezone().isoformat(),
            "fields": fields or [],
        }
        payload = {
            "username": "NVIDIA Update Monitor",
            "embeds": [embed],
        }

        try:
            response = requests.post(
                self.webhook_url,
                json=payload,
                timeout=self.timeout,
            )
            response.raise_for_status()
        except Exception as e:
            # Best-effort notification: report the failure to the caller
            # through the return value instead of aborting the check.
            self.logger.error(f"Failed to send Discord notification: {e}")
            return False

        self.logger.info("Discord notification sent successfully")
        return True

    def _clip(self, text: str) -> str:
        """Shorten *text* so it fits in one Discord embed field."""
        if len(text) <= self._MAX_FIELD_LEN:
            return text
        return text[: self._MAX_FIELD_LEN - 1] + "…"

    def _unhold_instructions(self, package_names: list) -> str:
        """Build the manual-update instructions for the given held packages."""
        def render(targets: str) -> str:
            return (
                "These packages are held and will NOT auto-update.\n"
                "Plan a maintenance window to update manually:\n"
                "```bash\n"
                f"sudo apt-mark unhold {targets}\n"
                "sudo apt update && sudo apt upgrade\n"
                "sudo reboot\n"
                "```"
            )

        text = render(" ".join(package_names))
        if len(text) > self._MAX_FIELD_LEN:
            # Too many names to list: fall back to a shell expression rather
            # than truncating in the middle of the code fence.
            text = render("$(apt-mark showhold | grep -i nvidia)")
        return text

    def send_update_available_alert(self, updates: List["PackageUpdate"],
                                    host: str = "ubuntu-manticore") -> bool:
        """Send an alert listing held NVIDIA packages that have updates.

        Args:
            updates: Packages to report; each needs ``name``,
                ``current_version`` and ``available_version``.
            host: Machine name shown in the alert description.

        Returns:
            The result of :meth:`send_alert`, or False without sending when
            *updates* is empty (an empty field value would be rejected).
        """
        if not updates:
            self.logger.warning("No updates to report; Discord alert skipped")
            return False

        version_list = "\n".join(
            f"• **{pkg.name}**: {pkg.current_version} → {pkg.available_version}"
            for pkg in updates
        )

        fields = [
            {
                "name": "Available Updates",
                "value": self._clip(version_list),
                "inline": False,
            },
            {
                "name": "⚠️ Action Required",
                # Name the packages that actually have updates instead of a
                # fixed driver package.
                "value": self._unhold_instructions([pkg.name for pkg in updates]),
                "inline": False,
            },
        ]

        return self.send_alert(
            title="🔔 NVIDIA Driver Update Available",
            description=f"New NVIDIA driver version(s) available for {host} ({len(updates)} package(s))",
            color=0xffa500,  # Orange
            fields=fields,
        )
|
||
|
||
|
||
class NvidiaUpdateChecker:
    """Check a local or remote (SSH) host for NVIDIA package updates."""

    def __init__(self, ssh_host: Optional[str] = None,
                 discord_webhook: Optional[str] = None,
                 enable_discord: bool = False):
        """Configure the target host and optional Discord notifications.

        Args:
            ssh_host: ``user@host`` to run commands on via ``ssh``; a falsy
                value runs them locally.
            discord_webhook: Webhook URL used when alerts are enabled.
            enable_discord: Create a notifier only when this is True and a
                webhook URL is given.
        """
        self.ssh_host = ssh_host
        self.logger = logging.getLogger(__name__)

        self.discord = None
        if enable_discord and discord_webhook:
            self.discord = DiscordNotifier(discord_webhook)

    def _run_command(self, cmd: list, timeout: int = 30) -> tuple:
        """Run *cmd* locally or via SSH.

        Args:
            cmd: Command as an argument list.
            timeout: Seconds to wait before giving up.

        Returns:
            ``(returncode, stdout, stderr)`` with both streams stripped.
            A timeout or launch failure yields ``(-1, "", message)``.
        """
        if self.ssh_host:
            # ssh hands the remote side a single command string, so quote
            # each argument to keep it intact through the remote shell.
            remote = " ".join(shlex.quote(part) for part in cmd)
            cmd = ["ssh", self.ssh_host, remote]

        try:
            # Always pass the argument list with shell=False: with shell=True
            # and a list, only the first item ("ssh") is executed and the
            # rest become arguments to the shell itself.
            result = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                timeout=timeout,
            )
        except subprocess.TimeoutExpired:
            return -1, "", "Command timed out"
        except Exception as e:
            return -1, "", str(e)

        return result.returncode, result.stdout.strip(), result.stderr.strip()

    def get_held_packages(self) -> List[str]:
        """Return the package names printed by ``apt-mark showhold``.

        Returns an empty list (and logs an error) when the command fails.
        """
        code, stdout, stderr = self._run_command(["apt-mark", "showhold"])

        if code != 0:
            self.logger.error(f"Failed to get held packages: {stderr}")
            return []

        return [line.strip() for line in stdout.splitlines() if line.strip()]

    @staticmethod
    def _parse_upgradable(stdout: str, held: set) -> List["PackageUpdate"]:
        """Extract NVIDIA entries from ``apt list --upgradable`` output."""
        updates = []
        for line in stdout.splitlines():
            if "/" not in line or "[upgradable" not in line:
                continue

            # Format: package/release version arch [upgradable from: old_version]
            parts = line.split()
            if len(parts) < 6:
                continue

            package_name = parts[0].split("/")[0]
            if "nvidia" not in package_name.lower():
                continue

            updates.append(PackageUpdate(
                name=package_name,
                current_version=parts[5].rstrip("]"),
                available_version=parts[1],
                held=package_name in held,
            ))
        return updates

    def check_package_updates(self) -> List["PackageUpdate"]:
        """Refresh the package cache and list NVIDIA packages with updates.

        Returns an empty list (and logs an error) when the upgradable list
        cannot be obtained.
        """
        code, _, stderr = self._run_command(["apt-get", "update", "-qq"])
        if code != 0:
            # Keep going with the existing cache, but make the stale-data
            # risk visible instead of ignoring the failure.
            self.logger.warning(
                f"Package cache refresh failed; results may be stale: {stderr}"
            )

        code, stdout, stderr = self._run_command(["apt", "list", "--upgradable"])
        if code != 0:
            self.logger.error(f"Failed to check updates: {stderr}")
            return []

        held = set(self.get_held_packages())
        return self._parse_upgradable(stdout, held)

    def check_updates(self) -> "UpdateCheckResult":
        """Run the full check and send a Discord alert for held packages.

        Returns:
            An ``UpdateCheckResult``; ``updates_available`` is True only when
            a held package has an update.
        """
        timestamp = datetime.now().isoformat()

        updates = self.check_package_updates()
        held_updates = [u for u in updates if u.held]
        other_updates = [u for u in updates if not u.held]

        result = UpdateCheckResult(
            timestamp=timestamp,
            updates_available=bool(held_updates),
            held_packages=held_updates,
            other_packages=other_updates,
            total_updates=len(updates),
        )

        # Only held packages trigger an alert; self.discord is None unless
        # alerts were enabled with a webhook URL.
        if result.updates_available and self.discord:
            self.discord.send_update_available_alert(held_updates)

        return result
|
||
|
||
|
||
def _build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the update checker."""
    parser = argparse.ArgumentParser(
        description='Monitor NVIDIA driver updates on held packages'
    )
    parser.add_argument('--check', action='store_true', help='Check for updates')
    # SECURITY(review): the fallback below is a webhook credential committed
    # to source control. Rotate it, supply the URL through the
    # NVIDIA_UPDATE_DISCORD_WEBHOOK environment variable (or the flag), and
    # then delete the literal. It is kept only so existing jobs keep working.
    parser.add_argument(
        '--discord-webhook',
        default=os.environ.get(
            'NVIDIA_UPDATE_DISCORD_WEBHOOK',
            'https://discord.com/api/webhooks/1404105821549498398/y2Ud1RK9rzFjv58xbypUfQNe3jrL7ZUq1FkQHa4_dfOHm2ylp93z0f4tY0O8Z-vQgKhD',
        ),
        help='Discord webhook URL (default: $NVIDIA_UPDATE_DISCORD_WEBHOOK)')
    parser.add_argument('--discord-alerts', action='store_true',
                        help='Enable Discord alerts')
    parser.add_argument('--discord-test', action='store_true',
                        help='Test Discord integration')
    parser.add_argument('--ssh-host', default='cal@10.10.0.226',
                        help='SSH host for remote monitoring')
    parser.add_argument('--output', choices=['json', 'pretty'], default='pretty')
    parser.add_argument('--verbose', action='store_true', help='Verbose logging')
    return parser


def _print_report(result) -> None:
    """Print the human-readable summary of an update check result."""
    print(f"=== NVIDIA Update Check - {result.timestamp} ===")

    if result.updates_available:
        print(f"\n⚠️ {len(result.held_packages)} held package(s) have updates:")
        for pkg in result.held_packages:
            print(f" • {pkg.name}: {pkg.current_version} → {pkg.available_version}")
        print("\nThese packages will NOT auto-update (held)")
        print("Plan a maintenance window to update manually")
    else:
        print("\n✅ All held NVIDIA packages are up to date")

    if result.other_packages:
        print(f"\nℹ️ {len(result.other_packages)} other NVIDIA package(s) have updates:")
        for pkg in result.other_packages:
            print(f" • {pkg.name}: {pkg.current_version} → {pkg.available_version}")


def main():
    """Command-line entry point.

    ``--discord-test`` sends a test embed and exits 0 on success, 1 on
    failure. ``--check`` runs the update check, prints the result, and exits
    1 when a held package has an update, otherwise 0. With neither flag the
    help text is printed.
    """
    parser = _build_parser()
    args = parser.parse_args()

    # Configure logging
    level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(
        level=level,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    )

    # Discord test
    if args.discord_test:
        notifier = DiscordNotifier(args.discord_webhook)
        success = notifier.send_alert(
            title="NVIDIA Update Monitor Test",
            description="Discord integration is working correctly.",
            color=0x00ff00,
            fields=[
                # An empty --ssh-host means "run locally"; an empty field
                # value would make the test request fail.
                {"name": "Host", "value": args.ssh_host or "localhost", "inline": True},
                {"name": "Status", "value": "Test successful", "inline": True}
            ]
        )
        sys.exit(0 if success else 1)

    # Update check
    if args.check:
        if args.discord_alerts and not args.discord_webhook:
            logging.getLogger(__name__).warning(
                "--discord-alerts given without a webhook URL; no alert will be sent"
            )

        checker = NvidiaUpdateChecker(
            ssh_host=args.ssh_host,
            discord_webhook=args.discord_webhook,
            enable_discord=args.discord_alerts
        )

        result = checker.check_updates()

        if args.output == 'json':
            print(json.dumps(asdict(result), indent=2))
        else:
            _print_report(result)

        # Non-zero exit signals "held updates pending" to the caller.
        sys.exit(1 if result.updates_available else 0)

    parser.print_help()
|
||
|
||
|
||
# Run the CLI only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
|