#!/usr/bin/env bash
# homelab-audit.sh — SSH-based homelab health audit
#
# Runs on the Proxmox host. Discovers running LXCs and VMs, SSHes into each
# to collect system metrics, then generates a summary report.
#
# Usage:
#   homelab-audit.sh [--output-dir DIR]
#
# Environment overrides:
#   STUCK_PROC_CPU_WARN  CPU% at which a D-state process is flagged (default: 10)
#   REPORT_DIR           Output directory for per-host reports and logs
#   SSH_USER             Remote user (default: root)

# -e omitted intentionally — unreachable hosts should not abort the full audit
set -uo pipefail

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
STUCK_PROC_CPU_WARN="${STUCK_PROC_CPU_WARN:-10}"
REPORT_DIR="${REPORT_DIR:-/tmp/homelab-audit-$(date +%Y%m%d-%H%M%S)}"
SSH_USER="${SSH_USER:-root}"
# Expanded unquoted (word-split) by ssh_cmd, so no option value here may
# contain whitespace.
SSH_OPTS="-o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o BatchMode=yes"

# Alert thresholds. DISK_* and MEM_WARN are percentages; LOAD_WARN is compared
# against the CPU_LOAD value reported by the remote collector.
DISK_WARN=80
DISK_CRIT=90
LOAD_WARN=2.0
MEM_WARN=85

#######################################
# Parse command-line options.
# Globals:   REPORT_DIR (overwritten by --output-dir)
# Arguments: the script's positional parameters
# Outputs:   a usage error on stderr for bad input
# Returns:   0 on success; exits 1 on an unknown option or a missing value
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --output-dir)
        # Without this check a missing value surfaces as an opaque
        # "unbound variable" abort under `set -u`.
        if [[ $# -lt 2 || -z "$2" ]]; then
          echo "Option --output-dir requires a directory argument" >&2
          exit 1
        fi
        REPORT_DIR="$2"
        shift 2
        ;;
      *)
        echo "Unknown option: $1" >&2
        exit 1
        ;;
    esac
  done
}

parse_args "$@"
# Every option has been consumed; clear the positional parameters so code
# further down sees none.
set --

# -e is off, so a failed mkdir must be caught explicitly: otherwise the audit
# would keep running with every report/log redirect failing.
if ! mkdir -p -- "$REPORT_DIR"; then
  echo "Cannot create report directory: $REPORT_DIR" >&2
  exit 1
fi
SSH_FAILURES_LOG="$REPORT_DIR/ssh-failures.log"
FINDINGS_FILE="$REPORT_DIR/findings.txt"

# ---------------------------------------------------------------------------
# Remote collector script
#
# Kept single-quoted so no local variables are interpolated into the script text.
# STUCK_PROC_CPU_WARN is passed as $1 when invoking the remote bash session,
# so the configurable threshold reaches the collector without escaping issues.
# ---------------------------------------------------------------------------
COLLECTOR_SCRIPT='#!/usr/bin/env bash
# Remote metrics collector: executed on each guest as "bash -s -- THRESHOLD".
# Prints KEY=VALUE lines followed by one "DISK <pct> <mount>" line per
# filesystem.
#
# The awk programs below are double-quoted, so every awk field reference
# must be escaped as \$N. A bare $N is expanded by bash first (to a
# positional parameter, normally empty) and awk never sees the field.
STUCK_PROC_CPU_WARN="${1:-10}"

# uptime and free are parsed as text; pin the C locale so labels and
# decimal separators have the form the parsers below expect.
export LC_ALL=C

# First value after "load average:" in the uptime output.
cpu_load() {
  uptime | awk -F"load average:" "{print \$2}" | awk -F"[, ]+" "{print \$2}"
}

# Column 3 as a whole-number percentage of column 2 of the "Mem:" row
# printed by free.
mem_pct() {
  free | awk "/^Mem:/ {printf \"%.0f\", \$3/\$2*100}"
}

# One "<pct> <mount>" line per filesystem (tmpfs/devtmpfs excluded), with the
# trailing "%" stripped from the percentage.
disk_usage() {
  df --output=pcent,target -x tmpfs -x devtmpfs 2>/dev/null | tail -n +2 | \
    while read -r pct mnt; do echo "${pct%%%} $mnt"; done
}

# Number of processes whose state starts with Z. grep -c exits 1 on a zero
# count, hence the "|| true".
zombie_count() {
  ps -eo stat= | grep -c "^Z" || true
}

# Comma-separated command names of D-state processes using at least
# STUCK_PROC_CPU_WARN percent CPU.
stuck_procs() {
  ps -eo stat=,pcpu=,comm= | \
    awk -v t="$STUCK_PROC_CPU_WARN" "\$1 ~ /^D/ && \$2+0 >= t+0 {print \$3}" | \
    paste -sd,
}

echo "CPU_LOAD=$(cpu_load)"
echo "MEM_PCT=$(mem_pct)"
echo "ZOMBIES=$(zombie_count)"
echo "STUCK_PROCS=$(stuck_procs)"
disk_usage | while read -r pct mnt; do
  echo "DISK $pct $mnt"
done
'

# ---------------------------------------------------------------------------
# SSH helper — logs stderr to ssh-failures.log instead of silently discarding
# ---------------------------------------------------------------------------
ssh_cmd() {
  # Usage: ssh_cmd HOST [REMOTE_COMMAND...]
  # Runs the remote command as SSH_USER on HOST. stdin and stdout pass
  # through untouched; ssh's stderr is appended to SSH_FAILURES_LOG.
  # The exit status is that of ssh.
  local target="${SSH_USER}@$1"
  shift
  # SSH_OPTS packs several options into one string, so it has to word-split.
  # shellcheck disable=SC2086
  ssh $SSH_OPTS "$target" "$@" 2>>"$SSH_FAILURES_LOG"
}

# ---------------------------------------------------------------------------
# LXC IP discovery
#
# lxc-info only returns IPs for containers using Proxmox-managed DHCP bridges.
# Containers with static IPs defined inside the container (not via Proxmox
# network config) return nothing. Fall back to parsing `pct config` in that
# case to find the ip= field from the container's network interface config.
# ---------------------------------------------------------------------------
get_lxc_ip() {
  # Usage: get_lxc_ip CTID
  # Prints the container's address: the first line reported by lxc-info, or,
  # when that is empty, the first literal IPv4 address found in an ip= field
  # of a netN: line of `pct config`. Prints an empty line when neither source
  # yields an address.
  local ctid="$1"
  local ip line
  # Matches ip=<dotted quad> only. Values such as ip=dhcp or ip=manual, and
  # the ip6= key, are not addresses and must not be returned as one.
  local ipv4_re='(^|[ ,])ip=([0-9]{1,3}(\.[0-9]{1,3}){3})(/|,|$)'

  ip=$(lxc-info -n "$ctid" -iH 2>/dev/null | head -1)
  if [[ -z "$ip" ]]; then
    while IFS= read -r line; do
      # Only network interface lines carry the address; skip everything
      # else (e.g. free-text fields that could contain "ip=").
      [[ "$line" == net[0-9]* ]] || continue
      if [[ "$line" =~ $ipv4_re ]]; then
        ip="${BASH_REMATCH[2]}"
        break
      fi
    done < <(pct config "$ctid" 2>/dev/null)
  fi
  echo "$ip"
}

# ---------------------------------------------------------------------------
# Inventory: running LXCs and VMs
# Returns lines of "label ip"
# ---------------------------------------------------------------------------
collect_inventory() {
  # Prints one "label ip" line per running guest whose address can be found.
  # Labels are lxc-<ctid> for containers and vm-<vmid> for virtual machines.
  # Guests that are not running, or for which no address is found, are
  # omitted.
  local ip
  # Reads the guest agent's network-get-interfaces JSON on stdin and prints
  # the first IPv4 address that does not start with "127."; prints nothing on
  # any parse error.
  local first_ipv4_py="
import sys, json
try:
    data = json.load(sys.stdin)
    for iface in data:
        for addr in iface.get('ip-addresses', []):
            if addr['ip-address-type'] == 'ipv4' and not addr['ip-address'].startswith('127.'):
                print(addr['ip-address'])
                raise SystemExit
except Exception:
    pass
"

  # Containers: address comes from get_lxc_ip.
  pct list 2>/dev/null | tail -n +2 | while read -r ct_id ct_status _ct_name; do
    [[ "$ct_status" == "running" ]] || continue
    ip=$(get_lxc_ip "$ct_id")
    [[ -n "$ip" ]] && echo "lxc-${ct_id} $ip"
  done

  # Virtual machines: address comes from the QEMU guest agent only, so a VM
  # whose agent query yields nothing is left out of the inventory.
  qm list 2>/dev/null | tail -n +2 | while read -r vm_id _vm_name vm_status _vm_mem _vm_disk _vm_pid; do
    [[ "$vm_status" == "running" ]] || continue
    ip=$(qm guest cmd "$vm_id" network-get-interfaces 2>/dev/null |
      python3 -c "$first_ipv4_py" 2>/dev/null)
    [[ -n "$ip" ]] && echo "vm-${vm_id} $ip"
  done
}

# ---------------------------------------------------------------------------
# Collect metrics from one host and record findings
# ---------------------------------------------------------------------------
parse_and_report() {
  # Usage: parse_and_report LABEL ADDR
  # Pipes COLLECTOR_SCRIPT to a remote bash on ADDR through ssh_cmd, then
  # turns the returned metric lines into WARN/CRIT entries in FINDINGS_FILE.
  # On SSH failure it records an SSH_FAILURE line in SSH_FAILURES_LOG plus a
  # WARN finding, and returns.
  #
  # Everything the guest sends back is untrusted. Numeric fields are
  # validated against a strict pattern BEFORE they reach (( )) or awk:
  # bash arithmetic evaluates variable contents recursively (a value such as
  # a[$(cmd)] would run cmd on this host), and text pasted into an awk
  # program can call system().
  local label="$1"
  local addr="$2"
  local raw line load mem zombies procs pct mnt
  local uint_re='^[0-9]+$'
  local decimal_re='^[0-9]+([.][0-9]+)?$'

  if ! raw=$(echo "$COLLECTOR_SCRIPT" | ssh_cmd "$addr" bash -s -- "$STUCK_PROC_CPU_WARN"); then
    echo "SSH_FAILURE $label $addr" >>"$SSH_FAILURES_LOG"
    echo "WARN  $label: SSH connection failed" >>"$FINDINGS_FILE"
    return
  fi

  while IFS= read -r line; do
    case "$line" in
      CPU_LOAD=*)
        load="${line#CPU_LOAD=}"
        if [[ "$load" =~ $decimal_re ]]; then
          # Values are handed to awk as data (-v), never as program text.
          if awk -v cur="$load" -v max="$LOAD_WARN" 'BEGIN { exit !(cur + 0 > max + 0) }'; then
            echo "WARN  $label: load average ${load} > ${LOAD_WARN}" >>"$FINDINGS_FILE"
          fi
        elif [[ -n "$load" ]]; then
          _report_malformed_metric "$label" "CPU_LOAD" "$load"
        fi
        ;;
      MEM_PCT=*)
        mem="${line#MEM_PCT=}"
        if [[ "$mem" =~ $uint_re ]]; then
          # 10# forces base 10 so a leading zero is not read as octal.
          if ((10#$mem >= MEM_WARN)); then
            echo "WARN  $label: memory ${mem}% >= ${MEM_WARN}%" >>"$FINDINGS_FILE"
          fi
        elif [[ -n "$mem" ]]; then
          _report_malformed_metric "$label" "MEM_PCT" "$mem"
        fi
        ;;
      ZOMBIES=*)
        zombies="${line#ZOMBIES=}"
        if [[ "$zombies" =~ $uint_re ]]; then
          if ((10#$zombies > 0)); then
            echo "WARN  $label: ${zombies} zombie process(es)" >>"$FINDINGS_FILE"
          fi
        elif [[ -n "$zombies" ]]; then
          _report_malformed_metric "$label" "ZOMBIES" "$zombies"
        fi
        ;;
      STUCK_PROCS=*)
        procs="${line#STUCK_PROCS=}"
        if [[ -n "$procs" ]]; then
          echo "WARN  $label: D-state procs with CPU>=${STUCK_PROC_CPU_WARN}%: ${procs}" >>"$FINDINGS_FILE"
        fi
        ;;
      DISK\ *)
        read -r _ pct mnt <<<"$line"
        if [[ "$pct" =~ $uint_re ]]; then
          if ((10#$pct >= DISK_CRIT)); then
            echo "CRIT  $label: disk ${mnt} at ${pct}% >= ${DISK_CRIT}%" >>"$FINDINGS_FILE"
          elif ((10#$pct >= DISK_WARN)); then
            echo "WARN  $label: disk ${mnt} at ${pct}% >= ${DISK_WARN}%" >>"$FINDINGS_FILE"
          fi
        elif [[ "$pct" != "-" ]]; then
          # df prints "-" when it has no usage figure; that is skipped
          # quietly, anything else non-numeric is reported.
          _report_malformed_metric "$label" "DISK" "$pct"
        fi
        ;;
    esac
  done <<<"$raw"
}

# Note a rejected remote metric on stderr; nothing is added to the findings.
# Arguments: $1 - host label, $2 - metric name, $3 - offending value
_report_malformed_metric() {
  printf '  %s: ignoring malformed %s value %q\n' "$1" "$2" "$3" >&2
}

# ---------------------------------------------------------------------------
# Summary — driven by actual findings in findings.txt and ssh-failures.log
# ---------------------------------------------------------------------------
generate_summary() {
  # Usage: generate_summary HOST_COUNT
  # Prints the summary banner with counts taken from SSH_FAILURES_LOG
  # (lines starting with SSH_FAILURE) and FINDINGS_FILE (lines starting with
  # WARN / CRIT), then the sorted findings and the failed hosts when there
  # are any, and finally the report directory.
  local host_count="$1"
  local ssh_failure_count=0
  local warn_count=0
  local crit_count=0
  local rule="=============================="

  # grep -c exits 1 when it counts zero matches; "|| true" keeps the "0".
  if [[ -f "$SSH_FAILURES_LOG" ]]; then
    ssh_failure_count=$(grep -c '^SSH_FAILURE' "$SSH_FAILURES_LOG" 2>/dev/null || true)
  fi
  if [[ -f "$FINDINGS_FILE" ]]; then
    warn_count=$(grep -c '^WARN' "$FINDINGS_FILE" 2>/dev/null || true)
    crit_count=$(grep -c '^CRIT' "$FINDINGS_FILE" 2>/dev/null || true)
  fi

  printf '\n%s\n' "$rule"
  printf '%s\n' "  HOMELAB AUDIT SUMMARY" "$rule"
  printf "  Hosts audited : %d\n" "$host_count"
  printf "  SSH failures  : %d\n" "$ssh_failure_count"
  printf "  Warnings      : %d\n" "$warn_count"
  printf "  Critical      : %d\n" "$crit_count"
  printf '%s\n' "$rule"

  if ((warn_count + crit_count > 0)); then
    printf '\n%s\n' "Findings:"
    sort "$FINDINGS_FILE"
  fi

  if ((ssh_failure_count > 0)); then
    printf '\n%s\n' "SSH failures (see $SSH_FAILURES_LOG for details):"
    # Fields 2 and 3 of an SSH_FAILURE line are the host label and address.
    awk '/^SSH_FAILURE/ {print "  " $2 " (" $3 ")"}' "$SSH_FAILURES_LOG"
  fi

  printf '\n%s\n' "Reports: $REPORT_DIR"
}

# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
main() {
  # Prints the run header, empties FINDINGS_FILE, audits every "label addr"
  # pair produced by collect_inventory, then prints the summary with the
  # number of hosts processed.
  local host_count=0
  local label addr

  printf '%s\n' \
    "Starting homelab audit — $(date)" \
    "Report dir: $REPORT_DIR" \
    "STUCK_PROC_CPU_WARN threshold: ${STUCK_PROC_CPU_WARN}%" \
    ""

  # Start each run from an empty findings file.
  : >"$FINDINGS_FILE"

  # Process substitution (not a pipe) keeps the loop in this shell, so the
  # counter is still set when the loop ends.
  while read -r label addr; do
    printf '  Auditing %s (%s)...\n' "$label" "$addr"
    parse_and_report "$label" "$addr"
    host_count=$((host_count + 1))
  done < <(collect_inventory)

  generate_summary "$host_count"
}

# Run the audit. main does not read its arguments; "$@" is forwarded by
# convention only.
main "$@"