cognitive-memory/common.py
Cal Corum 48df2a89ce Initial commit: extract cognitive-memory app from skill directory
Moved application code from ~/.claude/skills/cognitive-memory/ to its own
project directory. The skill layer (SKILL.md, SCHEMA.md) remains in the
skill directory for Claude Code to read.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-28 16:02:28 -06:00

582 lines
17 KiB
Python

"""
Cognitive Memory - Common Constants & Helpers
Module-level constants, YAML parsing, slug generation, decay calculation,
embedding helpers, and cosine similarity. Shared by all other modules.
"""
import json
import math
import os
import re
import urllib.request
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
from urllib.error import URLError
# =============================================================================
# CONSTANTS
# =============================================================================
# Data directory resolution order:
# 1. COGNITIVE_MEMORY_DIR env var (explicit override)
# 2. XDG_DATA_HOME/cognitive-memory/ (Linux standard)
# 3. ~/.local/share/cognitive-memory/ (XDG default)
_env_dir = os.environ.get("COGNITIVE_MEMORY_DIR", "")
if _env_dir:
    MEMORY_DIR = Path(_env_dir).expanduser()
else:
    _xdg_data = os.environ.get("XDG_DATA_HOME", "") or str(
        Path.home() / ".local" / "share"
    )
    MEMORY_DIR = Path(_xdg_data) / "cognitive-memory"

# Well-known metadata files inside MEMORY_DIR.
INDEX_PATH = MEMORY_DIR / "_index.json"
STATE_PATH = MEMORY_DIR / "_state.json"
EMBEDDINGS_PATH = MEMORY_DIR / "_embeddings.json"

# Local Ollama embedding endpoint settings (see _ollama_embed).
OLLAMA_URL = "http://localhost:11434"
EMBEDDING_MODEL = "nomic-embed-text"
EMBEDDING_TIMEOUT = 5  # seconds

CONFIG_PATH = MEMORY_DIR / "_config.json"

# OpenAI embedding fallback settings (see _openai_embed).
OPENAI_EMBED_URL = "https://api.openai.com/v1/embeddings"
OPENAI_MODEL_DEFAULT = "text-embedding-3-small"

# Memory type -> directory name mapping
TYPE_DIRS = {
    "solution": "solutions",
    "fix": "fixes",
    "decision": "decisions",
    "configuration": "configurations",
    "problem": "problems",
    "workflow": "workflows",
    "code_pattern": "code-patterns",
    "error": "errors",
    "general": "general",
    "procedure": "procedures",
    "insight": "insights",
}
VALID_TYPES = list(TYPE_DIRS.keys())

# Decay model type weights
# Multipliers applied in calculate_decay_score; higher = slower effective decay.
TYPE_WEIGHTS = {
    "decision": 1.3,
    "solution": 1.2,
    "insight": 1.25,
    "code_pattern": 1.1,
    "configuration": 1.1,
    "fix": 1.0,
    "workflow": 1.0,
    "problem": 0.9,
    "error": 0.8,
    "general": 0.8,
    "procedure": 1.4,
}
DECAY_LAMBDA = 0.03  # Half-life ~23 days (ln(2) / 0.03 ≈ 23.1)

# Decay score thresholds
# Buckets used by consumers of calculate_decay_score (active/fading/dormant).
THRESHOLD_ACTIVE = 0.5
THRESHOLD_FADING = 0.2
THRESHOLD_DORMANT = 0.05

# Relationship types (subset from MemoryGraph, focused on most useful)
VALID_RELATION_TYPES = [
    "SOLVES",
    "CAUSES",
    "BUILDS_ON",
    "ALTERNATIVE_TO",
    "REQUIRES",
    "FOLLOWS",
    "RELATED_TO",
]

# Edge file constants
EDGES_DIR_NAME = "edges"
# Frontmatter key order for edge files (see serialize_edge_frontmatter).
EDGE_FIELD_ORDER = [
    "id",
    "type",
    "from_id",
    "from_title",
    "to_id",
    "to_title",
    "strength",
    "created",
    "updated",
]

# Frontmatter field order for consistent output (see serialize_frontmatter).
FIELD_ORDER = [
    "id",
    "type",
    "title",
    "tags",
    "importance",
    "confidence",
    "steps",
    "preconditions",
    "postconditions",
    "created",
    "updated",
    "relations",
]

# CORE.md token budget (approximate, 1 token ~= 4 chars)
CORE_MAX_CHARS = 12000  # ~3K tokens

# Named graphs live beside MEMORY_DIR; NOTE(review): with a
# COGNITIVE_MEMORY_DIR override this becomes a sibling of the override dir.
GRAPHS_BASE_DIR = MEMORY_DIR.parent / "cognitive-memory-graphs"
# =============================================================================
# YAML FRONTMATTER PARSING (stdlib only)
# =============================================================================
def _needs_quoting(s: str) -> bool:
"""Check if a YAML string value needs quoting."""
if not s:
return True
if any(c in s for c in ":#{}[]&*?|>!%@`"):
return True
try:
float(s)
return True
except ValueError:
pass
if s.lower() in ("true", "false", "null", "yes", "no", "on", "off"):
return True
return False
def _quote_yaml(s: str) -> str:
"""Quote a string for YAML, escaping internal quotes."""
escaped = s.replace("\\", "\\\\").replace('"', '\\"')
return f'"{escaped}"'
def _format_yaml_value(value: Any, force_quote: bool = False) -> str:
    """Render a Python value as a YAML scalar string.

    None/bool/number map to their YAML literals; everything else is
    stringified and quoted when required (or when force_quote is set).
    """
    if value is None:
        return "null"
    # bool is checked before int/float: it is an int subclass.
    if isinstance(value, bool):
        return "true" if value else "false"
    if isinstance(value, (int, float)):
        return str(value)
    text = str(value)
    if not force_quote and not _needs_quoting(text):
        return text
    return _quote_yaml(text)
def _parse_scalar(value: str) -> Any:
"""Parse a YAML scalar value to Python type."""
v = value.strip()
if not v or v == "null":
return None
if v == "true":
return True
if v == "false":
return False
# Try numeric
try:
if "." in v:
return float(v)
return int(v)
except ValueError:
pass
# Strip quotes
if (v.startswith('"') and v.endswith('"')) or (
v.startswith("'") and v.endswith("'")
):
return v[1:-1]
return v
def serialize_frontmatter(data: Dict[str, Any]) -> str:
    """Serialize a dict to YAML frontmatter string (between --- markers).

    Only keys present in both *data* and FIELD_ORDER are emitted, in
    FIELD_ORDER's fixed order, so repeated writes produce stable output.
    Returns the text including both ``---`` fence lines, no trailing
    newline.
    """
    lines = ["---"]
    for key in FIELD_ORDER:
        if key not in data:
            continue
        value = data[key]
        if key == "tags" and isinstance(value, list):
            # Tags always serialize as an inline flow list, even when empty.
            if value:
                items = ", ".join(_format_yaml_value(t) for t in value)
                lines.append(f"tags: [{items}]")
            else:
                lines.append("tags: []")
        elif key in ("steps", "preconditions", "postconditions") and isinstance(
            value, list
        ):
            # Procedure lists: block list of force-quoted strings; empty
            # lists are omitted entirely.
            if not value:
                continue
            lines.append(f"{key}:")
            for item in value:
                lines.append(f" - {_format_yaml_value(str(item), force_quote=True)}")
        elif key == "relations" and isinstance(value, list):
            # Relations: block list of dicts; keys emitted in a fixed order,
            # missing keys skipped. Read back by _parse_relations_block.
            if not value:
                continue
            lines.append("relations:")
            for rel in value:
                first = True
                for rk in [
                    "target",
                    "type",
                    "direction",
                    "strength",
                    "context",
                    "edge_id",
                ]:
                    if rk not in rel:
                        continue
                    rv = rel[rk]
                    # First key of an entry carries the "- " item marker;
                    # continuation keys are just indented.
                    prefix = " - " if first else " "
                    # Free-text context is always quoted.
                    force_q = rk in ("context",)
                    lines.append(
                        f"{prefix}{rk}: {_format_yaml_value(rv, force_quote=force_q)}"
                    )
                    first = False
        elif key == "title":
            # Titles are arbitrary text: always quote.
            lines.append(f"title: {_format_yaml_value(value, force_quote=True)}")
        else:
            lines.append(f"{key}: {_format_yaml_value(value)}")
    lines.append("---")
    return "\n".join(lines)
def parse_frontmatter(text: str) -> Tuple[Dict[str, Any], str]:
    """Parse YAML frontmatter and body from markdown text.

    Returns (frontmatter_dict, body_text). If *text* does not open with a
    ``---`` fence, returns ({}, text) unchanged. Supports three value
    shapes: inline scalars, inline flow lists (``[a, b]``), and block
    values (simple ``- item`` lists, or the ``relations`` list of dicts
    handled by _parse_relations_block).
    """
    if not text.startswith("---\n"):
        return {}, text
    # Find the closing fence. Searching from offset 3 prevents the opening
    # "---\n" from matching itself; match positions are shifted by 3.
    end_match = re.search(r"\n---\s*\n", text[3:])
    if not end_match:
        # No newline after the closing fence: accept a fence at end of text.
        if text.rstrip().endswith("---"):
            end_pos = text.rstrip().rfind("\n---")
            if end_pos <= 3:
                # The only "---" present is the opening fence itself.
                return {}, text
            fm_text = text[4:end_pos]
            body = ""
        else:
            return {}, text
    else:
        end_pos = end_match.start() + 3  # absolute index of "\n" before "---"
        fm_text = text[4:end_pos]
        # Absolute end of the matched "\n---\s*\n" is match.end() + 3.
        body = text[end_match.end() + 3 :]
        body = body.lstrip("\n")
    data: Dict[str, Any] = {}
    lines = fm_text.split("\n")
    i = 0
    while i < len(lines):
        line = lines[i]
        # Skip empty lines.
        if not line.strip():
            i += 1
            continue
        # Only top-level keys (no leading whitespace) start an entry;
        # indented lines are consumed by the block handler below.
        if line[0] == " ":
            i += 1
            continue
        if ":" not in line:
            i += 1
            continue
        key, _, rest = line.partition(":")
        key = key.strip()
        rest = rest.strip()
        if not rest:
            # Block value - collect the run of indented lines that follows.
            block_lines = []
            j = i + 1
            while j < len(lines) and lines[j] and lines[j][0] == " ":
                block_lines.append(lines[j])
                j += 1
            if key == "relations":
                data["relations"] = _parse_relations_block(block_lines)
            elif block_lines and block_lines[0].strip().startswith("- "):
                # Simple list. Slice off exactly the "- " marker rather than
                # lstrip("- "), which would also eat leading dashes belonging
                # to the value itself (e.g. "- --verbose" -> "verbose").
                data[key] = [
                    _parse_scalar(bl.strip()[2:])
                    for bl in block_lines
                    if bl.strip().startswith("- ")
                ]
            else:
                # Key with neither inline value nor recognizable block.
                data[key] = None
            i = j
            continue
        # Inline flow list: [a, b, c]
        if rest.startswith("[") and rest.endswith("]"):
            inner = rest[1:-1]
            if inner.strip():
                data[key] = [
                    _parse_scalar(v.strip()) for v in inner.split(",") if v.strip()
                ]
            else:
                data[key] = []
        else:
            data[key] = _parse_scalar(rest)
        i += 1
    return data, body
def _parse_relations_block(lines: List[str]) -> List[Dict[str, Any]]:
    """Parse an indented YAML block list of relation dicts."""
    relations: List[Dict[str, Any]] = []
    current: Optional[Dict[str, Any]] = None
    for raw in lines:
        entry = raw.strip()
        if not entry:
            continue
        if entry.startswith("- "):
            # A "- " marker opens a new relation entry; the marker line may
            # itself carry the first key: value pair.
            current = {}
            relations.append(current)
            entry = entry[2:]
            if ":" not in entry:
                continue
        elif current is None or ":" not in entry:
            # Continuation lines before any entry, or without a colon,
            # are ignored.
            continue
        k, _, v = entry.partition(":")
        current[k.strip()] = _parse_scalar(v.strip())
    return relations
# =============================================================================
# HELPER FUNCTIONS
# =============================================================================
def slugify(text: str, max_length: int = 60) -> str:
    """Convert text to a URL-friendly slug."""
    slug = text.strip().lower()
    slug = re.sub(r"[^\w\s-]", "", slug)         # drop punctuation
    slug = re.sub(r"[\s_]+", "-", slug)          # whitespace/underscores -> hyphen
    slug = re.sub(r"-+", "-", slug).strip("-")   # collapse runs, trim edges
    if len(slug) > max_length:
        slug = slug[:max_length].rstrip("-")     # avoid a trailing hyphen after cut
    return slug if slug else "untitled"
def make_filename(title: str, memory_id: str) -> str:
    """Build a memory filename: slugified title plus a 6-char id suffix."""
    return f"{slugify(title)}-{memory_id[:6]}.md"
def calculate_decay_score(
    importance: float, days_since_access: float, access_count: int, type_weight: float
) -> float:
    """Score a memory's relevance under exponential time decay.

    decay_score = importance * e^(-lambda * days) * log2(access_count + 1) * type_weight
    """
    recency = math.exp(-DECAY_LAMBDA * days_since_access)
    if access_count > 0:
        usage = math.log2(access_count + 1)
    else:
        # Never-accessed memories keep a neutral usage factor instead of 0.
        usage = 1.0
    return importance * recency * usage * type_weight
def _ollama_embed(
    texts: List[str],
    model: str = EMBEDDING_MODEL,
    timeout: int = EMBEDDING_TIMEOUT,
) -> Optional[List[List[float]]]:
    """Get embeddings from Ollama for a list of texts.

    POSTs to the local Ollama /api/embed endpoint. Returns the list of
    embedding vectors, or None if Ollama is unavailable. All network,
    HTTP, and payload errors are deliberately swallowed so callers can
    degrade gracefully to non-semantic behavior.
    """
    try:
        payload = json.dumps({"model": model, "input": texts}).encode("utf-8")
        req = urllib.request.Request(
            f"{OLLAMA_URL}/api/embed",
            data=payload,
            headers={"Content-Type": "application/json"},
            method="POST",
        )
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            if resp.status != 200:
                return None
            data = json.loads(resp.read().decode("utf-8"))
            embeddings = data.get("embeddings")
            # Minimal shape check; element types are trusted from the server.
            if embeddings and isinstance(embeddings, list):
                return embeddings
            return None
    except (
        ConnectionRefusedError,
        URLError,  # also covers HTTPError raised by urlopen on non-2xx
        TimeoutError,
        OSError,
        json.JSONDecodeError,
        ValueError,
        KeyError,
    ):
        return None
def _load_memory_config(config_path: Optional[Path] = None) -> Dict[str, Any]:
    """Read _config.json, filling in defaults for missing keys.

    Returns the default config when the file is absent or unreadable.
    """
    defaults: Dict[str, Any] = {
        "embedding_provider": "ollama",
        "openai_api_key": None,
        "ollama_model": EMBEDDING_MODEL,
        "openai_model": OPENAI_MODEL_DEFAULT,
    }
    path = config_path or CONFIG_PATH
    if not path.exists():
        return defaults
    try:
        loaded = json.loads(path.read_text())
    except (json.JSONDecodeError, OSError):
        # Corrupt or unreadable config: fall back to defaults silently.
        return defaults
    for key, fallback in defaults.items():
        loaded.setdefault(key, fallback)
    return loaded
def _openai_embed(
    texts: List[str],
    api_key: str,
    model: str = OPENAI_MODEL_DEFAULT,
    timeout: int = 30,
) -> Optional[List[List[float]]]:
    """Get embeddings from OpenAI API (stdlib-only, same interface as _ollama_embed).

    Returns one vector per input text, or None on any network/HTTP/payload
    failure (same swallow-everything policy as _ollama_embed).
    """
    try:
        payload = json.dumps({"input": texts, "model": model}).encode("utf-8")
        req = urllib.request.Request(
            OPENAI_EMBED_URL,
            data=payload,
            headers={
                "Content-Type": "application/json",
                "Authorization": f"Bearer {api_key}",
            },
            method="POST",
        )
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            if resp.status != 200:
                return None
            data = json.loads(resp.read().decode("utf-8"))
            items = data.get("data", [])
            if items and isinstance(items, list):
                # Sort by index to ensure order matches input; the API does
                # not guarantee response order.
                items.sort(key=lambda x: x.get("index", 0))
                return [item["embedding"] for item in items]
            return None
    except (
        ConnectionRefusedError,
        URLError,  # also covers HTTPError (e.g. 401/429) from urlopen
        TimeoutError,
        OSError,
        json.JSONDecodeError,
        ValueError,
        KeyError,  # missing "embedding" key in a response item
    ):
        return None
def _cosine_similarity(a: List[float], b: List[float]) -> float:
"""Compute cosine similarity between two vectors."""
dot = sum(x * y for x, y in zip(a, b))
norm_a = math.sqrt(sum(x * x for x in a))
norm_b = math.sqrt(sum(x * x for x in b))
if norm_a == 0.0 or norm_b == 0.0:
return 0.0
return dot / (norm_a * norm_b)
def _make_edge_filename(
    from_title: str, rel_type: str, to_title: str, edge_id: str
) -> str:
    """Produce edge filename: {from-slug}--{TYPE}--{to-slug}-{6char}.md"""
    # Shorter slugs (30 chars) keep the combined filename manageable.
    left = slugify(from_title, max_length=30)
    right = slugify(to_title, max_length=30)
    return f"{left}--{rel_type}--{right}-{edge_id[:6]}.md"
def serialize_edge_frontmatter(data: Dict[str, Any]) -> str:
    """Serialize an edge dict to YAML frontmatter string.

    Keys are emitted in EDGE_FIELD_ORDER; missing keys are skipped.
    Title fields are always quoted since they hold arbitrary text.
    """
    out = ["---"]
    for field in EDGE_FIELD_ORDER:
        if field not in data:
            continue
        quoted = field in ("from_title", "to_title")
        out.append(f"{field}: {_format_yaml_value(data[field], force_quote=quoted)}")
    out.append("---")
    return "\n".join(out)
def load_graph_config(config_path: Optional[Path] = None) -> Dict[str, Dict[str, Any]]:
    """Return the named-graphs mapping from _config.json ('graphs' key, {} if absent)."""
    return _load_memory_config(config_path).get("graphs", {})
def resolve_graph_path(
    graph_name: Optional[str], config_path: Optional[Path] = None
) -> Path:
    """Resolve a graph name to its directory path.

    None/empty/'default' map to MEMORY_DIR; configured graphs use their
    'path' entry when set; everything else falls back to the conventional
    sibling directory under GRAPHS_BASE_DIR.
    """
    if graph_name in (None, "", "default"):
        return MEMORY_DIR
    entry = load_graph_config(config_path).get(graph_name)
    if entry:
        configured = entry.get("path", "")
        if configured:
            return Path(configured).expanduser()
    # Convention: sibling of MEMORY_DIR.
    return GRAPHS_BASE_DIR / graph_name
def list_graphs(config_path: Optional[Path] = None) -> List[Dict[str, Any]]:
    """List all known graphs: default + configured + discovered on disk.

    Each entry is {"name": ..., "path": ...}; names are de-duplicated with
    config entries taking precedence over on-disk discovery.
    """
    entries: List[Dict[str, Any]] = [{"name": "default", "path": str(MEMORY_DIR)}]
    known = {"default"}
    # Graphs declared in _config.json.
    for name, cfg in load_graph_config(config_path).items():
        if name in known:
            continue
        configured = cfg.get("path", "")
        resolved = (
            str(Path(configured).expanduser())
            if configured
            else str(GRAPHS_BASE_DIR / name)
        )
        entries.append({"name": name, "path": resolved})
        known.add(name)
    # Graphs present on disk but not configured.
    if GRAPHS_BASE_DIR.exists():
        for child in sorted(GRAPHS_BASE_DIR.iterdir()):
            if child.is_dir() and child.name not in known:
                entries.append({"name": child.name, "path": str(child)})
                known.add(child.name)
    return entries