cognitive-memory/scripts/edge-proposer.py
Cal Corum 48df2a89ce Initial commit: extract cognitive-memory app from skill directory
Moved application code from ~/.claude/skills/cognitive-memory/ to its own
project directory. The skill layer (SKILL.md, SCHEMA.md) remains in the
skill directory for Claude Code to read.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-28 16:02:28 -06:00

428 lines
13 KiB
Python

#!/usr/bin/env python3
"""Analyze cognitive memories and propose high-quality edges.
Reads all active/fading memories (decay >= 0.2), groups by shared tags,
and scores candidate relationships based on three signals:
- Tag overlap (40%): Jaccard similarity of tag sets
- Content similarity (30%): Keyword overlap in memory body text
- Type heuristics (30%): e.g. fix+problem → SOLVES, decision+solution → BUILDS_ON
Outputs ranked proposals to stdout and saves top 80 as JSON for
programmatic use by Claude Code sessions.
Usage:
python3 edge-proposer.py
# Then review stdout output, pick good candidates, and create edges via:
# MCP: memory_relate(from_id, to_id, rel_type, description, strength)
# CLI: claude-memory relate <from_id> <to_id> <REL_TYPE> --description "..."
Output:
- Ranked candidates printed to stdout (score, type, titles, shared tags)
- JSON file saved to <data_dir>/../tmp/edge-candidates.json (a "tmp" directory sibling to the memory data directory)
Scoring:
- Minimum threshold: 0.15 (below this, candidates are discarded)
- Importance boost: 1.2x multiplier when avg importance >= 0.7
- Session-log memories tend to produce noise — review FOLLOWS edges carefully
Relation types available:
SOLVES, CAUSES, BUILDS_ON, ALTERNATIVE_TO, REQUIRES, FOLLOWS, RELATED_TO
First run: 2026-02-19 — produced 5186 candidates from 473 memories,
20 high-quality edges were manually selected and created.
"""
import json
import os
import re
from pathlib import Path
from collections import defaultdict
from itertools import combinations
# Resolve the data directory, in order of precedence:
#   1. COGNITIVE_MEMORY_DIR (explicit override, ~ expanded)
#   2. $XDG_DATA_HOME/cognitive-memory
#   3. ~/.local/share/cognitive-memory (XDG default)
_override = os.environ.get("COGNITIVE_MEMORY_DIR", "")
if _override:
    MEMORY_DIR = Path(_override).expanduser()
else:
    _data_home = os.environ.get("XDG_DATA_HOME", "") or str(
        Path.home() / ".local" / "share"
    )
    MEMORY_DIR = Path(_data_home) / "cognitive-memory"

STATE_FILE = MEMORY_DIR / "_state.json"  # per-memory decay state (JSON)
GRAPH_DIR = MEMORY_DIR / "graph"         # typed memory subdirectories
EDGES_DIR = GRAPH_DIR / "edges"          # existing relationship files
# Type-based heuristics: (type_a, type_b) -> (suggested_rel, direction, base_score)
# direction: "ab" means a->b, "ba" means b->a; None means no inherent direction
# (score_pair then picks the higher-importance memory as the "from" side).
# base_score feeds the 30% type component of the composite score in score_pair.
TYPE_HEURISTICS = {
    ("fix", "problem"): ("SOLVES", "ab", 0.6),
    ("solution", "problem"): ("SOLVES", "ab", 0.7),
    ("solution", "error"): ("SOLVES", "ab", 0.6),
    ("fix", "error"): ("SOLVES", "ab", 0.6),
    ("decision", "solution"): ("BUILDS_ON", "ab", 0.3),
    ("decision", "decision"): ("ALTERNATIVE_TO", None, 0.2),
    ("solution", "solution"): ("BUILDS_ON", None, 0.2),
    ("configuration", "solution"): ("REQUIRES", "ab", 0.3),
    ("workflow", "configuration"): ("REQUIRES", "ab", 0.3),
    ("insight", "solution"): ("BUILDS_ON", "ab", 0.4),
    ("insight", "decision"): ("BUILDS_ON", "ab", 0.4),
    ("fix", "fix"): ("FOLLOWS", None, 0.15),
    ("fix", "solution"): ("BUILDS_ON", "ab", 0.2),
    ("code_pattern", "solution"): ("BUILDS_ON", "ab", 0.3),
    ("procedure", "workflow"): ("BUILDS_ON", "ab", 0.3),
    ("configuration", "configuration"): ("RELATED_TO", None, 0.1),
}
def parse_frontmatter(filepath: Path) -> dict | None:
"""Parse YAML frontmatter from a markdown file."""
try:
text = filepath.read_text(encoding="utf-8")
except Exception:
return None
if not text.startswith("---"):
return None
end = text.find("---", 3)
if end == -1:
return None
fm = {}
body = text[end + 3 :].strip()
fm["_body"] = body[:500] # first 500 chars of content for matching
fm["_filepath"] = str(filepath)
for line in text[3:end].strip().splitlines():
if ":" not in line:
continue
key, _, val = line.partition(":")
key = key.strip()
val = val.strip().strip('"').strip("'")
if key == "tags":
# Handle both [a, b] and "a, b" formats
val = val.strip("[]")
fm["tags"] = [
t.strip().strip('"').strip("'") for t in val.split(",") if t.strip()
]
elif key == "importance":
try:
fm["importance"] = float(val)
except ValueError:
pass
else:
fm[key] = val
return fm
def load_memories() -> dict[str, dict]:
    """Load every memory file from the typed graph subdirectories, keyed by id."""
    type_dirs = (
        "solutions",
        "fixes",
        "decisions",
        "configurations",
        "problems",
        "workflows",
        "code-patterns",
        "errors",
        "general",
        "procedures",
        "insights",
    )
    memories: dict[str, dict] = {}
    for subdir in type_dirs:
        folder = GRAPH_DIR / subdir
        if not folder.exists():
            continue
        for md_file in folder.glob("*.md"):
            parsed = parse_frontmatter(md_file)
            # Files without an id cannot be referenced by edges; skip them.
            if parsed is not None and "id" in parsed:
                memories[parsed["id"]] = parsed
    return memories
def load_existing_edges() -> set[tuple[str, str]]:
    """Return already-connected id pairs, stored in both orders for symmetric lookup."""
    pairs: set[tuple[str, str]] = set()
    if not EDGES_DIR.exists():
        return pairs
    for edge_file in EDGES_DIR.glob("*.md"):
        meta = parse_frontmatter(edge_file)
        if meta and "from_id" in meta and "to_id" in meta:
            src, dst = meta["from_id"], meta["to_id"]
            # Record both orientations so a reversed duplicate is also caught.
            pairs.update({(src, dst), (dst, src)})
    return pairs
def load_decay_state() -> dict[str, float]:
    """Map memory id -> decay score from the state file; {} when absent or unreadable."""
    if not STATE_FILE.exists():
        return {}
    try:
        raw = json.loads(STATE_FILE.read_text())
        # Entries missing a decay_score default to 0 (fully decayed).
        return {mem_id: entry.get("decay_score", 0) for mem_id, entry in raw.items()}
    except Exception:
        return {}
def tag_overlap_score(tags_a: list[str], tags_b: list[str]) -> float:
    """Jaccard similarity of the two tag sets; 0.0 when either list is empty."""
    if not tags_a or not tags_b:
        return 0.0
    left = set(tags_a)
    right = set(tags_b)
    combined = left | right
    if not combined:
        return 0.0
    return len(left & right) / len(combined)
# Words too generic (or too ubiquitous in this corpus — "memory", "fixed",
# "solution") to signal a real content match. Hoisted to module level as a
# frozenset so the set is built once, not on every scored pair.
_STOPWORDS = frozenset({
    "that", "this", "with", "from", "have", "been", "were", "they", "their",
    "will", "would", "could", "should", "which", "where", "when", "what",
    "about", "into", "also", "more", "some", "then", "than", "each", "only",
    "used", "using", "after", "before", "because", "between", "through",
    "during", "added", "updated", "fixed", "error", "issue", "problem",
    "solution", "memory", "memories", "configuration", "successfully",
    "working", "works",
})

# Keywords are lowercase runs of letters/underscores at least 4 chars long.
_KEYWORD_RE = re.compile(r"[a-zA-Z_]{4,}")


def content_keyword_overlap(body_a: str, body_b: str) -> float:
    """Jaccard similarity of keyword sets extracted from two content bodies.

    A keyword is a lowercased alphabetic/underscore run of >= 4 characters
    that is not in the stopword list. Returns 0.0 when either body is empty
    or yields no keywords after filtering.
    """
    if not body_a or not body_b:
        return 0.0

    def extract_keywords(text: str) -> set[str]:
        return {
            w for w in _KEYWORD_RE.findall(text.lower()) if w not in _STOPWORDS
        }

    kw_a = extract_keywords(body_a)
    kw_b = extract_keywords(body_b)
    if not kw_a or not kw_b:
        return 0.0
    union = kw_a | kw_b
    return len(kw_a & kw_b) / len(union) if union else 0.0
def get_type_heuristic(
    type_a: str, type_b: str
) -> tuple[str, str | None, float] | None:
    """Look up the type-pair heuristic, trying both orderings of the key.

    When only the reversed key matches, the direction is flipped so the
    result is always expressed relative to (type_a, type_b). Returns None
    when no rule exists for the pair in either order.
    """
    direct = TYPE_HEURISTICS.get((type_a, type_b))
    if direct is not None:
        return direct
    reversed_hit = TYPE_HEURISTICS.get((type_b, type_a))
    if reversed_hit is None:
        return None
    rel, direction, score = reversed_hit
    # Flip direction; None (undirected) stays None.
    flipped = {"ab": "ba", "ba": "ab"}.get(direction, direction)
    return rel, flipped, score
def score_pair(mem_a: dict, mem_b: dict) -> dict | None:
    """Score a candidate edge between two memories.

    Combines three signals into a composite score — tag Jaccard (40%),
    content keyword overlap (30%), and the type-pair heuristic (30%) —
    with a 1.2x boost when average importance >= 0.7. Returns None when
    the memories share no tags or the final score is below 0.15;
    otherwise returns a proposal dict with from/to ids, titles, types,
    shared tags, and the component scores.
    """
    tags_a = mem_a.get("tags", [])
    tags_b = mem_b.get("tags", [])
    # Must share at least one tag
    shared_tags = set(tags_a) & set(tags_b)
    if not shared_tags:
        return None
    tag_score = tag_overlap_score(tags_a, tags_b)
    content_score = content_keyword_overlap(
        mem_a.get("_body", ""), mem_b.get("_body", "")
    )
    type_a = mem_a.get("type", "general")
    type_b = mem_b.get("type", "general")
    heuristic = get_type_heuristic(type_a, type_b)
    if heuristic:
        suggested_rel, direction, type_score = heuristic
    else:
        # No type rule: fall back to an undirected, weakly-scored RELATED_TO.
        suggested_rel = "RELATED_TO"
        direction = None
        type_score = 0.05
    # Composite score
    total = (tag_score * 0.4) + (content_score * 0.3) + (type_score * 0.3)
    # Boost for high importance memories
    imp_a = mem_a.get("importance", 0.5)
    imp_b = mem_b.get("importance", 0.5)
    # Defensive coercion: importance normally arrives as float from
    # parse_frontmatter, but tolerate string values here too.
    if isinstance(imp_a, str):
        imp_a = float(imp_a)
    if isinstance(imp_b, str):
        imp_b = float(imp_b)
    avg_importance = (imp_a + imp_b) / 2
    if avg_importance >= 0.7:
        total *= 1.2
    # Minimum quality threshold — discard weak candidates.
    if total < 0.15:
        return None
    # Determine from/to based on direction
    if direction == "ab":
        from_mem, to_mem = mem_a, mem_b
    elif direction == "ba":
        from_mem, to_mem = mem_b, mem_a
    else:
        # Default: higher importance is "from"
        if imp_a >= imp_b:
            from_mem, to_mem = mem_a, mem_b
        else:
            from_mem, to_mem = mem_b, mem_a
    return {
        "score": round(total, 3),
        "rel_type": suggested_rel,
        "from_id": from_mem["id"],
        "from_title": from_mem.get("title", "?"),
        "from_type": from_mem.get("type", "?"),
        "to_id": to_mem["id"],
        "to_title": to_mem.get("title", "?"),
        "to_type": to_mem.get("type", "?"),
        "shared_tags": sorted(shared_tags),
        "tag_score": round(tag_score, 3),
        "content_score": round(content_score, 3),
        "type_score": round(type_score, 3),
    }
def main():
    """Load memories, score candidate pairs, and emit ranked edge proposals.

    Prints the top 80 candidates to stdout and writes the same list as JSON
    to a "tmp" directory sibling to the memory data directory.
    """
    print("Loading memories...")
    memories = load_memories()
    print(f" Found {len(memories)} memories")
    print("Loading decay state...")
    decay_scores = load_decay_state()
    # Filter to active + fading only (decay >= 0.2)
    active_ids = {
        mid for mid, score in decay_scores.items() if score >= 0.2 and mid in memories
    }
    # Also include memories without decay state (new)
    for mid in memories:
        if mid not in decay_scores:
            active_ids.add(mid)
    active_memories = {mid: memories[mid] for mid in active_ids}
    print(f" {len(active_memories)} active/fading memories to analyze")
    print("Loading existing edges...")
    existing = load_existing_edges()
    # Each edge is stored in both orientations, so halve for the true count.
    print(f" {len(existing) // 2} existing edges")
    print("Scoring candidate pairs...")
    candidates = []
    # Group by shared tags first to reduce pair space
    tag_groups = defaultdict(set)
    for mid, mem in active_memories.items():
        for tag in mem.get("tags", []):
            tag_groups[tag].add(mid)
    # Collect unique pairs that share at least one tag
    seen_pairs = set()
    for tag, mids in tag_groups.items():
        if len(mids) < 2 or len(mids) > 50:  # skip too-common tags
            continue
        for a, b in combinations(mids, 2):
            # Canonical ordering so (a, b) and (b, a) dedupe to one pair.
            pair = tuple(sorted([a, b]))
            if pair in seen_pairs:
                continue
            if (a, b) in existing or (b, a) in existing:
                continue
            seen_pairs.add(pair)
            result = score_pair(active_memories[a], active_memories[b])
            if result:
                candidates.append(result)
    # Sort by score descending
    candidates.sort(key=lambda x: x["score"], reverse=True)
    print(f"\n{'='*100}")
    print(f"TOP EDGE CANDIDATES ({len(candidates)} total, showing top 80)")
    print(f"{'='*100}\n")
    for i, c in enumerate(candidates[:80], 1):
        print(f"#{i:3d} | Score: {c['score']:.3f} | {c['rel_type']}")
        print(f" FROM [{c['from_type']}] {c['from_title']}")
        print(f" TO [{c['to_type']}] {c['to_title']}")
        print(
            f" Tags: {', '.join(c['shared_tags'])} | "
            f"tag={c['tag_score']:.2f} content={c['content_score']:.2f} type={c['type_score']:.2f}"
        )
        print(f" IDs: {c['from_id']} -> {c['to_id']}")
        print()
    # Also output as JSON for programmatic use
    json_path = MEMORY_DIR.parent / "tmp" / "edge-candidates.json"
    json_path.parent.mkdir(parents=True, exist_ok=True)
    json_path.write_text(json.dumps(candidates[:80], indent=2))
    print(f"Full candidates saved to: {json_path}")


if __name__ == "__main__":
    main()