#!/usr/bin/env python3 """Analyze cognitive memories and propose high-quality edges. Reads all active/fading memories (decay >= 0.2), groups by shared tags, and scores candidate relationships based on three signals: - Type heuristics (40%): e.g. fix+problem → SOLVES, decision+solution → BUILDS_ON - Tag overlap (30%): Jaccard similarity of tag sets - Content similarity (30%): Keyword overlap in memory body text Outputs ranked proposals to stdout and saves top 80 as JSON for programmatic use by Claude Code sessions. Usage: python3 edge-proposer.py # Then review stdout output, pick good candidates, and create edges via: # MCP: memory_relate(from_id, to_id, rel_type, description, strength) # CLI: claude-memory relate --description "..." Output: - Ranked candidates printed to stdout (score, type, titles, shared tags) - JSON file saved to ~/.claude/tmp/edge-candidates.json Scoring: - Minimum threshold: 0.15 (below this, candidates are discarded) - Importance boost: 1.2x multiplier when avg importance >= 0.7 - Session-log memories tend to produce noise — review FOLLOWS edges carefully Relation types available: SOLVES, CAUSES, BUILDS_ON, ALTERNATIVE_TO, REQUIRES, FOLLOWS, RELATED_TO First run: 2026-02-19 — produced 5186 candidates from 473 memories, 20 high-quality edges were manually selected and created. """ import json import os import re from pathlib import Path from collections import defaultdict from itertools import combinations # Resolve data directory: COGNITIVE_MEMORY_DIR > XDG_DATA_HOME > default _env_dir = os.environ.get("COGNITIVE_MEMORY_DIR", "") if _env_dir: MEMORY_DIR = Path(_env_dir).expanduser() else: _xdg_data = os.environ.get("XDG_DATA_HOME", "") or str( Path.home() / ".local" / "share" ) MEMORY_DIR = Path(_xdg_data) / "cognitive-memory" STATE_FILE = MEMORY_DIR / "_state.json" GRAPH_DIR = MEMORY_DIR / "graph" EDGES_DIR = GRAPH_DIR / "edges" # Type-based heuristics: (type_a, type_b) -> (suggested_rel, direction, base_score) # direction: "ab" means a->b, "ba" means b->a TYPE_HEURISTICS = { ("fix", "problem"): ("SOLVES", "ab", 0.6), ("solution", "problem"): ("SOLVES", "ab", 0.7), ("solution", "error"): ("SOLVES", "ab", 0.6), ("fix", "error"): ("SOLVES", "ab", 0.6), ("decision", "solution"): ("BUILDS_ON", "ab", 0.3), ("decision", "decision"): ("ALTERNATIVE_TO", None, 0.2), ("solution", "solution"): ("BUILDS_ON", None, 0.2), ("configuration", "solution"): ("REQUIRES", "ab", 0.3), ("workflow", "configuration"): ("REQUIRES", "ab", 0.3), ("insight", "solution"): ("BUILDS_ON", "ab", 0.4), ("insight", "decision"): ("BUILDS_ON", "ab", 0.4), ("fix", "fix"): ("FOLLOWS", None, 0.15), ("fix", "solution"): ("BUILDS_ON", "ab", 0.2), ("code_pattern", "solution"): ("BUILDS_ON", "ab", 0.3), ("procedure", "workflow"): ("BUILDS_ON", "ab", 0.3), ("configuration", "configuration"): ("RELATED_TO", None, 0.1), } def parse_frontmatter(filepath: Path) -> dict | None: """Parse YAML frontmatter from a markdown file.""" try: text = filepath.read_text(encoding="utf-8") except Exception: return None if not text.startswith("---"): return None end = text.find("---", 3) if end == -1: return None fm = {} body = text[end + 3 :].strip() fm["_body"] = body[:500] # first 500 chars of content for matching fm["_filepath"] = str(filepath) for line in text[3:end].strip().splitlines(): if ":" not in line: continue key, _, val = line.partition(":") key = key.strip() val = val.strip().strip('"').strip("'") if key == "tags": # Handle both [a, b] and "a, b" formats val = val.strip("[]") fm["tags"] = [ t.strip().strip('"').strip("'") for t in val.split(",") if t.strip() ] elif key == "importance": try: fm["importance"] = float(val) except ValueError: pass else: fm[key] = val return fm def load_memories() -> dict[str, dict]: """Load all memories from graph subdirectories.""" memories = {} type_dirs = [ "solutions", "fixes", "decisions", "configurations", "problems", "workflows", "code-patterns", "errors", "general", "procedures", "insights", ] for type_dir in type_dirs: dirpath = GRAPH_DIR / type_dir if not dirpath.exists(): continue for f in dirpath.glob("*.md"): fm = parse_frontmatter(f) if fm and "id" in fm: memories[fm["id"]] = fm return memories def load_existing_edges() -> set[tuple[str, str]]: """Load existing edges to avoid duplicates.""" existing = set() if not EDGES_DIR.exists(): return existing for f in EDGES_DIR.glob("*.md"): fm = parse_frontmatter(f) if fm and "from_id" in fm and "to_id" in fm: existing.add((fm["from_id"], fm["to_id"])) existing.add((fm["to_id"], fm["from_id"])) # bidirectional check return existing def load_decay_state() -> dict[str, float]: """Load decay scores from state file.""" if not STATE_FILE.exists(): return {} try: state = json.loads(STATE_FILE.read_text()) return {mid: info.get("decay_score", 0) for mid, info in state.items()} except Exception: return {} def tag_overlap_score(tags_a: list[str], tags_b: list[str]) -> float: """Jaccard similarity of tag sets.""" if not tags_a or not tags_b: return 0.0 set_a, set_b = set(tags_a), set(tags_b) intersection = set_a & set_b union = set_a | set_b return len(intersection) / len(union) if union else 0.0 def content_keyword_overlap(body_a: str, body_b: str) -> float: """Simple keyword overlap between content bodies.""" if not body_a or not body_b: return 0.0 def extract_keywords(text: str) -> set[str]: words = re.findall(r"[a-zA-Z_]{4,}", text.lower()) # Filter common words stopwords = { "that", "this", "with", "from", "have", "been", "were", "they", "their", "will", "would", "could", "should", "which", "where", "when", "what", "about", "into", "also", "more", "some", "then", "than", "each", "only", "used", "using", "after", "before", "because", "between", "through", "during", "added", "updated", "fixed", "error", "issue", "problem", "solution", "memory", "memories", "configuration", "successfully", "working", "works", } return {w for w in words if w not in stopwords} kw_a = extract_keywords(body_a) kw_b = extract_keywords(body_b) if not kw_a or not kw_b: return 0.0 intersection = kw_a & kw_b union = kw_a | kw_b return len(intersection) / len(union) if union else 0.0 def get_type_heuristic( type_a: str, type_b: str ) -> tuple[str, str | None, float] | None: """Look up type-based heuristic, checking both orderings.""" key = (type_a, type_b) if key in TYPE_HEURISTICS: rel, direction, score = TYPE_HEURISTICS[key] return rel, direction, score key_rev = (type_b, type_a) if key_rev in TYPE_HEURISTICS: rel, direction, score = TYPE_HEURISTICS[key_rev] # Flip direction if direction == "ab": direction = "ba" elif direction == "ba": direction = "ab" return rel, direction, score return None def score_pair(mem_a: dict, mem_b: dict) -> dict | None: """Score a candidate edge between two memories.""" tags_a = mem_a.get("tags", []) tags_b = mem_b.get("tags", []) # Must share at least one tag shared_tags = set(tags_a) & set(tags_b) if not shared_tags: return None tag_score = tag_overlap_score(tags_a, tags_b) content_score = content_keyword_overlap( mem_a.get("_body", ""), mem_b.get("_body", "") ) type_a = mem_a.get("type", "general") type_b = mem_b.get("type", "general") heuristic = get_type_heuristic(type_a, type_b) if heuristic: suggested_rel, direction, type_score = heuristic else: suggested_rel = "RELATED_TO" direction = None type_score = 0.05 # Composite score total = (tag_score * 0.4) + (content_score * 0.3) + (type_score * 0.3) # Boost for high importance memories imp_a = mem_a.get("importance", 0.5) imp_b = mem_b.get("importance", 0.5) if isinstance(imp_a, str): imp_a = float(imp_a) if isinstance(imp_b, str): imp_b = float(imp_b) avg_importance = (imp_a + imp_b) / 2 if avg_importance >= 0.7: total *= 1.2 if total < 0.15: return None # Determine from/to based on direction if direction == "ab": from_mem, to_mem = mem_a, mem_b elif direction == "ba": from_mem, to_mem = mem_b, mem_a else: # Default: higher importance is "from" if imp_a >= imp_b: from_mem, to_mem = mem_a, mem_b else: from_mem, to_mem = mem_b, mem_a return { "score": round(total, 3), "rel_type": suggested_rel, "from_id": from_mem["id"], "from_title": from_mem.get("title", "?"), "from_type": from_mem.get("type", "?"), "to_id": to_mem["id"], "to_title": to_mem.get("title", "?"), "to_type": to_mem.get("type", "?"), "shared_tags": sorted(shared_tags), "tag_score": round(tag_score, 3), "content_score": round(content_score, 3), "type_score": round(type_score, 3), } def main(): print("Loading memories...") memories = load_memories() print(f" Found {len(memories)} memories") print("Loading decay state...") decay_scores = load_decay_state() # Filter to active + fading only (decay >= 0.2) active_ids = { mid for mid, score in decay_scores.items() if score >= 0.2 and mid in memories } # Also include memories without decay state (new) for mid in memories: if mid not in decay_scores: active_ids.add(mid) active_memories = {mid: memories[mid] for mid in active_ids} print(f" {len(active_memories)} active/fading memories to analyze") print("Loading existing edges...") existing = load_existing_edges() print(f" {len(existing) // 2} existing edges") print("Scoring candidate pairs...") candidates = [] # Group by shared tags first to reduce pair space tag_groups = defaultdict(set) for mid, mem in active_memories.items(): for tag in mem.get("tags", []): tag_groups[tag].add(mid) # Collect unique pairs that share at least one tag seen_pairs = set() for tag, mids in tag_groups.items(): if len(mids) < 2 or len(mids) > 50: # skip too-common tags continue for a, b in combinations(mids, 2): pair = tuple(sorted([a, b])) if pair in seen_pairs: continue if (a, b) in existing or (b, a) in existing: continue seen_pairs.add(pair) result = score_pair(active_memories[a], active_memories[b]) if result: candidates.append(result) # Sort by score descending candidates.sort(key=lambda x: x["score"], reverse=True) print(f"\n{'='*100}") print(f"TOP EDGE CANDIDATES ({len(candidates)} total, showing top 80)") print(f"{'='*100}\n") for i, c in enumerate(candidates[:80], 1): print(f"#{i:3d} | Score: {c['score']:.3f} | {c['rel_type']}") print(f" FROM [{c['from_type']}] {c['from_title']}") print(f" TO [{c['to_type']}] {c['to_title']}") print( f" Tags: {', '.join(c['shared_tags'])} | " f"tag={c['tag_score']:.2f} content={c['content_score']:.2f} type={c['type_score']:.2f}" ) print(f" IDs: {c['from_id']} -> {c['to_id']}") print() # Also output as JSON for programmatic use json_path = MEMORY_DIR.parent / "tmp" / "edge-candidates.json" json_path.parent.mkdir(parents=True, exist_ok=True) json_path.write_text(json.dumps(candidates[:80], indent=2)) print(f"Full candidates saved to: {json_path}") if __name__ == "__main__": main()