cognitive-memory/scripts/edge-proposer.py
Cal Corum 48df2a89ce Initial commit: extract cognitive-memory app from skill directory
Moved application code from ~/.claude/skills/cognitive-memory/ to its own
project directory. The skill layer (SKILL.md, SCHEMA.md) remains in the
skill directory for Claude Code to read.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-28 16:02:28 -06:00

428 lines
13 KiB
Python

#!/usr/bin/env python3
"""Analyze cognitive memories and propose high-quality edges.
Reads all active/fading memories (decay >= 0.2), groups by shared tags,
and scores candidate relationships based on three signals:
- Tag overlap (40%): Jaccard similarity of tag sets
- Content similarity (30%): Keyword overlap in memory body text
- Type heuristics (30%): e.g. fix+problem → SOLVES, decision+solution → BUILDS_ON
Outputs ranked proposals to stdout and saves top 80 as JSON for
programmatic use by Claude Code sessions.
Usage:
python3 edge-proposer.py
# Then review stdout output, pick good candidates, and create edges via:
# MCP: memory_relate(from_id, to_id, rel_type, description, strength)
# CLI: claude-memory relate <from_id> <to_id> <REL_TYPE> --description "..."
Output:
- Ranked candidates printed to stdout (score, type, titles, shared tags)
- JSON file saved to <data_dir>/../tmp/edge-candidates.json (a "tmp" directory sibling to the memory data directory)
Scoring:
- Minimum threshold: 0.15 (below this, candidates are discarded)
- Importance boost: 1.2x multiplier when avg importance >= 0.7
- Session-log memories tend to produce noise — review FOLLOWS edges carefully
Relation types available:
SOLVES, CAUSES, BUILDS_ON, ALTERNATIVE_TO, REQUIRES, FOLLOWS, RELATED_TO
First run: 2026-02-19 — produced 5186 candidates from 473 memories,
20 high-quality edges were manually selected and created.
"""
import json
import os
import re
from pathlib import Path
from collections import defaultdict
from itertools import combinations
# Resolve the data directory, in order of precedence:
#   1. COGNITIVE_MEMORY_DIR (explicit override, ~ expanded)
#   2. $XDG_DATA_HOME/cognitive-memory
#   3. ~/.local/share/cognitive-memory (XDG default)
_override = os.environ.get("COGNITIVE_MEMORY_DIR", "")
if _override:
    MEMORY_DIR = Path(_override).expanduser()
else:
    _data_home = os.environ.get("XDG_DATA_HOME", "") or str(
        Path.home() / ".local" / "share"
    )
    MEMORY_DIR = Path(_data_home) / "cognitive-memory"

STATE_FILE = MEMORY_DIR / "_state.json"  # per-memory decay state (JSON)
GRAPH_DIR = MEMORY_DIR / "graph"         # typed memory subdirectories
EDGES_DIR = GRAPH_DIR / "edges"          # existing relationship files
# Type-based heuristics: (type_a, type_b) -> (suggested_rel, direction, base_score)
# direction: "ab" means a->b, "ba" means b->a; None means no inherent direction
# (score_pair then picks the higher-importance memory as the "from" side).
# base_score feeds the 30% type component of the composite score in score_pair.
TYPE_HEURISTICS = {
    ("fix", "problem"): ("SOLVES", "ab", 0.6),
    ("solution", "problem"): ("SOLVES", "ab", 0.7),
    ("solution", "error"): ("SOLVES", "ab", 0.6),
    ("fix", "error"): ("SOLVES", "ab", 0.6),
    ("decision", "solution"): ("BUILDS_ON", "ab", 0.3),
    ("decision", "decision"): ("ALTERNATIVE_TO", None, 0.2),
    ("solution", "solution"): ("BUILDS_ON", None, 0.2),
    ("configuration", "solution"): ("REQUIRES", "ab", 0.3),
    ("workflow", "configuration"): ("REQUIRES", "ab", 0.3),
    ("insight", "solution"): ("BUILDS_ON", "ab", 0.4),
    ("insight", "decision"): ("BUILDS_ON", "ab", 0.4),
    ("fix", "fix"): ("FOLLOWS", None, 0.15),
    ("fix", "solution"): ("BUILDS_ON", "ab", 0.2),
    ("code_pattern", "solution"): ("BUILDS_ON", "ab", 0.3),
    ("procedure", "workflow"): ("BUILDS_ON", "ab", 0.3),
    ("configuration", "configuration"): ("RELATED_TO", None, 0.1),
}
def parse_frontmatter(filepath: Path) -> dict | None:
"""Parse YAML frontmatter from a markdown file."""
try:
text = filepath.read_text(encoding="utf-8")
except Exception:
return None
if not text.startswith("---"):
return None
end = text.find("---", 3)
if end == -1:
return None
fm = {}
body = text[end + 3 :].strip()
fm["_body"] = body[:500] # first 500 chars of content for matching
fm["_filepath"] = str(filepath)
for line in text[3:end].strip().splitlines():
if ":" not in line:
continue
key, _, val = line.partition(":")
key = key.strip()
val = val.strip().strip('"').strip("'")
if key == "tags":
# Handle both [a, b] and "a, b" formats
val = val.strip("[]")
fm["tags"] = [
t.strip().strip('"').strip("'") for t in val.split(",") if t.strip()
]
elif key == "importance":
try:
fm["importance"] = float(val)
except ValueError:
pass
else:
fm[key] = val
return fm
def load_memories() -> dict[str, dict]:
    """Load every memory file from the typed graph subdirectories, keyed by id."""
    type_dirs = (
        "solutions",
        "fixes",
        "decisions",
        "configurations",
        "problems",
        "workflows",
        "code-patterns",
        "errors",
        "general",
        "procedures",
        "insights",
    )
    memories: dict[str, dict] = {}
    for subdir in type_dirs:
        folder = GRAPH_DIR / subdir
        if not folder.exists():
            continue
        for md_file in folder.glob("*.md"):
            parsed = parse_frontmatter(md_file)
            # Files without an id cannot be referenced by edges; skip them.
            if parsed is not None and "id" in parsed:
                memories[parsed["id"]] = parsed
    return memories
def load_existing_edges() -> set[tuple[str, str]]:
    """Return already-connected id pairs, stored in both orders for symmetric lookup."""
    pairs: set[tuple[str, str]] = set()
    if not EDGES_DIR.exists():
        return pairs
    for edge_file in EDGES_DIR.glob("*.md"):
        meta = parse_frontmatter(edge_file)
        if meta and "from_id" in meta and "to_id" in meta:
            src, dst = meta["from_id"], meta["to_id"]
            # Record both orientations so a reversed duplicate is also caught.
            pairs.update({(src, dst), (dst, src)})
    return pairs
def load_decay_state() -> dict[str, float]:
    """Map memory id -> decay score from the state file; {} when absent or unreadable."""
    if not STATE_FILE.exists():
        return {}
    try:
        raw = json.loads(STATE_FILE.read_text())
        # Entries missing a decay_score default to 0 (fully decayed).
        return {mem_id: entry.get("decay_score", 0) for mem_id, entry in raw.items()}
    except Exception:
        return {}
def tag_overlap_score(tags_a: list[str], tags_b: list[str]) -> float:
    """Jaccard similarity of the two tag sets; 0.0 when either list is empty."""
    if not tags_a or not tags_b:
        return 0.0
    left = set(tags_a)
    right = set(tags_b)
    combined = left | right
    if not combined:
        return 0.0
    return len(left & right) / len(combined)
# Words too generic (or too ubiquitous in this corpus — "memory", "fixed",
# "solution") to signal a real content match. Hoisted to module level as a
# frozenset so the set is built once, not on every scored pair.
_STOPWORDS = frozenset({
    "that", "this", "with", "from", "have", "been", "were", "they", "their",
    "will", "would", "could", "should", "which", "where", "when", "what",
    "about", "into", "also", "more", "some", "then", "than", "each", "only",
    "used", "using", "after", "before", "because", "between", "through",
    "during", "added", "updated", "fixed", "error", "issue", "problem",
    "solution", "memory", "memories", "configuration", "successfully",
    "working", "works",
})

# Keywords are lowercase runs of letters/underscores at least 4 chars long.
_KEYWORD_RE = re.compile(r"[a-zA-Z_]{4,}")


def content_keyword_overlap(body_a: str, body_b: str) -> float:
    """Jaccard similarity of keyword sets extracted from two content bodies.

    A keyword is a lowercased alphabetic/underscore run of >= 4 characters
    that is not in the stopword list. Returns 0.0 when either body is empty
    or yields no keywords after filtering.
    """
    if not body_a or not body_b:
        return 0.0

    def extract_keywords(text: str) -> set[str]:
        return {
            w for w in _KEYWORD_RE.findall(text.lower()) if w not in _STOPWORDS
        }

    kw_a = extract_keywords(body_a)
    kw_b = extract_keywords(body_b)
    if not kw_a or not kw_b:
        return 0.0
    union = kw_a | kw_b
    return len(kw_a & kw_b) / len(union) if union else 0.0
def get_type_heuristic(
    type_a: str, type_b: str
) -> tuple[str, str | None, float] | None:
    """Look up the type-pair heuristic, trying both orderings of the key.

    When only the reversed key matches, the direction is flipped so the
    result is always expressed relative to (type_a, type_b). Returns None
    when no rule exists for the pair in either order.
    """
    direct = TYPE_HEURISTICS.get((type_a, type_b))
    if direct is not None:
        return direct
    reversed_hit = TYPE_HEURISTICS.get((type_b, type_a))
    if reversed_hit is None:
        return None
    rel, direction, score = reversed_hit
    # Flip direction; None (undirected) stays None.
    flipped = {"ab": "ba", "ba": "ab"}.get(direction, direction)
    return rel, flipped, score
def score_pair(mem_a: dict, mem_b: dict) -> dict | None:
    """Score a candidate edge between two memories.

    Combines three signals into a composite score — tag Jaccard (40%),
    content keyword overlap (30%), and the type-pair heuristic (30%) —
    with a 1.2x boost when average importance >= 0.7. Returns None when
    the memories share no tags or the final score is below 0.15;
    otherwise returns a proposal dict with from/to ids, titles, types,
    shared tags, and the component scores.
    """
    tags_a = mem_a.get("tags", [])
    tags_b = mem_b.get("tags", [])
    # Must share at least one tag
    shared_tags = set(tags_a) & set(tags_b)
    if not shared_tags:
        return None
    tag_score = tag_overlap_score(tags_a, tags_b)
    content_score = content_keyword_overlap(
        mem_a.get("_body", ""), mem_b.get("_body", "")
    )
    type_a = mem_a.get("type", "general")
    type_b = mem_b.get("type", "general")
    heuristic = get_type_heuristic(type_a, type_b)
    if heuristic:
        suggested_rel, direction, type_score = heuristic
    else:
        # No type rule: fall back to an undirected, weakly-scored RELATED_TO.
        suggested_rel = "RELATED_TO"
        direction = None
        type_score = 0.05
    # Composite score
    total = (tag_score * 0.4) + (content_score * 0.3) + (type_score * 0.3)
    # Boost for high importance memories
    imp_a = mem_a.get("importance", 0.5)
    imp_b = mem_b.get("importance", 0.5)
    # Defensive coercion: importance normally arrives as float from
    # parse_frontmatter, but tolerate string values here too.
    if isinstance(imp_a, str):
        imp_a = float(imp_a)
    if isinstance(imp_b, str):
        imp_b = float(imp_b)
    avg_importance = (imp_a + imp_b) / 2
    if avg_importance >= 0.7:
        total *= 1.2
    # Minimum quality threshold — discard weak candidates.
    if total < 0.15:
        return None
    # Determine from/to based on direction
    if direction == "ab":
        from_mem, to_mem = mem_a, mem_b
    elif direction == "ba":
        from_mem, to_mem = mem_b, mem_a
    else:
        # Default: higher importance is "from"
        if imp_a >= imp_b:
            from_mem, to_mem = mem_a, mem_b
        else:
            from_mem, to_mem = mem_b, mem_a
    return {
        "score": round(total, 3),
        "rel_type": suggested_rel,
        "from_id": from_mem["id"],
        "from_title": from_mem.get("title", "?"),
        "from_type": from_mem.get("type", "?"),
        "to_id": to_mem["id"],
        "to_title": to_mem.get("title", "?"),
        "to_type": to_mem.get("type", "?"),
        "shared_tags": sorted(shared_tags),
        "tag_score": round(tag_score, 3),
        "content_score": round(content_score, 3),
        "type_score": round(type_score, 3),
    }
def main():
    """Load memories, score candidate pairs, and emit ranked edge proposals.

    Prints the top 80 candidates to stdout and writes the same list as JSON
    to a "tmp" directory sibling to the memory data directory.
    """
    print("Loading memories...")
    memories = load_memories()
    print(f" Found {len(memories)} memories")
    print("Loading decay state...")
    decay_scores = load_decay_state()
    # Filter to active + fading only (decay >= 0.2)
    active_ids = {
        mid for mid, score in decay_scores.items() if score >= 0.2 and mid in memories
    }
    # Also include memories without decay state (new)
    for mid in memories:
        if mid not in decay_scores:
            active_ids.add(mid)
    active_memories = {mid: memories[mid] for mid in active_ids}
    print(f" {len(active_memories)} active/fading memories to analyze")
    print("Loading existing edges...")
    existing = load_existing_edges()
    # Each edge is stored in both orientations, so halve for the true count.
    print(f" {len(existing) // 2} existing edges")
    print("Scoring candidate pairs...")
    candidates = []
    # Group by shared tags first to reduce pair space
    tag_groups = defaultdict(set)
    for mid, mem in active_memories.items():
        for tag in mem.get("tags", []):
            tag_groups[tag].add(mid)
    # Collect unique pairs that share at least one tag
    seen_pairs = set()
    for tag, mids in tag_groups.items():
        if len(mids) < 2 or len(mids) > 50:  # skip too-common tags
            continue
        for a, b in combinations(mids, 2):
            # Canonical ordering so (a, b) and (b, a) dedupe to one pair.
            pair = tuple(sorted([a, b]))
            if pair in seen_pairs:
                continue
            if (a, b) in existing or (b, a) in existing:
                continue
            seen_pairs.add(pair)
            result = score_pair(active_memories[a], active_memories[b])
            if result:
                candidates.append(result)
    # Sort by score descending
    candidates.sort(key=lambda x: x["score"], reverse=True)
    print(f"\n{'='*100}")
    print(f"TOP EDGE CANDIDATES ({len(candidates)} total, showing top 80)")
    print(f"{'='*100}\n")
    for i, c in enumerate(candidates[:80], 1):
        print(f"#{i:3d} | Score: {c['score']:.3f} | {c['rel_type']}")
        print(f" FROM [{c['from_type']}] {c['from_title']}")
        print(f" TO [{c['to_type']}] {c['to_title']}")
        print(
            f" Tags: {', '.join(c['shared_tags'])} | "
            f"tag={c['tag_score']:.2f} content={c['content_score']:.2f} type={c['type_score']:.2f}"
        )
        print(f" IDs: {c['from_id']} -> {c['to_id']}")
        print()
    # Also output as JSON for programmatic use
    json_path = MEMORY_DIR.parent / "tmp" / "edge-candidates.json"
    json_path.parent.mkdir(parents=True, exist_ok=True)
    json_path.write_text(json.dumps(candidates[:80], indent=2))
    print(f"Full candidates saved to: {json_path}")


if __name__ == "__main__":
    main()