- Add vector store with sentence-transformers for semantic search
- FastAPI backend with /chat and /health endpoints
- Conversation state persistence via SQLite
- OpenRouter integration with structured JSON responses
- Discord bot with /ask slash command and reply-based follow-ups
- Automated Gitea issue creation for unanswered questions
- Docker support with docker-compose for easy deployment
- Example rule file and ingestion script
- Comprehensive documentation in README
145 lines · 4.1 KiB · Python
#!/usr/bin/env python3
|
|
"""
|
|
Ingest rule documents from markdown files into ChromaDB.
|
|
|
|
The script reads all markdown files from the rules directory and adds them
|
|
to the vector store. Each file should have YAML frontmatter with metadata
|
|
fields matching RuleMetadata.
|
|
|
|
Example frontmatter:
|
|
---
|
|
rule_id: "5.2.1(b)"
|
|
title: "Stolen Base Attempts"
|
|
section: "Baserunning"
|
|
parent_rule: "5.2"
|
|
page_ref: "32"
|
|
---
|
|
|
|
Rule content here...
|
|
"""
|
|
|
|
import sys
|
|
import re
|
|
from pathlib import Path
|
|
from typing import Optional
|
|
import yaml
|
|
|
|
from app.config import settings
|
|
from app.vector_store import VectorStore
|
|
from app.models import RuleDocument, RuleMetadata
|
|
|
|
|
|
def parse_frontmatter(content: str) -> tuple[dict, str]:
    """Split markdown *content* into (frontmatter dict, body text).

    The document must begin with a ``---`` ... ``---`` YAML fence;
    everything after the closing fence is returned as the body with
    surrounding whitespace stripped.

    Raises:
        ValueError: if no frontmatter fence is found at the start.
    """
    fence_pattern = r"^---\s*\n(.*?)\n---\s*\n(.*)$"
    parsed = re.match(fence_pattern, content, re.DOTALL)

    # Guard clause: bail out early when the fence is absent.
    if parsed is None:
        raise ValueError("No valid YAML frontmatter found")

    raw_yaml = parsed.group(1)
    body = parsed.group(2).strip()
    meta = yaml.safe_load(raw_yaml)
    # safe_load returns None for an empty fence; normalize to a dict.
    return (meta if meta else {}), body
|
|
|
|
|
|
def load_markdown_file(filepath: Path) -> Optional[RuleDocument]:
    """Load a single markdown file and convert it to a RuleDocument.

    Returns None (after logging to stderr) when the file cannot be read,
    its frontmatter is missing or invalid, or the metadata fails
    validation — ingestion is best-effort per file.
    """
    try:
        content = filepath.read_text(encoding="utf-8")
        metadata_dict, body = parse_frontmatter(content)

        # Validate frontmatter fields against the RuleMetadata model.
        metadata = RuleMetadata(**metadata_dict)

        # Use a cwd-relative path as the source reference when possible.
        # relative_to() raises ValueError for files outside the cwd
        # (e.g. an absolute --rules-dir); fall back to the path as given
        # instead of letting that error silently skip the document.
        try:
            source_file = str(filepath.relative_to(Path.cwd()))
        except ValueError:
            source_file = str(filepath)

        return RuleDocument(metadata=metadata, content=body, source_file=source_file)
    except Exception as e:
        # Report and skip this file rather than abort the whole run.
        print(f"Error loading {filepath}: {e}", file=sys.stderr)
        return None
|
|
|
|
|
|
def ingest_rules(
    rules_dir: Path, vector_store: VectorStore, clear_existing: bool = False
) -> None:
    """Ingest all markdown rule files under *rules_dir* into *vector_store*.

    Args:
        rules_dir: Directory searched recursively for ``*.md`` files.
        vector_store: Destination store for the parsed rule documents.
        clear_existing: When True, wipe the store before ingesting.

    Exits the process with status 1 when the directory is missing, no
    markdown files exist, or no file parses into a valid document.
    """
    if not rules_dir.exists():
        print(f"Rules directory does not exist: {rules_dir}")
        sys.exit(1)

    if clear_existing:
        print("Clearing existing vector store...")
        vector_store.clear_all()

    # Find all markdown files (recursively).
    md_files = list(rules_dir.rglob("*.md"))
    if not md_files:
        print(f"No markdown files found in {rules_dir}")
        sys.exit(1)

    print(f"Found {len(md_files)} markdown files to ingest")

    # Load and validate documents; load_markdown_file returns None for
    # files it cannot parse, and those are skipped.
    documents = []
    for filepath in md_files:
        doc = load_markdown_file(filepath)
        if doc:
            documents.append(doc)
            print(f"  Loaded: {doc.metadata.rule_id} - {doc.metadata.title}")

    print(f"Successfully loaded {len(documents)} documents")

    if not documents:
        # Every file failed to parse: bail out instead of running the
        # (possibly slow) embedding step on an empty list and reporting
        # a successful ingestion.
        print("No valid rule documents to ingest", file=sys.stderr)
        sys.exit(1)

    # Add to vector store (embedding happens here).
    print("Adding to vector store (this may take a moment)...")
    vector_store.add_documents(documents)

    print("\nIngestion complete!")
    print(f"Total rules in store: {vector_store.count()}")
    stats = vector_store.get_stats()
    print("Sections:", ", ".join(f"{k}: {v}" for k, v in stats["sections"].items()))
|
|
|
|
|
|
def main():
    """Command-line entry point: parse options, then run the ingestion."""
    import argparse

    arg_parser = argparse.ArgumentParser(
        description="Ingest rule documents into ChromaDB"
    )
    arg_parser.add_argument(
        "--rules-dir",
        type=Path,
        default=settings.rules_dir,
        help="Directory containing markdown rule files",
    )
    arg_parser.add_argument(
        "--data-dir",
        type=Path,
        default=settings.data_dir,
        help="Data directory (chroma will be stored in data/chroma)",
    )
    arg_parser.add_argument(
        "--clear",
        action="store_true",
        help="Clear existing vector store before ingesting",
    )
    arg_parser.add_argument(
        "--embedding-model",
        type=str,
        default=settings.embedding_model,
        help="Sentence transformer model name",
    )
    opts = arg_parser.parse_args()

    # Chroma data always lives in a "chroma" subdirectory of the data dir.
    chroma_path = opts.data_dir / "chroma"
    print(f"Initializing vector store at: {chroma_path}")
    print(f"Using embedding model: {opts.embedding_model}")

    store = VectorStore(chroma_path, opts.embedding_model)
    ingest_rules(opts.rules_dir, store, clear_existing=opts.clear)
|
|
|
|
|
|
# Run the ingestion CLI only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|