#!/usr/bin/env python3
"""Ingest rule documents from markdown files into ChromaDB.

The script reads all markdown files from the rules directory and adds them
to the vector store. Each file should have YAML frontmatter with metadata
fields matching RuleMetadata.

Example frontmatter:

---
rule_id: "5.2.1(b)"
title: "Stolen Base Attempts"
section: "Baserunning"
parent_rule: "5.2"
page_ref: "32"
---

Rule content here...
"""

import argparse
import re
import sys
from pathlib import Path
from typing import Optional

import yaml

from app.config import settings
from app.models import RuleDocument, RuleMetadata
from app.vector_store import VectorStore

# Frontmatter is a `---`-delimited YAML block at the very top of the file.
# Compiled once at module level since it is applied to every ingested file.
_FRONTMATTER_RE = re.compile(r"^---\s*\n(.*?)\n---\s*\n(.*)$", re.DOTALL)


def parse_frontmatter(content: str) -> tuple[dict, str]:
    """Split markdown ``content`` into (frontmatter dict, body text).

    Returns:
        A tuple of the parsed YAML frontmatter (``{}`` if the block is
        empty) and the body with surrounding whitespace stripped.

    Raises:
        ValueError: if the content does not start with a ``---``-delimited
            YAML frontmatter block.
    """
    match = _FRONTMATTER_RE.match(content)
    if not match:
        raise ValueError("No valid YAML frontmatter found")
    # safe_load returns None for an empty frontmatter block; normalize to {}.
    metadata = yaml.safe_load(match.group(1)) or {}
    return metadata, match.group(2).strip()


def load_markdown_file(filepath: Path) -> Optional[RuleDocument]:
    """Load a single markdown file and convert it to a RuleDocument.

    Returns None (after printing a diagnostic to stderr) if the file
    cannot be read, lacks frontmatter, or fails metadata validation.
    """
    try:
        content = filepath.read_text(encoding="utf-8")
        metadata_dict, body = parse_frontmatter(content)

        # Validate frontmatter fields against the RuleMetadata schema.
        metadata = RuleMetadata(**metadata_dict)

        # Prefer a cwd-relative path as the source reference; fall back to
        # the path as given when the file lives outside cwd, because
        # Path.relative_to raises ValueError in that case (e.g. when the
        # user passes an absolute --rules-dir elsewhere on disk).
        try:
            source_file = str(filepath.relative_to(Path.cwd()))
        except ValueError:
            source_file = str(filepath)

        return RuleDocument(metadata=metadata, content=body, source_file=source_file)
    except Exception as e:
        print(f"Error loading {filepath}: {e}", file=sys.stderr)
        return None


def ingest_rules(
    rules_dir: Path, vector_store: VectorStore, clear_existing: bool = False
) -> None:
    """Ingest all markdown rule files under ``rules_dir`` into the store.

    Exits the process with status 1 if the directory is missing or
    contains no markdown files. Files that fail validation are reported
    and skipped; if nothing loads successfully, the store is left
    untouched.
    """
    if not rules_dir.exists():
        print(f"Rules directory does not exist: {rules_dir}")
        sys.exit(1)

    if clear_existing:
        print("Clearing existing vector store...")
        vector_store.clear_all()

    # Find all markdown files, recursing into subdirectories.
    md_files = list(rules_dir.rglob("*.md"))
    if not md_files:
        print(f"No markdown files found in {rules_dir}")
        sys.exit(1)

    print(f"Found {len(md_files)} markdown files to ingest")

    # Load and validate documents; per-file failures were already
    # reported by load_markdown_file.
    documents = []
    for filepath in md_files:
        doc = load_markdown_file(filepath)
        if doc:
            documents.append(doc)
            print(f" Loaded: {doc.metadata.rule_id} - {doc.metadata.title}")

    print(f"Successfully loaded {len(documents)} documents")

    # Guard against handing the vector store an empty batch when every
    # file failed validation.
    if not documents:
        print("No valid documents to ingest", file=sys.stderr)
        return

    # Add to vector store.
    print("Adding to vector store (this may take a moment)...")
    vector_store.add_documents(documents)

    print("\nIngestion complete!")
    print(f"Total rules in store: {vector_store.count()}")
    stats = vector_store.get_stats()
    print("Sections:", ", ".join(f"{k}: {v}" for k, v in stats["sections"].items()))


def main():
    """Parse CLI arguments, build the vector store, and run ingestion."""
    parser = argparse.ArgumentParser(description="Ingest rule documents into ChromaDB")
    parser.add_argument(
        "--rules-dir",
        type=Path,
        default=settings.rules_dir,
        help="Directory containing markdown rule files",
    )
    parser.add_argument(
        "--data-dir",
        type=Path,
        default=settings.data_dir,
        help="Data directory (chroma will be stored in data/chroma)",
    )
    parser.add_argument(
        "--clear",
        action="store_true",
        help="Clear existing vector store before ingesting",
    )
    parser.add_argument(
        "--embedding-model",
        type=str,
        default=settings.embedding_model,
        help="Sentence transformer model name",
    )
    args = parser.parse_args()

    chroma_dir = args.data_dir / "chroma"
    print(f"Initializing vector store at: {chroma_dir}")
    print(f"Using embedding model: {args.embedding_model}")
    vector_store = VectorStore(chroma_dir, args.embedding_model)

    ingest_rules(args.rules_dir, vector_store, clear_existing=args.clear)


if __name__ == "__main__":
    main()