strat-chatbot/scripts/ingest_rules.py
Cal Corum c42fea66ba feat: initial chatbot implementation with FastAPI, ChromaDB, Discord bot, and Gitea integration
- Add vector store with sentence-transformers for semantic search
- FastAPI backend with /chat and /health endpoints
- Conversation state persistence via SQLite
- OpenRouter integration with structured JSON responses
- Discord bot with /ask slash command and reply-based follow-ups
- Automated Gitea issue creation for unanswered questions
- Docker support with docker-compose for easy deployment
- Example rule file and ingestion script
- Comprehensive documentation in README
2026-03-08 15:19:26 -05:00

145 lines
4.1 KiB
Python

#!/usr/bin/env python3
"""
Ingest rule documents from markdown files into ChromaDB.
The script reads all markdown files from the rules directory and adds them
to the vector store. Each file should have YAML frontmatter with metadata
fields matching RuleMetadata.
Example frontmatter:
---
rule_id: "5.2.1(b)"
title: "Stolen Base Attempts"
section: "Baserunning"
parent_rule: "5.2"
page_ref: "32"
---
Rule content here...
"""
import sys
import re
from pathlib import Path
from typing import Optional
import yaml
from app.config import settings
from app.vector_store import VectorStore
from app.models import RuleDocument, RuleMetadata
def parse_frontmatter(content: str) -> tuple[dict, str]:
    """Split markdown *content* into its YAML frontmatter and body text.

    Returns:
        A ``(metadata, body)`` tuple where *metadata* is the parsed YAML
        mapping (empty dict for blank frontmatter) and *body* is the
        stripped markdown content after the closing ``---``.

    Raises:
        ValueError: if the content does not start with a ``---``-delimited
            frontmatter block.
    """
    frontmatter_re = r"^---\s*\n(.*?)\n---\s*\n(.*)$"
    parsed = re.match(frontmatter_re, content, re.DOTALL)
    if parsed is None:
        raise ValueError("No valid YAML frontmatter found")
    raw_meta, raw_body = parsed.group(1), parsed.group(2)
    # safe_load returns None for an empty document; normalize to {}.
    return yaml.safe_load(raw_meta) or {}, raw_body.strip()
def load_markdown_file(filepath: Path) -> Optional[RuleDocument]:
    """Load a single markdown file and convert it to a RuleDocument.

    Args:
        filepath: Path to a markdown file with YAML frontmatter.

    Returns:
        The parsed RuleDocument, or None (with a message on stderr) if the
        file cannot be read, parsed, or validated — the caller skips bad
        files rather than aborting the whole ingest run.
    """
    try:
        content = filepath.read_text(encoding="utf-8")
        metadata_dict, body = parse_frontmatter(content)
        # Validate the frontmatter against the RuleMetadata schema.
        metadata = RuleMetadata(**metadata_dict)
        # Prefer a cwd-relative path for readability, but fall back to the
        # path as given: relative_to() raises ValueError when --rules-dir
        # points outside the current working directory, which previously
        # made every file appear to fail loading.
        try:
            source_file = str(filepath.relative_to(Path.cwd()))
        except ValueError:
            source_file = str(filepath)
        return RuleDocument(metadata=metadata, content=body, source_file=source_file)
    except Exception as e:
        print(f"Error loading {filepath}: {e}", file=sys.stderr)
        return None
def ingest_rules(
    rules_dir: Path, vector_store: VectorStore, clear_existing: bool = False
) -> None:
    """Ingest all markdown rule files under *rules_dir* into *vector_store*.

    Args:
        rules_dir: Directory searched recursively for ``*.md`` files.
        vector_store: Destination store; documents are embedded and added.
        clear_existing: When True, wipe the store before ingesting.

    Exits the process with status 1 when the directory is missing, contains
    no markdown files, or no file could be parsed successfully.
    """
    if not rules_dir.exists():
        print(f"Rules directory does not exist: {rules_dir}")
        sys.exit(1)

    if clear_existing:
        print("Clearing existing vector store...")
        vector_store.clear_all()

    # Find all markdown files, recursively.
    md_files = list(rules_dir.rglob("*.md"))
    if not md_files:
        print(f"No markdown files found in {rules_dir}")
        sys.exit(1)
    print(f"Found {len(md_files)} markdown files to ingest")

    # Load and validate documents; bad files are reported and skipped.
    documents = []
    for filepath in md_files:
        doc = load_markdown_file(filepath)
        if doc:
            documents.append(doc)
            print(f" Loaded: {doc.metadata.rule_id} - {doc.metadata.title}")
    print(f"Successfully loaded {len(documents)} documents")

    # Abort rather than embedding an empty list when every file failed.
    if not documents:
        print("No documents could be loaded; nothing to ingest.", file=sys.stderr)
        sys.exit(1)

    # Embedding happens inside add_documents, so this step can be slow.
    print("Adding to vector store (this may take a moment)...")
    vector_store.add_documents(documents)

    print("\nIngestion complete!")
    print(f"Total rules in store: {vector_store.count()}")
    stats = vector_store.get_stats()
    print("Sections:", ", ".join(f"{k}: {v}" for k, v in stats["sections"].items()))
def main():
    """CLI entry point: parse arguments, build the store, and ingest."""
    import argparse

    arg_parser = argparse.ArgumentParser(
        description="Ingest rule documents into ChromaDB"
    )
    arg_parser.add_argument(
        "--rules-dir",
        type=Path,
        default=settings.rules_dir,
        help="Directory containing markdown rule files",
    )
    arg_parser.add_argument(
        "--data-dir",
        type=Path,
        default=settings.data_dir,
        help="Data directory (chroma will be stored in data/chroma)",
    )
    arg_parser.add_argument(
        "--clear",
        action="store_true",
        help="Clear existing vector store before ingesting",
    )
    arg_parser.add_argument(
        "--embedding-model",
        type=str,
        default=settings.embedding_model,
        help="Sentence transformer model name",
    )
    opts = arg_parser.parse_args()

    # ChromaDB data lives in a "chroma" subdirectory of the data dir.
    chroma_path = opts.data_dir / "chroma"
    print(f"Initializing vector store at: {chroma_path}")
    print(f"Using embedding model: {opts.embedding_model}")

    store = VectorStore(chroma_path, opts.embedding_model)
    ingest_rules(opts.rules_dir, store, clear_existing=opts.clear)


if __name__ == "__main__":
    main()