strat-chatbot/scripts/ingest_rules.py
Cal Corum c42fea66ba feat: initial chatbot implementation with FastAPI, ChromaDB, Discord bot, and Gitea integration
- Add vector store with sentence-transformers for semantic search
- FastAPI backend with /chat and /health endpoints
- Conversation state persistence via SQLite
- OpenRouter integration with structured JSON responses
- Discord bot with /ask slash command and reply-based follow-ups
- Automated Gitea issue creation for unanswered questions
- Docker support with docker-compose for easy deployment
- Example rule file and ingestion script
- Comprehensive documentation in README
2026-03-08 15:19:26 -05:00

145 lines
4.1 KiB
Python

#!/usr/bin/env python3
"""
Ingest rule documents from markdown files into ChromaDB.
The script reads all markdown files from the rules directory and adds them
to the vector store. Each file should have YAML frontmatter with metadata
fields matching RuleMetadata.
Example frontmatter:
---
rule_id: "5.2.1(b)"
title: "Stolen Base Attempts"
section: "Baserunning"
parent_rule: "5.2"
page_ref: "32"
---
Rule content here...
"""
import sys
import re
from pathlib import Path
from typing import Optional
import yaml
from app.config import settings
from app.vector_store import VectorStore
from app.models import RuleDocument, RuleMetadata
def parse_frontmatter(content: str) -> tuple[dict, str]:
    """Split markdown *content* into its YAML frontmatter and body text.

    Returns:
        A ``(metadata, body)`` tuple where *metadata* is the parsed YAML
        mapping (empty dict for blank frontmatter) and *body* is the
        stripped markdown content after the closing ``---``.

    Raises:
        ValueError: if the content does not start with a ``---``-delimited
            frontmatter block.
    """
    frontmatter_re = r"^---\s*\n(.*?)\n---\s*\n(.*)$"
    parsed = re.match(frontmatter_re, content, re.DOTALL)
    if parsed is None:
        raise ValueError("No valid YAML frontmatter found")
    raw_meta, raw_body = parsed.group(1), parsed.group(2)
    # safe_load returns None for an empty document; normalize to {}.
    return yaml.safe_load(raw_meta) or {}, raw_body.strip()
def load_markdown_file(filepath: Path) -> Optional[RuleDocument]:
    """Load a single markdown file and convert it to a RuleDocument.

    Args:
        filepath: Path to a markdown file with YAML frontmatter.

    Returns:
        The parsed RuleDocument, or None (with a message on stderr) if the
        file cannot be read, parsed, or validated — the caller skips bad
        files rather than aborting the whole ingest run.
    """
    try:
        content = filepath.read_text(encoding="utf-8")
        metadata_dict, body = parse_frontmatter(content)
        # Validate the frontmatter against the RuleMetadata schema.
        metadata = RuleMetadata(**metadata_dict)
        # Prefer a cwd-relative path for readability, but fall back to the
        # path as given: relative_to() raises ValueError when --rules-dir
        # points outside the current working directory, which previously
        # made every file appear to fail loading.
        try:
            source_file = str(filepath.relative_to(Path.cwd()))
        except ValueError:
            source_file = str(filepath)
        return RuleDocument(metadata=metadata, content=body, source_file=source_file)
    except Exception as e:
        print(f"Error loading {filepath}: {e}", file=sys.stderr)
        return None
def ingest_rules(
    rules_dir: Path, vector_store: VectorStore, clear_existing: bool = False
) -> None:
    """Ingest all markdown rule files under *rules_dir* into *vector_store*.

    Args:
        rules_dir: Directory searched recursively for ``*.md`` files.
        vector_store: Destination store; documents are embedded and added.
        clear_existing: When True, wipe the store before ingesting.

    Exits the process with status 1 when the directory is missing, contains
    no markdown files, or no file could be parsed successfully.
    """
    if not rules_dir.exists():
        print(f"Rules directory does not exist: {rules_dir}")
        sys.exit(1)

    if clear_existing:
        print("Clearing existing vector store...")
        vector_store.clear_all()

    # Find all markdown files, recursively.
    md_files = list(rules_dir.rglob("*.md"))
    if not md_files:
        print(f"No markdown files found in {rules_dir}")
        sys.exit(1)
    print(f"Found {len(md_files)} markdown files to ingest")

    # Load and validate documents; bad files are reported and skipped.
    documents = []
    for filepath in md_files:
        doc = load_markdown_file(filepath)
        if doc:
            documents.append(doc)
            print(f" Loaded: {doc.metadata.rule_id} - {doc.metadata.title}")
    print(f"Successfully loaded {len(documents)} documents")

    # Abort rather than embedding an empty list when every file failed.
    if not documents:
        print("No documents could be loaded; nothing to ingest.", file=sys.stderr)
        sys.exit(1)

    # Embedding happens inside add_documents, so this step can be slow.
    print("Adding to vector store (this may take a moment)...")
    vector_store.add_documents(documents)

    print("\nIngestion complete!")
    print(f"Total rules in store: {vector_store.count()}")
    stats = vector_store.get_stats()
    print("Sections:", ", ".join(f"{k}: {v}" for k, v in stats["sections"].items()))
def main():
    """CLI entry point: parse arguments, build the store, and ingest."""
    import argparse

    arg_parser = argparse.ArgumentParser(
        description="Ingest rule documents into ChromaDB"
    )
    arg_parser.add_argument(
        "--rules-dir",
        type=Path,
        default=settings.rules_dir,
        help="Directory containing markdown rule files",
    )
    arg_parser.add_argument(
        "--data-dir",
        type=Path,
        default=settings.data_dir,
        help="Data directory (chroma will be stored in data/chroma)",
    )
    arg_parser.add_argument(
        "--clear",
        action="store_true",
        help="Clear existing vector store before ingesting",
    )
    arg_parser.add_argument(
        "--embedding-model",
        type=str,
        default=settings.embedding_model,
        help="Sentence transformer model name",
    )
    opts = arg_parser.parse_args()

    # ChromaDB data lives in a "chroma" subdirectory of the data dir.
    chroma_path = opts.data_dir / "chroma"
    print(f"Initializing vector store at: {chroma_path}")
    print(f"Using embedding model: {opts.embedding_model}")

    store = VectorStore(chroma_path, opts.embedding_model)
    ingest_rules(opts.rules_dir, store, clear_existing=opts.clear)


if __name__ == "__main__":
    main()