"""Tests for the ChromaRuleRepository outbound adapter.

The adapter is exercised against ChromaDB's ephemeral (in-memory) client, so
nothing is written to disk and no cleanup is needed between runs.

Every test carries the ``slow`` marker because constructing a
SentenceTransformer downloads a ~100 MB model on a cold cache.  The whole
module is skipped when the sentence-transformers package is absent, so the
rest of the test suite still passes in a minimal CI environment.
"""

from __future__ import annotations

import pytest

# ---------------------------------------------------------------------------
# Optional-import guard: skip the whole module if sentence-transformers is
# not installed (avoids a hard ImportError in minimal environments).
# ---------------------------------------------------------------------------
sentence_transformers = pytest.importorskip(
    "sentence_transformers",
    reason="sentence-transformers not installed; skipping ChromaDB adapter tests",
)

from unittest.mock import MagicMock, patch  # noqa: E402

import chromadb  # noqa: E402  (after importorskip guard)

from adapters.outbound.chroma_rules import ChromaRuleRepository  # noqa: E402
from domain.models import RuleDocument, RuleSearchResult  # noqa: E402
from domain.ports import RuleRepository  # noqa: E402

# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

EMBEDDING_MODEL = "all-MiniLM-L6-v2"


def _make_doc(
    rule_id: str = "1.0",
    title: str = "Test Rule",
    section: str = "Section 1",
    content: str = "This is the content of the rule.",
    source_file: str = "rules/test.md",
    parent_rule: str | None = None,
    page_ref: str | None = None,
) -> RuleDocument:
    """Build a RuleDocument, letting callers override only the fields they care about."""
    # Gather the field values first so the constructor call stays a one-liner;
    # every argument is forwarded by keyword, unchanged.
    field_values = {
        "rule_id": rule_id,
        "title": title,
        "section": section,
        "content": content,
        "source_file": source_file,
        "parent_rule": parent_rule,
        "page_ref": page_ref,
    }
    return RuleDocument(**field_values)
# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------


def _drop_all_collections(client) -> None:
    """Delete every collection reachable through *client*.

    Chroma's in-memory clients created within one process can share a single
    backing store, so a "new" ephemeral client may still see collections
    written by an earlier test.  Wiping them gives each test a clean slate.
    """
    for entry in client.list_collections():
        # Depending on the chromadb release, list_collections() yields either
        # Collection objects or bare collection names; accept both.
        client.delete_collection(name=getattr(entry, "name", entry))


@pytest.fixture(scope="module")
def embedding_model_mock():
    """
    Return a lightweight stand-in for SentenceTransformer so the tests never
    download the real model.

    The mock's ``encode`` method returns a fixed 32-dimensional float vector
    for a single string and a list of such vectors for a batch.  Because every
    text maps to the same vector, the tests only assert that similarity lies
    within bounds and never rely on ranking.
    """
    mock = MagicMock()
    # Single-doc encode returns a 1-D list; batch encode returns a 2-D list.
    fixed_vector = [0.1] * 32

    def encode(texts, **kwargs):
        if isinstance(texts, str):
            return fixed_vector
        # Batch: return one vector per document
        return [fixed_vector for _ in texts]

    mock.encode.side_effect = encode
    return mock


@pytest.fixture()
def repo(embedding_model_mock, tmp_path):
    """
    ChromaRuleRepository backed by an ephemeral (in-memory) ChromaDB client.

    We patch, as seen from the adapter module:

    - ``chromadb.PersistentClient`` so it returns an ``EphemeralClient`` and
      nothing is written to ``tmp_path``.
    - ``SentenceTransformer`` so it returns ``embedding_model_mock`` and no
      model download occurs.

    ``tmp_path`` is still passed to satisfy the constructor signature even
    though the patched client factory ignores it.

    All collections are dropped before the adapter is built and again on
    teardown, so documents added by one test can never be observed by
    another.  The patches stay active for the whole test so that any client
    or model the adapter creates lazily is also replaced.
    """
    ephemeral_client = chromadb.EphemeralClient()
    # Start from an empty store: the exact-count assertions below depend on it.
    _drop_all_collections(ephemeral_client)

    with (
        patch(
            "adapters.outbound.chroma_rules.chromadb.PersistentClient",
            return_value=ephemeral_client,
        ),
        patch(
            "adapters.outbound.chroma_rules.SentenceTransformer",
            return_value=embedding_model_mock,
        ),
    ):
        instance = ChromaRuleRepository(
            persist_dir=tmp_path / "chroma",
            embedding_model=EMBEDDING_MODEL,
        )
        try:
            yield instance
        finally:
            # Leave nothing behind for later tests or other modules.
            _drop_all_collections(ephemeral_client)


# ---------------------------------------------------------------------------
# Tests
# ---------------------------------------------------------------------------


@pytest.mark.slow
class TestChromaRuleRepositoryContract:
    """Verify that ChromaRuleRepository satisfies the RuleRepository port."""

    def test_is_rule_repository_subclass(self):
        """ChromaRuleRepository must be a concrete implementation of the port ABC."""
        assert issubclass(ChromaRuleRepository, RuleRepository)


@pytest.mark.slow
class TestAddDocuments:
    """Tests for add_documents()."""

    def test_add_single_document_increments_count(self, repo):
        """
        Adding a single RuleDocument should make count() return 1.

        Verifies that the adapter correctly maps the domain model to
        ChromaDB's add() API.
        """
        doc = _make_doc(rule_id="1.1", content="Single rule content.")
        repo.add_documents([doc])
        assert repo.count() == 1

    def test_add_batch_all_stored(self, repo):
        """
        Adding a batch of N documents should result in count() == N.

        Validates that batch encoding and bulk add() work end-to-end.
        """
        docs = [
            _make_doc(rule_id=f"2.{i}", content=f"Batch rule number {i}.")
            for i in range(5)
        ]
        repo.add_documents(docs)
        assert repo.count() == 5

    def test_add_empty_list_is_noop(self, repo):
        """
        Calling add_documents([]) must not raise and must leave count unchanged.
        """
        repo.add_documents([])
        assert repo.count() == 0

    def test_add_document_with_optional_fields(self, repo):
        """
        RuleDocument with parent_rule and page_ref set should be stored without
        error; optional fields must be serialised via to_metadata().
        """
        doc = _make_doc(
            rule_id="3.1",
            parent_rule="3.0",
            page_ref="p.42",
        )
        repo.add_documents([doc])
        assert repo.count() == 1


@pytest.mark.slow
class TestSearch:
    """Tests for search()."""

    def test_search_returns_results(self, repo):
        """
        After adding at least one document, search() must return a non-empty
        list of RuleSearchResult objects.
        """
        doc = _make_doc(rule_id="10.1", content="A searchable rule about batting.")
        repo.add_documents([doc])
        results = repo.search("batting rules", top_k=5)
        assert len(results) >= 1
        assert all(isinstance(r, RuleSearchResult) for r in results)

    def test_search_result_fields_populated(self, repo):
        """
        Each RuleSearchResult returned must have non-empty rule_id, title,
        content, and section.  This confirms metadata round-trips correctly
        through ChromaDB.
        """
        doc = _make_doc(
            rule_id="11.1",
            title="Fielding Rule",
            section="Defense",
            content="Rules for fielding plays.",
        )
        repo.add_documents([doc])
        results = repo.search("fielding", top_k=1)
        assert len(results) >= 1
        r = results[0]
        assert r.rule_id == "11.1"
        assert r.title == "Fielding Rule"
        assert r.section == "Defense"
        assert r.content == "Rules for fielding plays."

    def test_search_with_section_filter(self, repo):
        """
        search() with section_filter must only return documents whose section
        field matches the filter value.  Documents from other sections must not
        appear in the results even when they would otherwise score highly.
        """
        docs = [
            _make_doc(rule_id="20.1", section="Pitching", content="Pitching rules."),
            _make_doc(rule_id="20.2", section="Batting", content="Batting rules."),
        ]
        repo.add_documents(docs)
        results = repo.search("rules", top_k=10, section_filter="Pitching")
        assert len(results) >= 1
        assert all(r.section == "Pitching" for r in results)

    def test_search_top_k_respected(self, repo):
        """
        The number of results must not exceed top_k even when more documents
        exist in the collection.
        """
        docs = [
            _make_doc(rule_id=f"30.{i}", content=f"Rule number {i}.")
            for i in range(10)
        ]
        repo.add_documents(docs)
        results = repo.search("rule", top_k=3)
        assert len(results) <= 3

    def test_search_empty_collection_returns_empty_list(self, repo):
        """
        Searching an empty collection must return an empty list without
        raising, whatever ChromaDB itself does when asked for more results
        than the collection holds.
        """
        results = repo.search("anything", top_k=5)
        assert results == []


@pytest.mark.slow
class TestSimilarityClamping:
    """Tests for the similarity score clamping behaviour."""

    def test_similarity_within_bounds(self, repo):
        """
        Every RuleSearchResult returned by search() must have a similarity
        value in [0.0, 1.0].

        ChromaDB cosine distance can technically exceed 1 for near-opposite
        vectors; the adapter must clamp the value before constructing
        RuleSearchResult (which validates the range in __post_init__).
        """
        docs = [_make_doc(rule_id="40.1", content="Content for similarity check.")]
        repo.add_documents(docs)
        results = repo.search("similarity check", top_k=5)
        # Without this the loop below would pass vacuously on an empty result.
        assert results, "expected at least one result to check similarity on"
        for r in results:
            assert (
                0.0 <= r.similarity <= 1.0
            ), f"similarity {r.similarity} is outside [0.0, 1.0]"

    def test_similarity_clamped_when_distance_exceeds_one(
        self, repo, embedding_model_mock
    ):
        """
        When ChromaDB returns a cosine distance > 1 (e.g. 1.5), the formula
        ``max(0.0, min(1.0, 1 - distance))`` must produce 0.0 rather than a
        negative value, preventing the RuleSearchResult validator from raising.

        We simulate this by patching the collection's query() to return a
        synthetic distance of 1.5.
        """
        doc = _make_doc(rule_id="50.1", content="Edge case content.")
        repo.add_documents([doc])

        raw_results = {
            # Real query results always carry ids; include them so the
            # synthetic payload has the same shape as a genuine one.
            "ids": [["50.1"]],
            "documents": [["Edge case content."]],
            "metadatas": [
                [
                    {
                        "rule_id": "50.1",
                        "title": "Test Rule",
                        "section": "Section 1",
                        "parent_rule": "",
                        "page_ref": "",
                        "source_file": "rules/test.md",
                    }
                ]
            ],
            "distances": [[1.5]],  # distance > 1 → naive similarity would be negative
        }

        collection = repo._get_collection()
        with patch.object(collection, "query", return_value=raw_results):
            results = repo.search("edge case", top_k=1)

        assert len(results) == 1
        assert results[0].similarity == 0.0


@pytest.mark.slow
class TestCount:
    """Tests for count()."""

    def test_count_empty(self, repo):
        """count() on a fresh collection must return 0."""
        assert repo.count() == 0

    def test_count_after_add(self, repo):
        """count() must reflect the exact number of documents added."""
        docs = [_make_doc(rule_id=f"60.{i}") for i in range(3)]
        repo.add_documents(docs)
        assert repo.count() == 3


@pytest.mark.slow
class TestClearAll:
    """Tests for clear_all()."""

    def test_clear_all_resets_count_to_zero(self, repo):
        """
        After adding documents and calling clear_all(), count() must return 0.

        Also verifies that the collection is recreated (not left deleted) so
        subsequent operations succeed without error.
        """
        docs = [_make_doc(rule_id=f"70.{i}") for i in range(4)]
        repo.add_documents(docs)
        assert repo.count() == 4
        repo.clear_all()
        assert repo.count() == 0

    def test_operations_work_after_clear(self, repo):
        """
        The adapter must be usable after clear_all() — the internal collection
        must be recreated so add_documents() and search() do not raise.
        """
        repo.add_documents([_make_doc(rule_id="80.1")])
        repo.clear_all()
        new_doc = _make_doc(rule_id="80.2", content="Post-clear document.")
        repo.add_documents([new_doc])
        assert repo.count() == 1


@pytest.mark.slow
class TestGetStats:
    """Tests for get_stats()."""

    def test_get_stats_returns_dict(self, repo):
        """get_stats() must return a dict (structural sanity check)."""
        stats = repo.get_stats()
        assert isinstance(stats, dict)

    def test_get_stats_contains_required_keys(self, repo):
        """
        get_stats() must include at minimum:

        - ``total_rules``: int — total document count
        - ``sections``: dict — per-section counts
        - ``persist_directory``: str — path used by the client
        """
        docs = [
            _make_doc(rule_id="90.1", section="Alpha"),
            _make_doc(rule_id="90.2", section="Alpha"),
            _make_doc(rule_id="90.3", section="Beta"),
        ]
        repo.add_documents(docs)
        stats = repo.get_stats()
        assert "total_rules" in stats
        assert "sections" in stats
        assert "persist_directory" in stats
        assert stats["total_rules"] == 3
        assert stats["sections"]["Alpha"] == 2
        assert stats["sections"]["Beta"] == 1