strat-chatbot/adapters/outbound/openrouter.py
Cal Corum c3218f70c4 refactor: hexagonal architecture with ports & adapters, DI, and test-first development
Domain layer (zero framework imports):
- domain/models.py: pure dataclasses (RuleDocument, RuleSearchResult,
  Conversation, ChatMessage, LLMResponse, ChatResult)
- domain/ports.py: ABC interfaces (RuleRepository, LLMPort,
  ConversationStore, IssueTracker)
- domain/services.py: ChatService orchestrates Q&A flow using only ports

Outbound adapters (implement domain ports):
- adapters/outbound/openrouter.py: OpenRouterLLM with persistent httpx
  client, robust JSON parsing, regex citation fallback
- adapters/outbound/sqlite_convos.py: SQLiteConversationStore with
  async_sessionmaker, timezone-aware datetimes, cleanup support
- adapters/outbound/gitea_issues.py: GiteaIssueTracker with markdown
  injection protection (fenced code blocks)
- adapters/outbound/chroma_rules.py: ChromaRuleRepository with clamped
  similarity scores

Inbound adapter:
- adapters/inbound/api.py: thin FastAPI router with input validation
  (max_length constraints), proper HTTP status codes (503 for missing LLM)

Configuration & wiring:
- config/settings.py: Pydantic v2 SettingsConfigDict (no module-level singleton)
- config/container.py: create_app() factory with lifespan-managed DI
- main.py: minimal entry point

Test infrastructure (90 tests, all passing):
- tests/fakes/: in-memory implementations of all 4 ports
- tests/domain/: 26 tests for models and ChatService
- tests/adapters/: 64 tests for all adapters using fakes/mocks
- No real API calls, no model downloads, no disk I/O in fast tests

Also fixes: aiosqlite version constraint (>=0.19.0), adds hatch build
targets for new package layout.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-08 15:51:16 -05:00

254 lines
9.5 KiB
Python

"""OpenRouter outbound adapter — implements LLMPort via the OpenRouter API.
This module is the sole owner of:
- The SYSTEM_PROMPT for the Strat-O-Matic rules assistant
- All JSON parsing / extraction logic for LLM responses
- The persistent httpx.AsyncClient connection pool
It returns domain.models.LLMResponse exclusively; no legacy app.* types leak
through this boundary.
"""
from __future__ import annotations
import json
import logging
import re
from typing import Optional
import httpx
from domain.models import LLMResponse, RuleSearchResult
from domain.ports import LLMPort
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# System prompt
# ---------------------------------------------------------------------------
# NOTE: this prompt instructs the model to reply with a JSON object of the
# shape {"answer", "cited_rules", "confidence", "needs_human"}.  The adapter's
# response parsing reads exactly those four keys, so prompt and parser must be
# kept in sync; non-JSON replies are handled by a low-confidence fallback.
SYSTEM_PROMPT = """You are a helpful assistant for a Strat-O-Matic baseball league.
Your job is to answer questions about league rules and procedures using the provided rule excerpts.
CRITICAL RULES:
1. ONLY use information from the provided rules. If the rules don't contain the answer, say so clearly.
2. ALWAYS cite rule IDs when referencing a rule (e.g., "Rule 5.2.1(b) states that...")
3. If multiple rules are relevant, cite all of them.
4. If you're uncertain or the rules are ambiguous, say so and suggest asking a league administrator.
5. Keep responses concise but complete. Use examples when helpful from the rules.
6. Do NOT make up rules or infer beyond what's explicitly stated.
When answering:
- Start with a direct answer to the question
- Support with rule citations
- Include relevant details from the rules
- If no relevant rules found, explicitly state: "I don't have a rule that addresses this question."
Response format (JSON):
{
"answer": "Your response text",
"cited_rules": ["rule_id_1", "rule_id_2"],
"confidence": 0.0-1.0,
"needs_human": boolean
}
Higher confidence (0.8-1.0) when rules clearly answer the question.
Lower confidence (0.3-0.7) when rules partially address the question or are ambiguous.
Very low confidence (0.0-0.2) when rules don't address the question at all.
"""
# Regex for extracting rule IDs from free-text answers when cited_rules is empty.
# Matches patterns like "Rule 5.2.1(b)" or "Rule 7.4".
# The character class includes '.' so a sentence-ending period may be captured
# (e.g. "Rule 7.4." → raw match "7.4."). Matches are stripped of a trailing
# dot at the extraction site to normalise IDs like "7.4." → "7.4".
_RULE_ID_PATTERN = re.compile(r"Rule\s+([\d\.\(\)a-b]+)")
# ---------------------------------------------------------------------------
# Adapter
# ---------------------------------------------------------------------------
class OpenRouterLLM(LLMPort):
    """Outbound adapter that calls the OpenRouter chat completions API.

    A single httpx.AsyncClient is reused across all calls (connection pooling).
    Call ``await adapter.close()`` when tearing down to release the pool.

    Args:
        api_key: Bearer token for the OpenRouter API.
        model: OpenRouter model identifier, e.g. ``"openai/gpt-4o-mini"``.
        base_url: Full URL for the chat completions endpoint.
        http_client: Optional pre-built httpx.AsyncClient (useful for testing).
            When *None* a new client is created with a 120-second timeout.
    """

    def __init__(
        self,
        api_key: str,
        model: str,
        base_url: str = "https://openrouter.ai/api/v1/chat/completions",
        http_client: Optional[httpx.AsyncClient] = None,
    ) -> None:
        if not api_key:
            raise ValueError("api_key must not be empty")
        self._api_key = api_key
        self._model = model
        self._base_url = base_url
        # One persistent client for the adapter's lifetime; released in close().
        self._http: httpx.AsyncClient = http_client or httpx.AsyncClient(timeout=120.0)

    # ------------------------------------------------------------------
    # LLMPort implementation
    # ------------------------------------------------------------------
    async def generate_response(
        self,
        question: str,
        rules: list[RuleSearchResult],
        conversation_history: Optional[list[dict[str, str]]] = None,
    ) -> LLMResponse:
        """Call the OpenRouter API and return a structured LLMResponse.

        Args:
            question: The user's natural-language question.
            rules: Relevant rule excerpts retrieved from the knowledge base.
            conversation_history: Optional list of prior ``{"role": ..., "content": ...}``
                dicts. At most the last 6 messages are forwarded to stay within
                token budgets.

        Returns:
            LLMResponse with ``answer``, ``cited_rules``, ``confidence``, and
            ``needs_human`` populated from the LLM's JSON reply. On parse
            failure ``confidence=0.0`` and ``needs_human=True`` signal that
            the raw response could not be structured reliably.

        Raises:
            RuntimeError: When the API returns a non-200 HTTP status.
        """
        messages = self._build_messages(question, rules, conversation_history)
        logger.debug(
            "Sending request to OpenRouter model=%s messages=%d",
            self._model,
            len(messages),
        )
        response = await self._http.post(
            self._base_url,
            headers={
                "Authorization": f"Bearer {self._api_key}",
                "Content-Type": "application/json",
            },
            json={
                "model": self._model,
                "messages": messages,
                "temperature": 0.3,
                "max_tokens": 1000,
                "top_p": 0.9,
            },
        )
        if response.status_code != 200:
            raise RuntimeError(
                f"OpenRouter API error: {response.status_code} - {response.text}"
            )
        result = response.json()
        content: str = result["choices"][0]["message"]["content"]
        logger.debug("Received response content length=%d", len(content))
        return self._parse_content(content, rules)

    async def close(self) -> None:
        """Release the underlying HTTP connection pool.

        Should be called when the adapter is no longer needed (e.g. on
        application shutdown) to avoid resource leaks.
        """
        await self._http.aclose()

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------
    def _build_messages(
        self,
        question: str,
        rules: list[RuleSearchResult],
        conversation_history: Optional[list[dict[str, str]]],
    ) -> list[dict[str, str]]:
        """Assemble the messages list for the API request."""
        if rules:
            rules_context = "\n\n".join(
                f"Rule {r.rule_id}: {r.title}\n{r.content}" for r in rules
            )
            context_msg = (
                f"Here are the relevant rules for the question:\n\n{rules_context}"
            )
        else:
            context_msg = "No relevant rules were found in the knowledge base."
        messages: list[dict[str, str]] = [{"role": "system", "content": SYSTEM_PROMPT}]
        if conversation_history:
            # Limit to last 6 messages (3 exchanges) to avoid token overflow
            messages.extend(conversation_history[-6:])
        user_message = (
            f"{context_msg}\n\nUser question: {question}\n\n"
            "Answer the question based on the rules provided."
        )
        messages.append({"role": "user", "content": user_message})
        return messages

    def _parse_content(
        self, content: str, rules: list[RuleSearchResult]
    ) -> LLMResponse:
        """Parse the raw LLM content string into an LLMResponse.

        Handles these cases in order:
        1. JSON wrapped in a markdown fence (```json or a bare ```).
        2. Bare JSON string.
        3. Anything unusable (non-JSON, non-object JSON, or an object with a
           missing/non-string ``answer``) — falls back to the raw content with
           confidence=0.0 and needs_human=True.
        """
        try:
            json_str = self._extract_json_string(content)
            parsed = json.loads(json_str)
        except (json.JSONDecodeError, KeyError, IndexError) as exc:
            logger.warning("Failed to parse LLM response as JSON: %s", exc)
            return self._fallback_response(content)
        # The model can emit valid JSON that is not the expected object
        # (e.g. a bare string or a list); treat that like a parse failure.
        if not isinstance(parsed, dict):
            logger.warning("LLM JSON response is not an object: %s", type(parsed))
            return self._fallback_response(content)
        answer = parsed.get("answer")
        if not isinstance(answer, str) or not answer:
            # Previously `parsed["answer"]` raised an uncaught KeyError here;
            # degrade gracefully instead and flag for human review.
            logger.warning("LLM JSON response missing usable 'answer' field")
            return self._fallback_response(content)
        raw_cited = parsed.get("cited_rules")
        cited_rules: list[str] = (
            [str(r) for r in raw_cited] if isinstance(raw_cited, list) else []
        )
        # Regex fallback: if the model omitted cited_rules but mentioned rule
        # IDs inline, extract them from the answer text so callers have
        # attribution without losing information.
        if not cited_rules and rules:
            # Strip a trailing dot from each match to handle sentence-ending
            # punctuation (e.g. "Rule 7.4." → "7.4").
            matches = [m.rstrip(".") for m in _RULE_ID_PATTERN.findall(answer)]
            cited_rules = list(dict.fromkeys(matches))  # deduplicate, preserve order
        try:
            confidence = float(parsed.get("confidence", 0.5))
        except (TypeError, ValueError):
            # Non-numeric confidence from the model: fall back to neutral.
            confidence = 0.5
        # Clamp to the documented [0.0, 1.0] range (matching the clamped
        # similarity scores used elsewhere in this project).
        confidence = max(0.0, min(1.0, confidence))
        return LLMResponse(
            answer=answer,
            cited_rules=cited_rules,
            confidence=confidence,
            needs_human=bool(parsed.get("needs_human", False)),
        )

    @staticmethod
    def _fallback_response(content: str) -> LLMResponse:
        """Build the degraded response used when the LLM reply is unusable."""
        return LLMResponse(
            answer=content,
            cited_rules=[],
            confidence=0.0,
            needs_human=True,
        )

    @staticmethod
    def _extract_json_string(content: str) -> str:
        """Strip optional markdown fences and return the raw JSON string."""
        if "```json" in content:
            return content.split("```json")[1].split("```")[0].strip()
        stripped = content.strip()
        # Also tolerate a bare ``` fence without a language tag.
        if stripped.startswith("```") and stripped.endswith("```") and len(stripped) > 6:
            return stripped[3:-3].strip()
        return stripped