# voice-server/app/tts_engine.py

"""
TTS Engine module for voice-server.

Provides text-to-speech synthesis using Piper TTS.
Supports multiple voice models with lazy loading and caching.
"""

import logging
from pathlib import Path
from typing import Protocol

import numpy as np

# Module-level logger named after this module; used by the engine below for
# load, synthesis, and error messages.
logger = logging.getLogger(__name__)


class TTSEngine(Protocol):
    """Protocol defining the TTS engine interface."""

    def synthesize(self, text: str, voice: str | None = None) -> np.ndarray:
        """Convert text to audio samples."""
        ...

    def get_sample_rate(self) -> int:
        """Get the audio sample rate."""
        ...

    def list_voices(self) -> list[dict]:
        """List available voice models."""
        ...


class PiperTTSEngine:
    """
    Piper TTS engine implementation.

    Provides high-quality neural text-to-speech using Piper's ONNX models.
    Voice models are loaded lazily and cached for performance.
    """

    def __init__(self, model_dir: str = "./models", default_voice: str = "en_US-lessac-medium"):
        """
        Set up engine state and make sure the model directory exists.

        No voice model is loaded here; models are loaded on first use.

        Args:
            model_dir: Folder holding the voice files (<name>.onnx + <name>.onnx.json)
            default_voice: Voice name used when a caller does not pick one
        """
        models_root = Path(model_dir)
        # Create the folder (and any missing parents); an existing folder is fine
        models_root.mkdir(parents=True, exist_ok=True)

        self.model_dir = models_root
        self.default_voice = default_voice
        self._sample_rate: int = 22050  # Piper default sample rate
        self._voices: dict = {}  # Loaded PiperVoice instances, keyed by voice name
        self._voice_metadata: dict = {}  # Reserved for voice metadata

        logger.info(f"PiperTTSEngine initialized with model_dir={model_dir}")

    def _get_voice_path(self, voice_name: str) -> tuple[Path, Path]:
        """
        Get paths to voice model files.

        Args:
            voice_name: Name of the voice model

        Returns:
            Tuple of (onnx_path, json_path)
        """
        onnx_path = self.model_dir / f"{voice_name}.onnx"
        json_path = self.model_dir / f"{voice_name}.onnx.json"
        return onnx_path, json_path

    def _load_voice(self, voice_name: str):
        """
        Load a voice model (lazy loading with caching).

        Args:
            voice_name: Name of the voice model to load

        Returns:
            Loaded PiperVoice instance

        Raises:
            FileNotFoundError: If voice model files don't exist
            RuntimeError: If voice model fails to load
        """
        if voice_name in self._voices:
            return self._voices[voice_name]

        onnx_path, json_path = self._get_voice_path(voice_name)

        if not onnx_path.exists():
            raise FileNotFoundError(
                f"Voice model not found: {voice_name}. "
                f"Expected file: {onnx_path}"
            )

        try:
            from piper import PiperVoice

            logger.info(f"Loading voice model: {voice_name}")
            voice = PiperVoice.load(str(onnx_path), config_path=str(json_path) if json_path.exists() else None)
            self._voices[voice_name] = voice

            # Update sample rate from loaded voice
            if hasattr(voice, 'config') and voice.config:
                self._sample_rate = voice.config.sample_rate

            logger.info(f"Voice model loaded: {voice_name} (sample_rate={self._sample_rate})")
            return voice

        except Exception as e:
            logger.error(f"Failed to load voice model {voice_name}: {e}")
            raise RuntimeError(f"Failed to load voice model: {e}") from e

    def synthesize(self, text: str, voice: str | None = None) -> np.ndarray:
        """
        Convert text to audio samples.

        Args:
            text: Text to convert to speech
            voice: Voice model name (uses default if None)

        Returns:
            NumPy array of audio samples (int16)

        Raises:
            FileNotFoundError: If voice model not found
            RuntimeError: If synthesis fails
        """
        voice_name = voice or self.default_voice

        if not text or not text.strip():
            # Return empty audio for empty text
            return np.array([], dtype=np.int16)

        try:
            piper_voice = self._load_voice(voice_name)

            # Synthesize audio - piper returns an iterator of AudioChunk objects
            audio_chunks = []
            for chunk in piper_voice.synthesize(text):
                # Each chunk has audio_int16_array property
                audio_chunks.append(chunk.audio_int16_array)

            if not audio_chunks:
                return np.array([], dtype=np.int16)

            # Concatenate all chunks
            audio_array = np.concatenate(audio_chunks)

            logger.debug(f"Synthesized {len(text)} chars -> {len(audio_array)} samples")
            return audio_array

        except FileNotFoundError:
            raise
        except Exception as e:
            logger.error(f"TTS synthesis failed: {e}")
            raise RuntimeError(f"TTS synthesis failed: {e}") from e

    def synthesize_to_float32(self, text: str, voice: str | None = None) -> np.ndarray:
        """
        Convert text to float32 audio samples (normalized -1.0 to 1.0).

        This format is preferred by sounddevice for playback.

        Args:
            text: Text to convert to speech
            voice: Voice model name (uses default if None)

        Returns:
            NumPy array of float32 audio samples
        """
        int16_audio = self.synthesize(text, voice)

        if len(int16_audio) == 0:
            return np.array([], dtype=np.float32)

        # Convert int16 to float32 normalized
        float32_audio = int16_audio.astype(np.float32) / 32768.0
        return float32_audio

    def get_sample_rate(self) -> int:
        """Get the audio sample rate for the current voice."""
        return self._sample_rate

    def list_voices(self) -> list[dict]:
        """
        List available voice models in the model directory.

        Returns:
            List of voice info dictionaries with name, language, quality, etc.
        """
        voices = []

        if not self.model_dir.exists():
            return voices

        # Find all .onnx files
        for onnx_file in self.model_dir.glob("*.onnx"):
            voice_name = onnx_file.stem
            json_file = onnx_file.with_suffix(".onnx.json")

            voice_info = {
                "name": voice_name,
                "language": self._extract_language(voice_name),
                "quality": self._extract_quality(voice_name),
                "size_mb": round(onnx_file.stat().st_size / (1024 * 1024), 1),
                "installed": True,
            }

            # Try to load additional metadata from JSON config
            if json_file.exists():
                try:
                    import json
                    with open(json_file) as f:
                        config = json.load(f)
                        if "language" in config:
                            voice_info["language"] = config["language"].get("code", voice_info["language"])
                except Exception:
                    pass  # Use extracted values if JSON parsing fails

            voices.append(voice_info)

        return sorted(voices, key=lambda v: v["name"])

    def _extract_language(self, voice_name: str) -> str:
        """Extract language code from voice name (e.g., 'en_US' from 'en_US-lessac-medium')."""
        parts = voice_name.split("-")
        if parts:
            return parts[0]
        return "unknown"

    def _extract_quality(self, voice_name: str) -> str:
        """Extract quality level from voice name (e.g., 'medium' from 'en_US-lessac-medium')."""
        parts = voice_name.split("-")
        if len(parts) >= 3:
            quality = parts[-1].lower()
            if quality in ("low", "medium", "high", "x_low", "x_high"):
                return quality
        return "medium"

    def is_voice_available(self, voice_name: str) -> bool:
        """Check if a voice model is installed."""
        onnx_path, _ = self._get_voice_path(voice_name)
        return onnx_path.exists()

    def health_check(self) -> dict:
        """
        Perform a health check on the TTS engine.

        Returns:
            Dict with status and any error messages
        """
        try:
            # Check if piper is importable
            from piper import PiperVoice  # noqa: F401

            # Check if model directory exists
            if not self.model_dir.exists():
                return {
                    "status": "degraded",
                    "error": f"Model directory does not exist: {self.model_dir}",
                }

            # Check if default voice is available
            if not self.is_voice_available(self.default_voice):
                available = [v["name"] for v in self.list_voices()]
                return {
                    "status": "degraded",
                    "error": f"Default voice not found: {self.default_voice}",
                    "available_voices": available,
                }

            return {"status": "healthy"}

        except ImportError as e:
            return {
                "status": "unhealthy",
                "error": f"Piper TTS not installed: {e}",
            }
        except Exception as e:
            return {
                "status": "unhealthy",
                "error": str(e),
            }