""" TTS Engine module for voice-server. Provides text-to-speech synthesis using Piper TTS. Supports multiple voice models with lazy loading and caching. """ import logging from pathlib import Path from typing import Protocol import numpy as np logger = logging.getLogger(__name__) class TTSEngine(Protocol): """Protocol defining the TTS engine interface.""" def synthesize(self, text: str, voice: str | None = None) -> np.ndarray: """Convert text to audio samples.""" ... def get_sample_rate(self) -> int: """Get the audio sample rate.""" ... def list_voices(self) -> list[dict]: """List available voice models.""" ... class PiperTTSEngine: """ Piper TTS engine implementation. Provides high-quality neural text-to-speech using Piper's ONNX models. Voice models are loaded lazily and cached for performance. """ def __init__(self, model_dir: str = "./models", default_voice: str = "en_US-lessac-medium"): """ Initialize the Piper TTS engine. Args: model_dir: Directory containing voice model files (.onnx + .onnx.json) default_voice: Default voice model name to use """ self.model_dir = Path(model_dir) self.default_voice = default_voice self._voices: dict = {} # Cache of loaded PiperVoice instances self._voice_metadata: dict = {} # Cache of voice metadata self._sample_rate: int = 22050 # Piper default sample rate # Ensure model directory exists self.model_dir.mkdir(parents=True, exist_ok=True) logger.info(f"PiperTTSEngine initialized with model_dir={model_dir}") def _get_voice_path(self, voice_name: str) -> tuple[Path, Path]: """ Get paths to voice model files. Args: voice_name: Name of the voice model Returns: Tuple of (onnx_path, json_path) """ onnx_path = self.model_dir / f"{voice_name}.onnx" json_path = self.model_dir / f"{voice_name}.onnx.json" return onnx_path, json_path def _load_voice(self, voice_name: str): """ Load a voice model (lazy loading with caching). Args: voice_name: Name of the voice model to load Returns: Loaded PiperVoice instance Raises: FileNotFoundError: If voice model files don't exist RuntimeError: If voice model fails to load """ if voice_name in self._voices: return self._voices[voice_name] onnx_path, json_path = self._get_voice_path(voice_name) if not onnx_path.exists(): raise FileNotFoundError( f"Voice model not found: {voice_name}. " f"Expected file: {onnx_path}" ) try: from piper import PiperVoice logger.info(f"Loading voice model: {voice_name}") voice = PiperVoice.load(str(onnx_path), config_path=str(json_path) if json_path.exists() else None) self._voices[voice_name] = voice # Update sample rate from loaded voice if hasattr(voice, 'config') and voice.config: self._sample_rate = voice.config.sample_rate logger.info(f"Voice model loaded: {voice_name} (sample_rate={self._sample_rate})") return voice except Exception as e: logger.error(f"Failed to load voice model {voice_name}: {e}") raise RuntimeError(f"Failed to load voice model: {e}") from e def synthesize(self, text: str, voice: str | None = None) -> np.ndarray: """ Convert text to audio samples. Args: text: Text to convert to speech voice: Voice model name (uses default if None) Returns: NumPy array of audio samples (int16) Raises: FileNotFoundError: If voice model not found RuntimeError: If synthesis fails """ voice_name = voice or self.default_voice if not text or not text.strip(): # Return empty audio for empty text return np.array([], dtype=np.int16) try: piper_voice = self._load_voice(voice_name) # Synthesize audio - piper returns an iterator of AudioChunk objects audio_chunks = [] for chunk in piper_voice.synthesize(text): # Each chunk has audio_int16_array property audio_chunks.append(chunk.audio_int16_array) if not audio_chunks: return np.array([], dtype=np.int16) # Concatenate all chunks audio_array = np.concatenate(audio_chunks) logger.debug(f"Synthesized {len(text)} chars -> {len(audio_array)} samples") return audio_array except FileNotFoundError: raise except Exception as e: logger.error(f"TTS synthesis failed: {e}") raise RuntimeError(f"TTS synthesis failed: {e}") from e def synthesize_to_float32(self, text: str, voice: str | None = None) -> np.ndarray: """ Convert text to float32 audio samples (normalized -1.0 to 1.0). This format is preferred by sounddevice for playback. Args: text: Text to convert to speech voice: Voice model name (uses default if None) Returns: NumPy array of float32 audio samples """ int16_audio = self.synthesize(text, voice) if len(int16_audio) == 0: return np.array([], dtype=np.float32) # Convert int16 to float32 normalized float32_audio = int16_audio.astype(np.float32) / 32768.0 return float32_audio def get_sample_rate(self) -> int: """Get the audio sample rate for the current voice.""" return self._sample_rate def list_voices(self) -> list[dict]: """ List available voice models in the model directory. Returns: List of voice info dictionaries with name, language, quality, etc. """ voices = [] if not self.model_dir.exists(): return voices # Find all .onnx files for onnx_file in self.model_dir.glob("*.onnx"): voice_name = onnx_file.stem json_file = onnx_file.with_suffix(".onnx.json") voice_info = { "name": voice_name, "language": self._extract_language(voice_name), "quality": self._extract_quality(voice_name), "size_mb": round(onnx_file.stat().st_size / (1024 * 1024), 1), "installed": True, } # Try to load additional metadata from JSON config if json_file.exists(): try: import json with open(json_file) as f: config = json.load(f) if "language" in config: voice_info["language"] = config["language"].get("code", voice_info["language"]) except Exception: pass # Use extracted values if JSON parsing fails voices.append(voice_info) return sorted(voices, key=lambda v: v["name"]) def _extract_language(self, voice_name: str) -> str: """Extract language code from voice name (e.g., 'en_US' from 'en_US-lessac-medium').""" parts = voice_name.split("-") if parts: return parts[0] return "unknown" def _extract_quality(self, voice_name: str) -> str: """Extract quality level from voice name (e.g., 'medium' from 'en_US-lessac-medium').""" parts = voice_name.split("-") if len(parts) >= 3: quality = parts[-1].lower() if quality in ("low", "medium", "high", "x_low", "x_high"): return quality return "medium" def is_voice_available(self, voice_name: str) -> bool: """Check if a voice model is installed.""" onnx_path, _ = self._get_voice_path(voice_name) return onnx_path.exists() def health_check(self) -> dict: """ Perform a health check on the TTS engine. Returns: Dict with status and any error messages """ try: # Check if piper is importable from piper import PiperVoice # noqa: F401 # Check if model directory exists if not self.model_dir.exists(): return { "status": "degraded", "error": f"Model directory does not exist: {self.model_dir}", } # Check if default voice is available if not self.is_voice_available(self.default_voice): available = [v["name"] for v in self.list_voices()] return { "status": "degraded", "error": f"Default voice not found: {self.default_voice}", "available_voices": available, } return {"status": "healthy"} except ImportError as e: return { "status": "unhealthy", "error": f"Piper TTS not installed: {e}", } except Exception as e: return { "status": "unhealthy", "error": str(e), }