From a34aec06f11e90927d4a911d030380d89d3a9b90 Mon Sep 17 00:00:00 2001 From: Cal Corum Date: Fri, 19 Dec 2025 00:18:12 -0600 Subject: [PATCH] Initial commit: Voice server with Piper TTS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A local HTTP service that accepts text via POST and speaks it through system speakers using Piper TTS neural voice synthesis. Features: - POST /notify - Queue text for TTS playback - GET /health - Health check with TTS/audio/queue status - GET /voices - List installed voice models - Async queue processing (no overlapping audio) - Non-blocking audio via sounddevice - 73 tests covering API contract Tech stack: - FastAPI + Uvicorn - Piper TTS (neural voices, offline) - sounddevice (PortAudio) - Pydantic for validation 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- .env.example | 22 + .gitignore | 58 ++ PRD.md | 2147 ++++++++++++++++++++++++++++++++++++++++++ PROJECT_ROADMAP.json | 1012 ++++++++++++++++++++ README.md | 41 + app/__init__.py | 0 app/audio_player.py | 192 ++++ app/config.py | 98 ++ app/main.py | 140 +++ app/models.py | 162 ++++ app/queue_manager.py | 236 +++++ app/routes.py | 198 ++++ app/tts_engine.py | 287 ++++++ pyproject.toml | 69 ++ tests/__init__.py | 0 tests/test_api.py | 324 +++++++ tests/test_config.py | 300 ++++++ tests/test_models.py | 388 ++++++++ 18 files changed, 5674 insertions(+) create mode 100644 .env.example create mode 100644 .gitignore create mode 100644 PRD.md create mode 100644 PROJECT_ROADMAP.json create mode 100644 README.md create mode 100644 app/__init__.py create mode 100644 app/audio_player.py create mode 100644 app/config.py create mode 100644 app/main.py create mode 100644 app/models.py create mode 100644 app/queue_manager.py create mode 100644 app/routes.py create mode 100644 app/tts_engine.py create mode 100644 pyproject.toml create mode 100644 tests/__init__.py create mode 100644 tests/test_api.py 
create mode 100644 tests/test_config.py create mode 100644 tests/test_models.py diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..49c70cc --- /dev/null +++ b/.env.example @@ -0,0 +1,22 @@ +# Voice Server Configuration +# Copy this file to .env and modify as needed + +# Server Settings +HOST=0.0.0.0 +PORT=8888 + +# TTS Settings +MODEL_DIR=./models +DEFAULT_VOICE=en_US-lessac-medium +DEFAULT_RATE=170 + +# Queue Settings +QUEUE_MAX_SIZE=50 +REQUEST_TIMEOUT_SECONDS=60 + +# Logging +LOG_LEVEL=INFO +LOG_FILE=voice-server.log + +# Debug (disable TTS for testing) +# VOICE_ENABLED=true diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..e175b07 --- /dev/null +++ b/.gitignore @@ -0,0 +1,58 @@ +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# Virtual environments +.venv/ +venv/ +ENV/ + +# IDE +.idea/ +.vscode/ +*.swp +*.swo +*~ + +# Environment +.env +.env.local + +# Logs +*.log +logs/ + +# Voice models (large files) +models/*.onnx +models/*.onnx.json + +# Testing +.coverage +htmlcov/ +.pytest_cache/ +.tox/ + +# OS +.DS_Store +Thumbs.db + +# uv +uv.lock diff --git a/PRD.md b/PRD.md new file mode 100644 index 0000000..5c4c0f6 --- /dev/null +++ b/PRD.md @@ -0,0 +1,2147 @@ +# Product Requirements Document: Local Voice Server + +**Version:** 1.0 +**Date:** 2025-12-18 +**Author:** Atlas (Principal Software Architect) +**Project:** Local HTTP Voice Server for Text-to-Speech + +--- + +## Table of Contents + +1. [Executive Summary](#executive-summary) +2. [Goals and Non-Goals](#goals-and-non-goals) +3. [Technical Requirements](#technical-requirements) +4. [System Architecture](#system-architecture) +5. [API Specification](#api-specification) +6. [TTS Engine Analysis](#tts-engine-analysis) +7. [Web Framework Selection](#web-framework-selection) +8. 
[Audio Playback Strategy](#audio-playback-strategy) +9. [Error Handling Strategy](#error-handling-strategy) +10. [Implementation Checklist](#implementation-checklist) +11. [Testing Strategy](#testing-strategy) +12. [Future Considerations](#future-considerations) + +--- + +## Executive Summary + +### Project Overview + +This project delivers a local HTTP service that accepts POST requests containing text strings and converts them to speech through the computer's speakers. The service will run locally on Linux (Nobara/Fedora 42), providing fast, offline text-to-speech capabilities without requiring external API calls or internet connectivity. + +### Success Metrics + +- **Response Time:** TTS conversion and playback initiation within 200ms for short texts (< 100 characters) +- **Reliability:** 99.9% successful request handling under normal operating conditions +- **Concurrency:** Support for at least 5 concurrent TTS requests with proper queuing +- **Audio Quality:** Clear, intelligible speech output comparable to Google TTS quality +- **Startup Time:** Server ready to accept requests within 2 seconds of launch + +### Technical Stack + +| Component | Technology | Justification | +|-----------|-----------|---------------| +| Web Framework | FastAPI | Async support, high performance (15k-20k req/s), automatic API documentation | +| TTS Engine | Piper TTS | Neural voice quality, offline, optimized for local inference, ONNX-based | +| Audio Playback | sounddevice | Cross-platform, Pythonic API, excellent NumPy integration, non-blocking playback | +| Package Manager | uv | Fast Python package management (user preference) | +| ASGI Server | Uvicorn | High-performance ASGI server, native FastAPI integration | +| Async Runtime | asyncio | Built-in Python async support for concurrent request handling | + +### Timeline Estimate + +- **Phase 1 - Core Implementation:** 2-3 days (basic HTTP server + TTS integration) +- **Phase 2 - Error Handling & Testing:** 1-2 days 
(comprehensive error handling, unit tests) +- **Phase 3 - Concurrency & Queue Management:** 1-2 days (async queue, sequential playback of concurrent requests) +- **Total Estimated Time:** 4-7 days for production-ready v1.0 + +### Resource Requirements + +- **Development:** 1 full-stack Python developer with async programming experience +- **Testing:** Access to Linux environment (Nobara/Fedora 42) with audio hardware +- **Infrastructure:** Local development machine with 2+ CPU cores, 4GB+ RAM + +--- + +## Goals and Non-Goals + +### Goals + +**Primary Goals:** +1. Create a local HTTP service that accepts text via POST requests +2. Convert text to speech using high-quality offline TTS +3. Play audio through system speakers with minimal latency +4. Support concurrent requests with proper queue management +5. Provide comprehensive error handling and logging +6. Depend on no external services or network access at runtime (fully offline capable) + +**Secondary Goals:** +1. Automatic API documentation via FastAPI's built-in OpenAPI support +2. Configurable TTS parameters (voice, speed, volume) via request parameters +3. Health check endpoint for service monitoring +4. Graceful handling of long-running text conversions +5. Support for multiple voice models + +### Non-Goals + +**Explicitly Out of Scope:** +1. Cloud-based or external API integration +2. Speech-to-text (STT) capabilities +3. Audio file storage or retrieval +4. User authentication or authorization +5. Rate limiting or quota management +6. Multi-language UI or web interface +7. Real-time streaming audio synthesis +8. Mobile app integration +9. Persistent audio history or logging +10. Advanced audio effects (reverb, pitch shifting, etc.) 
+ +--- + +## Technical Requirements + +### Functional Requirements + +#### FR1: HTTP Server +- **FR1.1:** Server SHALL listen on configurable host and port (default: `0.0.0.0:8888`) +- **FR1.2:** Server SHALL accept POST requests to `/notify` endpoint +- **FR1.3:** Server SHALL accept JSON payload with `message` field containing text +- **FR1.4:** Server SHALL return HTTP 202 (Accepted) with a queue confirmation once the request is enqueued +- **FR1.5:** Server SHALL support CORS for local development + +#### FR2: Text-to-Speech Conversion +- **FR2.1:** System SHALL convert text strings to audio using Piper TTS +- **FR2.2:** System SHALL support configurable voice models via request parameters +- **FR2.3:** System SHALL support adjustable speech rate (50-400 words per minute) +- **FR2.4:** System SHALL handle text inputs from 1 to 10,000 characters +- **FR2.5:** System SHALL use default voice if not specified in request + +#### FR3: Audio Playback +- **FR3.1:** System SHALL play generated audio through default system audio output +- **FR3.2:** System SHALL support non-blocking audio playback +- **FR3.3:** System SHALL queue concurrent requests in FIFO order +- **FR3.4:** System SHALL allow configurable maximum queue size (default: 50) +- **FR3.5:** System SHALL provide feedback when queue is full + +#### FR4: Configuration +- **FR4.1:** System SHALL support configuration via environment variables +- **FR4.2:** System SHALL support configuration via command-line arguments +- **FR4.3:** System SHALL provide sensible defaults for all configuration values +- **FR4.4:** System SHALL validate configuration at startup + +#### FR5: Error Handling +- **FR5.1:** System SHALL return appropriate HTTP error codes for failures +- **FR5.2:** System SHALL log all errors with timestamps and context +- **FR5.3:** System SHALL continue operating after non-fatal errors +- **FR5.4:** System SHALL gracefully handle TTS engine failures +- **FR5.5:** System SHALL provide detailed error messages in responses + +### 
Non-Functional Requirements + +#### NFR1: Performance +- **NFR1.1:** API response time SHALL be < 50ms (excluding TTS processing) +- **NFR1.2:** TTS conversion SHALL complete in < 2 seconds for 500 character texts +- **NFR1.3:** System SHALL handle 20+ requests per second without degradation +- **NFR1.4:** Memory usage SHALL remain < 500MB under normal load +- **NFR1.5:** CPU usage SHALL average < 30% during active TTS processing + +#### NFR2: Reliability +- **NFR2.1:** System SHALL maintain 99.9% uptime during operation +- **NFR2.2:** System SHALL recover from audio device disconnections +- **NFR2.3:** System SHALL handle Out-of-Memory conditions gracefully +- **NFR2.4:** System SHALL log all critical errors for debugging + +#### NFR3: Maintainability +- **NFR3.1:** Code SHALL maintain > 80% test coverage +- **NFR3.2:** All functions SHALL include docstrings with type hints +- **NFR3.3:** Code SHALL follow PEP 8 style guidelines +- **NFR3.4:** Dependencies SHALL be pinned to specific versions +- **NFR3.5:** README SHALL provide clear setup and usage instructions + +#### NFR4: Security +- **NFR4.1:** System SHALL sanitize all text inputs to prevent injection attacks +- **NFR4.2:** System SHALL limit request payload size to 1MB +- **NFR4.3:** System SHALL not expose internal stack traces in API responses +- **NFR4.4:** System SHALL log all incoming requests for audit purposes + +#### NFR5: Compatibility +- **NFR5.1:** System SHALL run on Linux (Nobara/Fedora 42) +- **NFR5.2:** System SHALL support Python 3.9+ +- **NFR5.3:** System SHALL work with standard ALSA/PulseAudio setups +- **NFR5.4:** System SHALL be deployable as a systemd service + +--- + +## System Architecture + +### High-Level Architecture + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ Client Applications │ +│ (AI Agents, Scripts, Other Services) │ +└────────────────────────────┬────────────────────────────────────┘ + │ HTTP POST /notify + │ JSON: {"message": "text"} + ▼ 
+┌─────────────────────────────────────────────────────────────────┐ +│ FastAPI Web Server │ +│ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │ +│ │ /notify │ │ /health │ │ /docs │ │ +│ │ endpoint │ │ endpoint │ │ (Swagger) │ │ +│ └──────┬───────┘ └──────────────┘ └──────────────┘ │ +│ │ │ +│ │ Validates & Enqueues │ +│ ▼ │ +│ ┌──────────────────────────────────────────────────┐ │ +│ │ Async Request Queue │ │ +│ │ (asyncio.Queue with max size limit) │ │ +│ └──────────────────┬───────────────────────────────┘ │ +└────────────────────┬┼───────────────────────────────────────────┘ + ││ + ││ Background Task Processing + ▼▼ +┌─────────────────────────────────────────────────────────────────┐ +│ TTS Processing Layer │ +│ ┌────────────────────────────────────────────────────┐ │ +│ │ Piper TTS Engine │ │ +│ │ ┌──────────────┐ ┌──────────────┐ │ │ +│ │ │ Voice Models │ │ ONNX Runtime │ │ │ +│ │ │ (.onnx + │ │ Inference │ │ │ +│ │ │ .json) │ │ Engine │ │ │ +│ │ └──────────────┘ └──────────────┘ │ │ +│ └─────────────────────────┬──────────────────────────┘ │ +│ │ Generate WAV │ +│ ▼ │ +│ ┌────────────────────────────────────────────────────┐ │ +│ │ In-Memory Audio Buffer │ │ +│ │ (NumPy array / bytes) │ │ +│ └─────────────────────────┬──────────────────────────┘ │ +└────────────────────────────┼───────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────┐ +│ Audio Playback Layer │ +│ ┌────────────────────────────────────────────────────┐ │ +│ │ PyAudio Stream Manager │ │ +│ │ - Callback-based playback │ │ +│ │ - Non-blocking operation │ │ +│ │ - Stream lifecycle management │ │ +│ └─────────────────────────┬──────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌────────────────────────────────────────────────────┐ │ +│ │ System Audio Output (ALSA/PulseAudio) │ │ +│ └────────────────────────────────────────────────────┘ │ +└─────────────────────────────────────────────────────────────────┘ + │ + ▼ + 🔊 Computer Speakers +``` + 
+### Component Descriptions + +#### 1. FastAPI Web Server +- **Responsibilities:** + - Accept and validate HTTP POST requests + - Provide automatic OpenAPI documentation + - Handle CORS configuration + - Route requests to appropriate handlers + - Return HTTP responses with appropriate status codes + +- **Dependencies:** + - FastAPI framework + - Uvicorn ASGI server + - Pydantic for request/response validation + +#### 2. Async Request Queue +- **Responsibilities:** + - Queue incoming TTS requests in FIFO order + - Prevent queue overflow with configurable max size + - Enable asynchronous processing without blocking HTTP responses + - Provide queue status information + +- **Implementation:** + - `asyncio.Queue` for async-safe queuing + - Background task workers to process queue + - Queue metrics (size, processed count, errors) + +#### 3. TTS Processing Layer +- **Responsibilities:** + - Load and manage Piper TTS voice models + - Convert text to audio waveforms + - Handle voice model selection + - Configure TTS parameters (rate, pitch, volume) + - Generate in-memory audio buffers + +- **Implementation:** + - Piper TTS Python bindings + - ONNX Runtime for model inference + - Voice model caching for performance + - Error handling for model loading failures + +#### 4. Audio Playback Layer +- **Responsibilities:** + - Initialize audio output streams + - Play audio buffers through system speakers + - Support non-blocking playback + - Handle audio device errors + - Manage stream lifecycle + +- **Implementation:** + - sounddevice for cross-platform audio I/O + - Non-blocking `sd.play()` with background playback + - Simple NumPy array integration + - Graceful handling of audio device disconnections + +### Data Flow + +**Request Processing Flow:** + +1. **HTTP Request Reception:** + - Client sends POST to `/notify` with JSON payload + - FastAPI validates request schema using Pydantic models + - Request is immediately acknowledged with HTTP 202 (Accepted) + +2. 
**Request Enqueueing:** + - Validated request is added to async queue + - If queue is full, return HTTP 503 (Service Unavailable) + - Queue position is logged for monitoring + +3. **Background Processing:** + - Background worker retrieves request from queue + - Text is passed to Piper TTS for conversion + - Piper generates WAV audio in memory + +4. **Audio Playback:** + - Audio buffer is passed to sounddevice + - sounddevice streams audio to system output + - Playback occurs in a background thread (non-blocking) + - Completion is logged + +5. **Error Handling:** + - Errors at any stage are caught and logged + - Failed requests are removed from queue + - Error metrics are updated + +### Technology Stack Justification + +#### FastAPI vs Flask + +**Decision: FastAPI** + +**Rationale:** +- **Performance:** FastAPI handles 15,000-20,000 req/s vs Flask's 2,000-3,000 req/s ([Strapi Comparison](https://strapi.io/blog/fastapi-vs-flask-python-framework-comparison)) +- **Async Native:** Built on ASGI with native async/await support, critical for non-blocking TTS processing +- **Type Safety:** Pydantic integration provides automatic request validation and serialization +- **Documentation:** Automatic OpenAPI (Swagger) documentation generation +- **Modern Architecture:** Designed for microservices and high-concurrency applications +- **Growing Adoption:** 78k GitHub stars, 38% developer adoption in 2025 (40% YoY increase) + +**Trade-offs:** +- Steeper learning curve compared to Flask +- Smaller ecosystem of extensions (though growing rapidly) +- Requires ASGI server (Uvicorn) vs Flask's built-in development server + +#### Piper TTS Engine Selection + +**Decision: Piper TTS** + +**Rationale:** +- **Voice Quality:** Neural TTS with "Google TTS level quality" ([AntiX Forum](https://www.antixforum.com/forums/topic/tts-text-to-speech-in-linux-piper/)) +- **Offline Operation:** Fully local, no internet required +- **Performance:** Optimized for local inference using ONNX Runtime +- **Resource 
Efficiency:** Runs on Raspberry Pi 4, suitable for desktop Linux +- **Easy Installation:** Available via pip (`pip install piper-tts`) +- **Active Development:** Maintained project with 2025 updates +- **Multiple Voices:** Extensive voice model library with quality/speed trade-offs + +**Comparison with Alternatives:** + +| Engine | Voice Quality | Speed | Resource Usage | Offline | Ease of Use | +|--------|---------------|-------|----------------|---------|-------------| +| **Piper TTS** | ⭐⭐⭐⭐⭐ Neural | ⭐⭐⭐⭐ Fast | ⭐⭐⭐⭐ Medium | ✅ Yes | ⭐⭐⭐⭐ Easy | +| pyttsx3 | ⭐⭐ Robotic | ⭐⭐⭐⭐⭐ Very Fast | ⭐⭐⭐⭐⭐ Very Low | ✅ Yes | ⭐⭐⭐⭐⭐ Very Easy | +| eSpeak | ⭐⭐ Robotic | ⭐⭐⭐⭐⭐ Very Fast | ⭐⭐⭐⭐⭐ Very Low | ✅ Yes | ⭐⭐⭐⭐ Easy | +| gTTS | ⭐⭐⭐⭐⭐ Neural | ⭐⭐ Slow | ⭐⭐⭐⭐ Low | ❌ No | ⭐⭐⭐⭐⭐ Very Easy | +| Coqui TTS | ⭐⭐⭐⭐⭐ Neural | ⭐⭐⭐ Medium | ⭐⭐ High | ✅ Yes | ⭐⭐ Complex | + +**Trade-offs:** +- Larger model files (~20-100MB per voice) vs simple engines +- Higher resource usage than pyttsx3/eSpeak +- Requires ONNX Runtime dependency + +#### sounddevice for Audio Playback + +**Decision: sounddevice** + +**Rationale:** +- **Pythonic API:** Clean, intuitive interface that feels native to Python +- **NumPy Integration:** Direct support for NumPy arrays (perfect for Piper TTS output) +- **Non-Blocking:** Simple `sd.play()` returns immediately, audio plays in background +- **Cross-Platform:** Works on Linux, Windows, macOS via PortAudio backend +- **Active Maintenance:** Well-maintained with regular updates +- **Simple Async:** Easy integration with asyncio via `sd.wait()` or callbacks + +**Comparison with Alternatives:** + +| Library | Non-Blocking | Dependencies | Maintenance | Linux Support | +|---------|-------------|--------------|-------------|---------------| +| **sounddevice** | ✅ Native | PortAudio | ⭐⭐⭐⭐ Active | ✅ Excellent | +| PyAudio | ✅ Callbacks | PortAudio | ⭐⭐⭐ Active | ✅ Excellent | +| simpleaudio | ✅ Async | None | ❌ Archived | ⭐⭐⭐ Good | +| pygame | ⭐ Limited | SDL | 
⭐⭐⭐⭐ Active | ⭐⭐⭐⭐ Excellent | + +**Why sounddevice over PyAudio:** +- Simpler API - `sd.play(audio, samplerate)` vs PyAudio's stream setup +- Better NumPy support - no conversion needed from Piper's output +- More Pythonic - feels like a modern Python library +- Easier async integration - works naturally with asyncio + +--- + +## API Specification + +### Endpoint: POST /notify + +**Description:** Accept text string and queue for TTS playback + +**Request Schema:** + +```json +{ + "message": "string (required)", + "voice": "string (optional)", + "rate": "integer (optional, default: 170)", + "voice_enabled": "boolean (optional, default: true)" +} +``` + +**Request Parameters:** + +| Parameter | Type | Required | Default | Constraints | Description | +|-----------|------|----------|---------|-------------|-------------| +| `message` | string | Yes | - | 1-10000 chars | Text to convert to speech | +| `voice` | string | No | `en_US-lessac-medium` | Valid voice model name | Piper voice model to use | +| `rate` | integer | No | `170` | 50-400 | Speech rate in words per minute | +| `voice_enabled` | boolean | No | `true` | - | Enable/disable TTS (for debugging) | + +**Example Request:** + +```bash +curl -X POST http://localhost:8888/notify \ + -H "Content-Type: application/json" \ + -d '{ + "message": "Hello, this is a test of the voice server", + "rate": 200, + "voice_enabled": true + }' +``` + +**Response Schema (Success - 202 Accepted):** + +```json +{ + "status": "queued", + "message_length": 42, + "queue_position": 3, + "estimated_duration": 2.5, + "voice_model": "en_US-lessac-medium" +} +``` + +**Response Schema (Error - 400 Bad Request):** + +```json +{ + "error": "validation_error", + "detail": "message field is required", + "timestamp": "2025-12-18T10:30:45.123Z" +} +``` + +**Response Schema (Error - 503 Service Unavailable):** + +```json +{ + "error": "queue_full", + "detail": "TTS queue is full, please retry later", + "queue_size": 50, + "timestamp": 
"2025-12-18T10:30:45.123Z" +} +``` + +**HTTP Status Codes:** + +| Code | Meaning | Scenario | +|------|---------|----------| +| 202 | Accepted | Request successfully queued for processing | +| 400 | Bad Request | Invalid request parameters or malformed JSON | +| 413 | Payload Too Large | Message exceeds 10,000 characters | +| 422 | Unprocessable Entity | Valid JSON but invalid parameter values | +| 500 | Internal Server Error | TTS engine failure or unexpected error | +| 503 | Service Unavailable | Queue is full or service is shutting down | + +--- + +### Endpoint: GET /health + +**Description:** Health check endpoint for monitoring + +**Request:** No parameters + +**Response Schema (Healthy - 200 OK):** + +```json +{ + "status": "healthy", + "uptime_seconds": 3600, + "queue_size": 2, + "queue_capacity": 50, + "tts_engine": "piper", + "audio_output": "available", + "voice_models_loaded": ["en_US-lessac-medium"], + "total_requests": 1523, + "failed_requests": 12, + "timestamp": "2025-12-18T10:30:45.123Z" +} +``` + +**Response Schema (Unhealthy - 503 Service Unavailable):** + +```json +{ + "status": "unhealthy", + "errors": [ + "Audio output device unavailable", + "TTS engine failed to initialize" + ], + "timestamp": "2025-12-18T10:30:45.123Z" +} +``` + +--- + +### Endpoint: GET /docs + +**Description:** Automatic Swagger UI documentation (provided by FastAPI) + +**Access:** `http://localhost:8888/docs` + +**Features:** +- Interactive API testing +- Schema visualization +- Request/response examples +- Authentication testing (if implemented) + +--- + +### Endpoint: GET /voices + +**Description:** List available TTS voice models + +**Request:** No parameters + +**Response Schema (200 OK):** + +```json +{ + "voices": [ + { + "name": "en_US-lessac-medium", + "language": "en_US", + "quality": "medium", + "size_mb": 63.5, + "installed": true + }, + { + "name": "en_US-libritts-high", + "language": "en_US", + "quality": "high", + "size_mb": 108.2, + "installed": false + } + 
], + "default_voice": "en_US-lessac-medium" +} +``` + +--- + +## TTS Engine Analysis + +### Detailed Comparison Matrix + +| Engine | Voice Quality | Latency | CPU Usage | Memory | Offline | Linux Support | Python API | Maintenance | +|--------|---------------|---------|-----------|--------|---------|---------------|------------|-------------| +| **Piper TTS** | ⭐⭐⭐⭐⭐ | ~500ms | Medium | ~200MB | ✅ | ✅ Excellent | ✅ Native | 🟢 Active | +| **pyttsx3** | ⭐⭐ | ~100ms | Low | ~50MB | ✅ | ✅ Good | ✅ Native | 🟢 Active | +| **eSpeak-ng** | ⭐⭐ | ~50ms | Very Low | ~20MB | ✅ | ✅ Excellent | ⚠️ Wrapper | 🟢 Active | +| **gTTS** | ⭐⭐⭐⭐⭐ | ~2000ms | Low | ~30MB | ❌ | ✅ Good | ✅ Native | 🟢 Active | +| **Coqui TTS** | ⭐⭐⭐⭐⭐ | ~1500ms | High | ~500MB | ✅ | ✅ Good | ✅ Native | 🟡 Slow | +| **Festival** | ⭐⭐⭐ | ~300ms | Low | ~100MB | ✅ | ✅ Excellent | ⚠️ Wrapper | 🟡 Slow | +| **Mimic3** | ⭐⭐⭐⭐ | ~800ms | Medium | ~300MB | ✅ | ✅ Good | ❌ HTTP only | 🟢 Active | + +### Detailed Engine Profiles + +#### 1. Piper TTS (RECOMMENDED) + +**Pros:** +- Neural TTS with natural-sounding voices +- Optimized for local inference (ONNX Runtime) +- Multiple quality levels (low/medium/high) +- Extensive language and voice support +- Active development and community +- Easy pip installation +- GPU acceleration support (CUDA) + +**Cons:** +- Larger model files (20-100MB per voice) +- Higher resource usage than simple engines +- Initial model download required +- Slightly higher latency than robotic engines + +**Installation:** +```bash +uv pip install piper-tts +``` + +**Usage Example:** +```python +from piper import PiperVoice +import wave + +voice = PiperVoice.load("en_US-lessac-medium.onnx") +with wave.open("output.wav", "wb") as wav_file: + voice.synthesize("Hello world", wav_file) +``` + +**Voice Quality Sample:** +- **Low Quality:** Faster, smaller models (~20MB), decent quality +- **Medium Quality:** Balanced performance (~60MB), recommended default +- **High Quality:** Best quality (~100MB), 
slower inference + +**References:** +- [GitHub Repository](https://github.com/rhasspy/piper) +- [PyPI Package](https://pypi.org/project/piper-tts/) +- [Voice Model Library](https://github.com/rhasspy/piper/blob/master/VOICES.md) + +--- + +#### 2. pyttsx3 + +**Pros:** +- Extremely lightweight and fast +- Cross-platform (Windows SAPI5, macOS NSSpeech, Linux eSpeak) +- Zero external dependencies +- Simple API +- No model downloads required + +**Cons:** +- Robotic voice quality +- Limited voice customization +- Depends on system TTS engines + +**Installation:** +```bash +uv pip install pyttsx3 +``` + +**Usage Example:** +```python +import pyttsx3 + +engine = pyttsx3.init() +engine.say("Hello world") +engine.runAndWait() +``` + +**References:** +- [PyPI Package](https://pypi.org/project/pyttsx3/) +- [GitHub Repository](https://github.com/nateshmbhat/pyttsx3) + +--- + +#### 3. eSpeak-ng + +**Pros:** +- Ultra-fast synthesis +- 100+ language support +- Minimal resource usage +- Highly customizable +- System-level installation + +**Cons:** +- Robotic, mechanical voice quality +- Python wrapper required (not native) +- Less natural prosody + +**Installation:** +```bash +# System package +sudo dnf install espeak-ng + +# Python wrapper +uv pip install py3-tts # Uses eSpeak backend +``` + +**Usage Example:** +```bash +echo "Hello world" | espeak-ng +``` + +**References:** +- [eSpeak-ng Homepage](https://github.com/espeak-ng/espeak-ng) +- [Circuit Digest Comparison](https://circuitdigest.com/microcontroller-projects/best-text-to-speech-tts-converter-for-raspberry-pi-espeak-festival-google-tts-pico-and-pyttsx3) + +--- + +#### 4. 
Coqui TTS + +**Pros:** +- State-of-the-art neural voices +- Custom voice training support +- Multiple model architectures +- High-quality output + +**Cons:** +- Very high resource requirements +- Slower inference +- Complex setup +- Larger memory footprint +- Development has slowed + +**Installation:** +```bash +uv pip install TTS +``` + +**Usage Example:** +```python +from TTS.api import TTS + +tts = TTS("tts_models/en/ljspeech/tacotron2-DDC") +tts.tts_to_file(text="Hello world", file_path="output.wav") +``` + +**References:** +- [Coqui TTS GitHub](https://github.com/coqui-ai/TTS) + +--- + +### Recommendation: Piper TTS + +**Final Decision:** Piper TTS is the optimal choice for this project. + +**Justification:** +1. **Quality:** Neural voices with Google TTS-level quality +2. **Offline:** Fully local, no internet required (critical requirement) +3. **Performance:** Optimized for local inference, suitable for desktop Linux +4. **Python Native:** First-class Python API, easy integration +5. **Maintenance:** Actively maintained with 2025 updates +6. **Flexibility:** Multiple quality levels allow performance tuning +7. **Ease of Use:** Simple pip installation, straightforward API + +**Configuration Strategy:** +- **Default Voice:** `en_US-lessac-medium` (balanced quality/performance) +- **GPU Acceleration:** Optional CUDA support for faster inference +- **Model Caching:** Pre-load voice models at startup to reduce latency +- **Quality Toggle:** Allow clients to request different quality levels + +--- + +## Web Framework Selection + +### FastAPI: Detailed Analysis + +**Why FastAPI is Ideal for This Project:** + +#### 1. Async-First Architecture +FastAPI is built on Starlette (ASGI framework) with native async/await support. 
This is critical for our use case: + +```python +@app.post("/notify") +async def notify(request: NotifyRequest): + # Non-blocking enqueueing + await tts_queue.put(request) + return {"status": "queued"} + +# Background worker runs concurrently +async def process_queue(): + while True: + request = await tts_queue.get() + await generate_and_play_tts(request) +``` + +**Benefit:** HTTP responses return immediately while TTS processing happens in background. + +#### 2. Performance Benchmarks + +According to TechEmpower benchmarks ([Better Stack](https://betterstack.com/community/guides/scaling-python/flask-vs-fastapi/)): +- **FastAPI:** 15,000-20,000 requests/second +- **Flask:** 2,000-3,000 requests/second + +**Benefit:** 5-10x higher throughput for handling concurrent TTS requests. + +#### 3. Automatic API Documentation + +FastAPI generates interactive OpenAPI (Swagger) documentation automatically: + +```python +@app.post("/notify", response_model=NotifyResponse) +async def notify(request: NotifyRequest): + """ + Convert text to speech and play through speakers. + + - **message**: Text to convert (1-10000 characters) + - **rate**: Speech rate in WPM (50-400) + - **voice**: Voice model name (optional) + """ + ... +``` + +**Benefit:** Instant API documentation at `/docs` without manual maintenance. + +#### 4. Type Safety with Pydantic + +Automatic request validation and serialization: + +```python +from pydantic import BaseModel, Field, validator + +class NotifyRequest(BaseModel): + message: str = Field(..., min_length=1, max_length=10000) + rate: int = Field(170, ge=50, le=400) + voice_enabled: bool = True + + @validator('message') + def sanitize_message(cls, v): + # Automatic validation before handler runs + return v.strip() +``` + +**Benefit:** Eliminates manual validation code, reduces bugs. + +#### 5. 
Dependency Injection + +Clean separation of concerns: + +```python +async def get_tts_engine(): + return global_tts_engine + +@app.post("/notify") +async def notify( + request: NotifyRequest, + tts_engine: PiperVoice = Depends(get_tts_engine) +): + # tts_engine automatically injected + ... +``` + +**Benefit:** Testable, maintainable code with clear dependencies. + +#### 6. Background Tasks + +Built-in support for fire-and-forget tasks: + +```python +from fastapi import BackgroundTasks + +@app.post("/notify") +async def notify(request: NotifyRequest, background_tasks: BackgroundTasks): + background_tasks.add_task(generate_tts, request.message) + return {"status": "queued"} +``` + +**Benefit:** Simplified async task management. + +### Flask Comparison (Why Not Flask) + +**Flask Limitations for This Project:** + +1. **WSGI-Based:** Synchronous by default, requires Gunicorn/gevent for async +2. **Lower Performance:** 2,000-3,000 req/s vs FastAPI's 15,000-20,000 req/s +3. **Manual Documentation:** Requires Flask-RESTPlus or manual OpenAPI setup +4. **Manual Validation:** No built-in request validation, requires Flask-Pydantic extension +5. **Blocking I/O:** Natural behavior blocks request threads during TTS processing + +**When Flask Would Be Better:** +- Simple synchronous applications +- Heavy reliance on Flask extensions (Flask-Login, Flask-Admin) +- Team already experienced with Flask +- Need for Jinja2 templating (not needed here) + +**Verdict:** FastAPI is the clear winner for this async-heavy, high-performance use case. 
+ +--- + +## Audio Playback Strategy + +### sounddevice Implementation Details + +#### Non-Blocking Playback + +sounddevice provides simple, non-blocking audio playback out of the box: + +```python +import sounddevice as sd +import numpy as np + +class AudioPlayer: + """Simple audio player using sounddevice.""" + + def __init__(self, sample_rate: int = 22050): + self.sample_rate = sample_rate + self._current_stream = None + + def play(self, audio_data: np.ndarray, sample_rate: int = None): + """ + Non-blocking audio playback. + + Args: + audio_data: NumPy array of audio samples (float32 or int16) + sample_rate: Sample rate in Hz (defaults to instance default) + """ + rate = sample_rate or self.sample_rate + + # Stop any currently playing audio + self.stop() + + # Play audio - returns immediately, audio plays in background + sd.play(audio_data, rate) + + def is_playing(self) -> bool: + """Check if audio is currently playing.""" + return sd.get_stream() is not None and sd.get_stream().active + + def stop(self): + """Stop current playback.""" + sd.stop() + + def wait(self): + """Block until current playback completes.""" + sd.wait() + + async def wait_async(self): + """Async wait for playback completion.""" + import asyncio + while self.is_playing(): + await asyncio.sleep(0.05) +``` + +**Benefits of sounddevice:** +- `sd.play()` returns immediately - audio plays in background thread +- Direct NumPy array support - no conversion needed from Piper TTS +- Simple API - one line to play audio +- Built-in `sd.wait()` for synchronous waiting when needed + +--- + +#### Handling Concurrent Requests + +**Strategy:** Queue-based sequential playback with async queue management. 
+ +**Rationale:** +- Playing multiple TTS outputs simultaneously would create audio chaos +- Sequential playback ensures clarity +- Queue allows buffering during high request volume + +**Implementation:** + +```python +import asyncio +import sounddevice as sd +import numpy as np +from typing import Dict, Any + +class TTSQueue: + def __init__(self, max_size: int = 50): + self.queue = asyncio.Queue(maxsize=max_size) + self.player = AudioPlayer() + self.stats = {"processed": 0, "errors": 0} + + async def enqueue(self, request: Dict[str, Any]): + """Add TTS request to queue.""" + try: + await asyncio.wait_for( + self.queue.put(request), + timeout=1.0 + ) + return self.queue.qsize() + except asyncio.TimeoutError: + raise QueueFullError("TTS queue is full") + + async def process_queue(self): + """Background worker to process TTS queue.""" + while True: + request = await self.queue.get() + + try: + # Generate TTS audio + audio_data = await self.generate_tts(request) + + # Play audio (non-blocking start) + self.player.play(audio_data, sample_rate=22050) + + # Wait for playback to complete (async-friendly) + await self.player.wait_async() + + self.stats["processed"] += 1 + + except Exception as e: + logger.error(f"TTS processing error: {e}") + self.stats["errors"] += 1 + + finally: + self.queue.task_done() + + async def generate_tts(self, request: Dict[str, Any]) -> np.ndarray: + """Generate TTS audio using Piper.""" + # Run CPU-intensive TTS in thread pool + loop = asyncio.get_event_loop() + audio_data = await loop.run_in_executor( + None, + self._sync_generate_tts, + request["message"], + request.get("voice", "en_US-lessac-medium") + ) + return audio_data + + def _sync_generate_tts(self, text: str, voice: str) -> np.ndarray: + """Synchronous TTS generation (runs in thread pool).""" + # Piper TTS generation code + ... 
+ return audio_array +``` + +**Startup:** + +```python +from contextlib import asynccontextmanager + +@asynccontextmanager +async def lifespan(app: FastAPI): + # Startup: initialize queue and start processor + global tts_queue + tts_queue = TTSQueue(max_size=50) + asyncio.create_task(tts_queue.process_queue()) + yield + # Shutdown: stop audio playback + sd.stop() + +app = FastAPI(lifespan=lifespan) +``` + +--- + +#### Audio Device Error Handling + +**Common Issues:** +1. Audio device disconnected (headphones unplugged) +2. PulseAudio/ALSA daemon crashed +3. No audio devices available +4. Device in use by another process + +**Handling Strategy:** + +```python +import sounddevice as sd +import numpy as np +import time +import logging + +logger = logging.getLogger(__name__) + +class RobustAudioPlayer: + """Audio player with automatic retry and device recovery.""" + + def __init__(self, retry_attempts: int = 3, sample_rate: int = 22050): + self.retry_attempts = retry_attempts + self.sample_rate = sample_rate + self.verify_audio_devices() + + def verify_audio_devices(self): + """Verify audio devices are available.""" + try: + devices = sd.query_devices() + output_devices = [d for d in devices if d['max_output_channels'] > 0] + if not output_devices: + raise AudioDeviceError("No audio output devices found") + logger.info(f"Audio initialized: {len(output_devices)} output devices found") + logger.debug(f"Default output: {sd.query_devices(kind='output')['name']}") + except Exception as e: + logger.error(f"Audio initialization failed: {e}") + raise + + def play(self, audio_data: np.ndarray, sample_rate: int = None): + """Play audio with automatic retry on device errors.""" + rate = sample_rate or self.sample_rate + + for attempt in range(self.retry_attempts): + try: + sd.play(audio_data, rate) + return + except sd.PortAudioError as e: + logger.warning(f"Audio playback failed (attempt {attempt+1}): {e}") + + if attempt < self.retry_attempts - 1: + # Wait and retry - device 
may become available + sd.stop() + time.sleep(0.5) + self.verify_audio_devices() + else: + raise AudioPlaybackError(f"Failed after {self.retry_attempts} attempts: {e}") + + def is_playing(self) -> bool: + """Check if audio is currently playing.""" + stream = sd.get_stream() + return stream is not None and stream.active + + def stop(self): + """Stop current playback.""" + sd.stop() + + async def wait_async(self): + """Async wait for playback completion.""" + import asyncio + while self.is_playing(): + await asyncio.sleep(0.05) +``` + +**Device Query for Diagnostics:** + +```python +def get_audio_diagnostics() -> dict: + """Get audio system diagnostics for health check.""" + try: + devices = sd.query_devices() + default_output = sd.query_devices(kind='output') + return { + "status": "available", + "device_count": len(devices), + "default_output": default_output['name'], + "sample_rate": default_output['default_samplerate'] + } + except Exception as e: + return { + "status": "unavailable", + "error": str(e) + } +``` + +--- + +## Error Handling Strategy + +### Error Categories and Handling + +#### 1. Request Validation Errors + +**Scenarios:** +- Missing required fields +- Invalid parameter types +- Out-of-range values +- Malformed JSON + +**Handling:** + +```python +from fastapi import HTTPException, status +from pydantic import BaseModel, Field, ValidationError + +class NotifyRequest(BaseModel): + message: str = Field(..., min_length=1, max_length=10000) + rate: int = Field(170, ge=50, le=400) + voice: str = Field("en_US-lessac-medium", regex=r"^[\w-]+$") + +@app.exception_handler(ValidationError) +async def validation_exception_handler(request, exc): + return JSONResponse( + status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, + content={ + "error": "validation_error", + "detail": str(exc), + "timestamp": datetime.utcnow().isoformat() + } + ) +``` + +**HTTP Status:** 422 Unprocessable Entity + +--- + +#### 2. 
Queue Full Errors + +**Scenario:** Too many concurrent requests, queue is at capacity + +**Handling:** + +```python +class QueueFullError(Exception): + pass + +@app.post("/notify") +async def notify(request: NotifyRequest): + try: + position = await tts_queue.enqueue(request) + return { + "status": "queued", + "queue_position": position + } + except QueueFullError: + raise HTTPException( + status_code=status.HTTP_503_SERVICE_UNAVAILABLE, + detail={ + "error": "queue_full", + "message": "TTS queue is full, please retry later", + "queue_size": tts_queue.max_size, + "retry_after": 5 # seconds + } + ) +``` + +**HTTP Status:** 503 Service Unavailable +**Client Action:** Implement exponential backoff retry + +--- + +#### 3. TTS Engine Errors + +**Scenarios:** +- Voice model not found +- ONNX Runtime errors +- Memory allocation failures +- Corrupted model files + +**Handling:** + +```python +class TTSEngineError(Exception): + pass + +async def generate_tts(text: str, voice: str) -> np.ndarray: + try: + # Attempt TTS generation + audio = piper_voice.synthesize(text) + return audio + except FileNotFoundError: + raise TTSEngineError(f"Voice model '{voice}' not found") + except MemoryError: + raise TTSEngineError("Insufficient memory for TTS generation") + except Exception as e: + logger.error(f"TTS generation failed: {e}", exc_info=True) + raise TTSEngineError(f"TTS generation failed: {str(e)}") + +@app.exception_handler(TTSEngineError) +async def tts_engine_exception_handler(request, exc): + return JSONResponse( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + content={ + "error": "tts_engine_error", + "detail": str(exc), + "timestamp": datetime.utcnow().isoformat() + } + ) +``` + +**HTTP Status:** 500 Internal Server Error + +--- + +#### 4. 
Audio Playback Errors + +**Scenarios:** +- No audio devices available +- Audio device disconnected +- ALSA/PulseAudio errors +- Permission denied + +**Handling:** + +```python +class AudioPlaybackError(Exception): + pass + +async def play_audio(audio_data: np.ndarray): + try: + player.play_with_retry(audio_data, sample_rate=22050) + except AudioDeviceError as e: + logger.error(f"Audio device error: {e}") + raise AudioPlaybackError("No audio output devices available") + except OSError as e: + logger.error(f"Audio system error: {e}") + raise AudioPlaybackError(f"Audio playback failed: {str(e)}") + +# In queue processor +try: + await play_audio(audio_data) +except AudioPlaybackError as e: + logger.error(f"Playback error: {e}") + # Continue processing queue, don't crash server + stats["errors"] += 1 +``` + +**Action:** Log error, continue processing queue (don't crash server) + +--- + +#### 5. System Resource Errors + +**Scenarios:** +- Out of memory +- CPU overload +- Disk space exhausted + +**Handling:** + +```python +import psutil + +async def check_system_resources(): + """Monitor system resources.""" + memory = psutil.virtual_memory() + if memory.percent > 90: + logger.warning(f"High memory usage: {memory.percent}%") + + # interval=None is non-blocking (measures since the previous call); + # a blocking interval=1 would stall the event loop for 1s on every request + cpu = psutil.cpu_percent(interval=None) + if cpu > 90: + logger.warning(f"High CPU usage: {cpu}%") + +@app.middleware("http") +async def resource_monitoring_middleware(request, call_next): + """Monitor resources on each request.""" + await check_system_resources() + response = await call_next(request) + return response +``` + +**Action:** Log warnings, implement queue size limits to prevent resource exhaustion + +--- + +### Logging Strategy + +**Log Levels:** + +```python +import logging +from logging.handlers import RotatingFileHandler + +# Configure logging +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', + handlers=[ + RotatingFileHandler( + 'voice-server.log', + maxBytes=10*1024*1024, # 
10MB + backupCount=5 + ), + logging.StreamHandler() + ] +) + +logger = logging.getLogger(__name__) + +# Log levels usage: +logger.debug("TTS parameters: rate=%d, voice=%s", rate, voice) # DEBUG +logger.info("Request queued: position=%d", queue_position) # INFO +logger.warning("Queue nearly full: %d/%d", current, max_size) # WARNING +logger.error("TTS generation failed: %s", error, exc_info=True) # ERROR +logger.critical("Audio system unavailable, shutting down") # CRITICAL +``` + +**Structured Logging:** + +```python +import json +from datetime import datetime + +def log_request(request_id: str, message: str, status: str): + """Structured JSON logging.""" + log_entry = { + "timestamp": datetime.utcnow().isoformat(), + "request_id": request_id, + "message_length": len(message), + "status": status, + "event_type": "tts_request" + } + logger.info(json.dumps(log_entry)) +``` + +--- + +### Health Check Implementation + +**Comprehensive Health Checks:** + +```python +@app.get("/health") +async def health_check(): + """Detailed health status.""" + health_status = { + "status": "healthy", + "timestamp": datetime.utcnow().isoformat(), + "checks": {} + } + + # Check TTS engine + try: + tts_engine.test_synthesis("test") + health_status["checks"]["tts_engine"] = "healthy" + except Exception as e: + health_status["checks"]["tts_engine"] = f"unhealthy: {str(e)}" + health_status["status"] = "unhealthy" + + # Check audio output + try: + audio_player.test_output() + health_status["checks"]["audio_output"] = "healthy" + except Exception as e: + health_status["checks"]["audio_output"] = f"unhealthy: {str(e)}" + health_status["status"] = "unhealthy" + + # Check queue status + queue_size = tts_queue.qsize() + health_status["checks"]["queue"] = { + "size": queue_size, + "capacity": tts_queue.max_size, + "utilization": f"{(queue_size/tts_queue.max_size)*100:.1f}%" + } + + # Check system resources + health_status["checks"]["system"] = { + "memory_percent": psutil.virtual_memory().percent, 
+ "cpu_percent": psutil.cpu_percent(interval=0.1) + } + + status_code = 200 if health_status["status"] == "healthy" else 503 + return JSONResponse(status_code=status_code, content=health_status) +``` + +--- + +## Implementation Checklist + +### Phase 1: Core Infrastructure (Days 1-2) + +#### 1.1 Project Setup +- [ ] Initialize project directory `/mnt/NV2/Development/voice-server` +- [ ] Create Python virtual environment using `uv` +- [ ] Install core dependencies: + - [ ] `uv pip install fastapi` + - [ ] `uv pip install uvicorn[standard]` + - [ ] `uv pip install piper-tts` + - [ ] `uv pip install sounddevice` + - [ ] `uv pip install numpy` + - [ ] `uv pip install pydantic` + - [ ] `uv pip install python-dotenv` +- [ ] Create `requirements.txt` with pinned versions +- [ ] Create `.env.example` for configuration template +- [ ] Initialize git repository +- [ ] Create `.gitignore` (Python, IDEs, .env, voice models) + +#### 1.2 FastAPI Application Structure +- [ ] Create `app/main.py` with FastAPI app initialization +- [ ] Implement `/notify` endpoint skeleton +- [ ] Implement `/health` endpoint skeleton +- [ ] Implement `/voices` endpoint skeleton +- [ ] Configure CORS middleware +- [ ] Configure JSON logging middleware +- [ ] Create Pydantic models for request/response schemas +- [ ] Test basic server startup: `uvicorn app.main:app --reload` + +#### 1.3 Configuration Management +- [ ] Create `app/config.py` for configuration loading +- [ ] Implement environment variable loading +- [ ] Define configuration schema (host, port, queue size, etc.) 
+- [ ] Implement configuration validation at startup +- [ ] Create CLI argument parsing for overrides +- [ ] Document all configuration options in README + +### Phase 2: TTS Integration (Days 2-3) + +#### 2.1 Piper TTS Setup +- [ ] Create `app/tts_engine.py` module +- [ ] Implement `PiperTTSEngine` class +- [ ] Download default voice model (`en_US-lessac-medium`) +- [ ] Implement voice model loading with caching +- [ ] Implement text-to-audio synthesis method +- [ ] Add support for configurable speech rate +- [ ] Test TTS generation with sample text +- [ ] Measure TTS latency for various text lengths + +#### 2.2 Voice Model Management +- [ ] Create `models/` directory for voice model storage +- [ ] Implement voice model discovery (scan `models/` directory) +- [ ] Implement lazy loading of voice models (load on first use) +- [ ] Create model metadata cache (name, language, quality, size) +- [ ] Implement `/voices` endpoint to list available models +- [ ] Add error handling for missing/corrupted models +- [ ] Document voice model installation process + +#### 2.3 TTS Parameter Support +- [ ] Implement speech rate adjustment (50-400 WPM) +- [ ] Test rate adjustment across range +- [ ] Add voice selection via request parameter +- [ ] Implement voice validation (reject unknown voices) +- [ ] Add `voice_enabled` flag for debugging/testing +- [ ] Create comprehensive TTS unit tests + +### Phase 3: Audio Playback (Day 3) + +#### 3.1 sounddevice Integration +- [ ] Create `app/audio_player.py` module +- [ ] Implement `AudioPlayer` class with non-blocking `sd.play()` +- [ ] Verify sounddevice detects audio devices at startup +- [ ] Implement non-blocking playback method +- [ ] Implement async `wait_async()` method for queue processing +- [ ] Test audio playback with sample NumPy array +- [ ] Verify non-blocking behavior with concurrent requests + +#### 3.2 Audio Error Handling +- [ ] Implement audio device detection +- [ ] Add retry logic for device failures +- [ ] Handle 
device disconnection gracefully +- [ ] Test with headphones unplugged during playback +- [ ] Implement fallback to different audio devices +- [ ] Add detailed audio error logging +- [ ] Create audio system health check + +#### 3.3 Playback Testing +- [ ] Test simultaneous playback (should queue) +- [ ] Test rapid successive requests +- [ ] Measure audio latency (request → sound output) +- [ ] Test with various audio formats +- [ ] Verify memory cleanup after playback +- [ ] Test long-running playback (10+ minutes) + +### Phase 4: Queue Management (Day 4) + +#### 4.1 Async Queue Implementation +- [ ] Create `app/queue_manager.py` module +- [ ] Implement `TTSQueue` class with `asyncio.Queue` +- [ ] Set configurable max queue size (default: 50) +- [ ] Implement queue full detection +- [ ] Create background queue processor task +- [ ] Implement graceful queue shutdown +- [ ] Add queue metrics (size, processed, errors) + +#### 4.2 Request Processing Pipeline +- [ ] Implement request enqueueing in `/notify` endpoint +- [ ] Create background worker to process queue +- [ ] Integrate TTS generation in worker +- [ ] Integrate audio playback in worker +- [ ] Implement sequential playback (one at a time) +- [ ] Add request timeout handling (max 60s per request) +- [ ] Test queue with 100+ concurrent requests + +#### 4.3 Queue Monitoring +- [ ] Add queue size to `/health` endpoint +- [ ] Implement queue utilization metrics +- [ ] Add logging for queue events (enqueue, process, error) +- [ ] Create queue performance benchmarks +- [ ] Test queue overflow scenarios +- [ ] Document queue behavior and limits + +### Phase 5: Error Handling (Day 5) + +#### 5.1 Exception Handlers +- [ ] Implement custom exception classes +- [ ] Create `QueueFullError` exception handler +- [ ] Create `TTSEngineError` exception handler +- [ ] Create `AudioPlaybackError` exception handler +- [ ] Create `ValidationError` exception handler +- [ ] Implement generic exception handler (catch-all) +- [ ] Test 
all error scenarios + +#### 5.2 Logging Infrastructure +- [ ] Configure structured JSON logging +- [ ] Implement rotating file handler (10MB, 5 backups) +- [ ] Add request ID tracking across logs +- [ ] Implement log levels appropriately (DEBUG, INFO, WARNING, ERROR) +- [ ] Create log aggregation for queue processor +- [ ] Test log rotation +- [ ] Document log file locations and format + +#### 5.3 Health Monitoring +- [ ] Implement comprehensive `/health` endpoint +- [ ] Add TTS engine health check +- [ ] Add audio system health check +- [ ] Add queue status to health check +- [ ] Add system resource metrics (CPU, memory) +- [ ] Test health endpoint under load +- [ ] Create health check monitoring script + +### Phase 6: Testing (Days 5-6) + +#### 6.1 Unit Tests +- [ ] Create `tests/` directory structure +- [ ] Install pytest: `uv pip install pytest pytest-asyncio` +- [ ] Write tests for Pydantic models +- [ ] Write tests for TTS engine +- [ ] Write tests for audio player +- [ ] Write tests for queue manager +- [ ] Write tests for configuration loading +- [ ] Achieve 80%+ code coverage + +#### 6.2 Integration Tests +- [ ] Write tests for `/notify` endpoint +- [ ] Write tests for `/health` endpoint +- [ ] Write tests for `/voices` endpoint +- [ ] Test end-to-end request flow +- [ ] Test concurrent request handling +- [ ] Test queue overflow scenarios +- [ ] Test error scenarios (TTS failure, audio failure) + +#### 6.3 Performance Tests +- [ ] Create load testing script with `locust` or `wrk` +- [ ] Test 100 concurrent requests +- [ ] Measure request latency (p50, p95, p99) +- [ ] Measure TTS generation time +- [ ] Measure audio playback latency +- [ ] Measure memory usage under load +- [ ] Document performance characteristics + +#### 6.4 System Tests +- [ ] Test on target Linux environment (Nobara/Fedora 42) +- [ ] Test with different audio devices +- [ ] Test with PulseAudio and ALSA +- [ ] Test headphone disconnect/reconnect +- [ ] Test system resource exhaustion 
scenarios +- [ ] Test server restart recovery +- [ ] Test long-running stability (24+ hours) + +### Phase 7: Documentation & Deployment (Days 6-7) + +#### 7.1 Documentation +- [ ] Create comprehensive README.md: + - [ ] Project overview + - [ ] Installation instructions + - [ ] Configuration options + - [ ] Usage examples + - [ ] API documentation + - [ ] Troubleshooting guide +- [ ] Create CONTRIBUTING.md (if open source) +- [ ] Create CHANGELOG.md +- [ ] Document voice model installation +- [ ] Create architecture diagrams +- [ ] Add inline code documentation +- [ ] Create example client scripts (curl, Python) + +#### 7.2 Deployment Preparation +- [ ] Create systemd service file (`voice-server.service`) +- [ ] Test systemd service installation +- [ ] Test automatic restart on failure +- [ ] Create deployment script (`deploy.sh`) +- [ ] Document deployment process +- [ ] Create backup/restore procedures +- [ ] Test upgrade procedure + +#### 7.3 Production Hardening +- [ ] Enable production logging (disable debug logs) +- [ ] Configure log rotation +- [ ] Set up monitoring (optional: Prometheus, Grafana) +- [ ] Implement graceful shutdown (SIGTERM handling) +- [ ] Test crash recovery +- [ ] Implement rate limiting (optional) +- [ ] Security audit (input sanitization, resource limits) +- [ ] Performance tuning (queue size, worker count) + +--- + +## Testing Strategy + +### Unit Testing + +**Framework:** pytest with pytest-asyncio + +**Test Coverage Requirements:** +- Minimum 80% code coverage +- 100% coverage for critical paths (TTS, audio playback) +- All error handlers must have tests + +**Test Structure:** + +``` +tests/ +├── __init__.py +├── conftest.py # Shared fixtures +├── unit/ +│ ├── test_config.py # Configuration loading tests +│ ├── test_models.py # Pydantic model tests +│ ├── test_tts_engine.py # TTS engine tests +│ ├── test_audio_player.py # Audio player tests +│ └── test_queue.py # Queue manager tests +├── integration/ +│ ├── test_api.py # API endpoint 
tests +│ ├── test_end_to_end.py # Full request flow tests +│ └── test_errors.py # Error scenario tests +└── performance/ + └── test_load.py # Load testing +``` + +**Sample Unit Test:** + +```python +# tests/unit/test_tts_engine.py +import pytest +from app.tts_engine import PiperTTSEngine + +@pytest.fixture +def tts_engine(): + """Create TTS engine instance.""" + return PiperTTSEngine(model_dir="models/") + +def test_tts_engine_initialization(tts_engine): + """Test TTS engine initializes successfully.""" + assert tts_engine is not None + assert tts_engine.default_voice == "en_US-lessac-medium" + +def test_text_to_audio_conversion(tts_engine): + """Test converting text to audio.""" + audio = tts_engine.synthesize("Hello world") + assert audio is not None + assert len(audio) > 0 + assert audio.dtype == np.float32 + +def test_invalid_voice_raises_error(tts_engine): + """Test that invalid voice raises appropriate error.""" + with pytest.raises(ValueError, match="Voice model .* not found"): + tts_engine.synthesize("Hello", voice="invalid_voice") + +@pytest.mark.asyncio +async def test_async_synthesis(tts_engine): + """Test async TTS synthesis.""" + audio = await tts_engine.synthesize_async("Hello world") + assert audio is not None +``` + +**Sample Integration Test:** + +```python +# tests/integration/test_api.py +import pytest +from fastapi.testclient import TestClient +from app.main import app + +@pytest.fixture +def client(): + """Create test client.""" + return TestClient(app) + +def test_notify_endpoint_success(client): + """Test successful /notify request.""" + response = client.post( + "/notify", + json={"message": "Test message", "rate": 180} + ) + assert response.status_code == 202 + data = response.json() + assert data["status"] == "queued" + assert data["message_length"] == 12 + +def test_notify_endpoint_validation_error(client): + """Test /notify with invalid parameters.""" + response = client.post( + "/notify", + json={"message": "", "rate": 1000} # Empty 
message, invalid rate + ) + assert response.status_code == 422 + +def test_health_endpoint(client): + """Test /health endpoint.""" + response = client.get("/health") + assert response.status_code == 200 + data = response.json() + assert "status" in data + assert "queue_size" in data +``` + +--- + +### Load Testing + +**Tool:** wrk or locust + +**Sample wrk Test:** + +```bash +# Install wrk +sudo dnf install wrk + +# Run load test: 100 concurrent connections, 30 seconds +wrk -t4 -c100 -d30s -s post.lua http://localhost:8888/notify + +# post.lua script: +# wrk.method = "POST" +# wrk.headers["Content-Type"] = "application/json" +# wrk.body = '{"message": "Load test message"}' +``` + +**Sample locust Test:** + +```python +# locustfile.py +from locust import HttpUser, task, between + +class VoiceServerUser(HttpUser): + wait_time = between(1, 3) + + @task + def notify(self): + self.client.post("/notify", json={ + "message": "This is a load test message", + "rate": 180 + }) + + @task(5) + def health_check(self): + self.client.get("/health") + +# Run: locust -f locustfile.py --host=http://localhost:8888 +``` + +**Performance Benchmarks:** + +| Metric | Target | Acceptable | Unacceptable | +|--------|--------|------------|--------------| +| API Response Time (p95) | < 50ms | < 100ms | > 200ms | +| TTS Generation (500 chars) | < 2s | < 5s | > 10s | +| Requests/Second | > 50 | > 20 | < 10 | +| Memory Usage (idle) | < 200MB | < 500MB | > 1GB | +| Memory Usage (load) | < 500MB | < 1GB | > 2GB | +| Queue Processing Rate | > 10/s | > 5/s | < 2/s | + +--- + +### Manual Testing Checklist + +**Functional Testing:** +- [ ] Send POST request with valid message → Hear audio playback +- [ ] Send request with long text (5000 chars) → Successful playback +- [ ] Send request with special characters → Successful sanitization +- [ ] Send request with invalid voice → Receive 422 error +- [ ] Send request with rate=50 → Slow speech playback +- [ ] Send request with rate=400 → Fast speech 
playback +- [ ] Send 10 concurrent requests → All play sequentially +- [ ] Fill queue to capacity → Receive 503 error +- [ ] Check /health endpoint → Receive status information +- [ ] Check /voices endpoint → See available voice models +- [ ] Check /docs endpoint → See Swagger documentation + +**Error Scenario Testing:** +- [ ] Unplug headphones during playback → Graceful error handling +- [ ] Kill PulseAudio daemon → Audio error logged, server continues +- [ ] Send malformed JSON → Receive 422 error +- [ ] Send empty message → Receive 422 error +- [ ] Send 11,000 character message → Receive 422 error (exceeds 10,000 character limit) +- [ ] Restart server during playback → Queue cleared, server restarts + +**System Testing:** +- [ ] Run server for 24 hours → No memory leaks +- [ ] Send 10,000 requests → All processed successfully +- [ ] Monitor CPU usage during load → < 50% average +- [ ] Monitor memory usage during load → < 1GB +- [ ] Test on Fedora 42 → Successful operation +- [ ] Test with ALSA (without PulseAudio) → Successful operation + +--- + +## Future Considerations + +### Optional Features (Post-v1.0) + +#### 1. Advanced Voice Control +- **Pitch adjustment:** Allow clients to specify pitch modification +- **Volume control:** Per-request volume settings +- **Emotion/tone control:** Happy, sad, angry voice modulation (if TTS engine supports) +- **Voice cloning:** Custom voice model training (Coqui TTS integration) + +**Implementation Complexity:** Medium +**User Value:** High for accessibility and personalization + +--- + +#### 2. Audio Format Options +- **Output format selection:** Support WAV, MP3, OGG output +- **Sample rate options:** Allow 16kHz, 22kHz, 44.1kHz selection +- **Compression levels:** Configurable audio quality vs file size + +**Implementation Complexity:** Low +**User Value:** Medium (mostly for file storage use cases) + +--- + +#### 3. 
Streaming Audio +- **Real-time streaming:** Stream audio as it's generated (WebSocket or SSE) +- **Chunked TTS:** Generate and stream long texts in chunks +- **Lower latency:** Start playback before full text is synthesized + +**Implementation Complexity:** High +**User Value:** High for very long texts + +--- + +#### 4. SSML Support +- **Prosody control:** Fine-grained control over speech characteristics +- **Break insertion:** Explicit pauses and timing control +- **Phoneme specification:** Correct pronunciation for unusual words +- **Multi-voice support:** Different voices within single text + +**Example:** +```xml +<speak> + <prosody rate="slow">Hello, this is important.</prosody> + <voice name="en_US-lessac-medium">A different voice.</voice> +</speak> +``` + +**Implementation Complexity:** Medium +**User Value:** High for advanced use cases + +--- + +#### 5. Caching Layer +- **TTS result caching:** Cache frequently requested texts +- **Cache invalidation:** LRU eviction policy +- **Cache persistence:** Store cache across restarts +- **Cache statistics:** Hit rate monitoring + +**Implementation Complexity:** Low +**User Value:** High for repeated texts (notifications, alerts) + +**Sample Implementation:** + +```python +from functools import lru_cache +import hashlib + +class TTSCache: + def __init__(self, max_size: int = 1000): + self.cache = {} + self.max_size = max_size + + def get_cache_key(self, text: str, voice: str, rate: int) -> str: + """Generate cache key from TTS parameters.""" + content = f"{text}|{voice}|{rate}" + return hashlib.sha256(content.encode()).hexdigest() + + def get(self, text: str, voice: str, rate: int): + """Retrieve cached audio.""" + key = self.get_cache_key(text, voice, rate) + return self.cache.get(key) + + def put(self, text: str, voice: str, rate: int, audio_data): + """Store audio in cache with LRU eviction.""" + if len(self.cache) >= self.max_size: + # Evict oldest entry (simple FIFO, use OrderedDict for true LRU) + self.cache.pop(next(iter(self.cache))) + + key = self.get_cache_key(text, voice, rate) + 
self.cache[key] = audio_data +``` + +--- + +#### 6. Multi-Language Support +- **Automatic language detection:** Detect input language +- **Language-specific voice selection:** Match voice to detected language +- **Mixed-language support:** Handle multilingual texts + +**Implementation Complexity:** Medium +**User Value:** High for international users + +--- + +#### 7. Audio Effects +- **Reverb:** Add spatial audio effects +- **Echo:** Add echo effects +- **Speed adjustment:** Time-stretch without pitch change +- **Normalization:** Automatic volume leveling + +**Implementation Complexity:** Medium (requires audio processing library like `pydub` or `librosa`) +**User Value:** Medium (aesthetic enhancement) + +--- + +#### 8. Queue Priority System +- **Priority levels:** High, normal, low priority requests +- **Priority queues:** Separate queues for different priorities +- **Preemption:** Allow high-priority requests to interrupt low-priority + +**Implementation Complexity:** Medium +**User Value:** Medium for multi-tenant scenarios + +--- + +#### 9. Webhook Notifications +- **Completion webhooks:** Notify external service when TTS completes +- **Error webhooks:** Notify on TTS failures +- **Webhook retry logic:** Handle webhook delivery failures + +**Example Request:** +```json +{ + "message": "Hello world", + "webhook_url": "https://example.com/tts-complete" +} +``` + +**Implementation Complexity:** Low +**User Value:** High for integration scenarios + +--- + +#### 10. Authentication & Authorization +- **API key authentication:** Secure endpoint access +- **Rate limiting:** Per-user request limits +- **Usage quotas:** Daily/monthly request quotas +- **Multi-tenant support:** Isolated queues per user + +**Implementation Complexity:** High +**User Value:** High for shared/production deployments + +--- + +#### 11. 
Web Interface +- **Simple web UI:** Browser-based TTS interface +- **Queue visualization:** Real-time queue status display +- **Voice model management:** Upload/download voice models via UI +- **Settings configuration:** Web-based configuration editor + +**Implementation Complexity:** Medium +**User Value:** High for non-technical users + +--- + +#### 12. Docker Deployment +- **Dockerfile:** Container image for easy deployment +- **Docker Compose:** Multi-container setup with monitoring +- **Volume management:** Persistent voice model storage +- **Health check integration:** Container health monitoring + +**Sample Dockerfile:** + +```dockerfile +FROM python:3.11-slim + +# Install system dependencies (PortAudio for sounddevice) +RUN apt-get update && apt-get install -y \ + libportaudio2 \ + portaudio19-dev \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /app + +# Install Python dependencies +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# Copy application +COPY app/ ./app/ + +# Download default voice model +RUN python -c "from piper import PiperVoice; PiperVoice.download('en_US-lessac-medium')" + +EXPOSE 8888 + +CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8888"] +``` + +**Implementation Complexity:** Low +**User Value:** High for deployment consistency + +--- + +#### 13. Metrics & Monitoring +- **Prometheus metrics:** Request count, latency, queue size +- **Grafana dashboards:** Visual monitoring +- **Alerting:** Notify on errors, high queue size, etc. 
+- **Performance profiling:** Identify bottlenecks + +**Sample Metrics:** + +```python +from prometheus_client import Counter, Histogram, Gauge + +request_counter = Counter('tts_requests_total', 'Total TTS requests') +latency_histogram = Histogram('tts_latency_seconds', 'TTS latency') +queue_size_gauge = Gauge('tts_queue_size', 'Current queue size') + +@app.post("/notify") +async def notify(request: NotifyRequest): + request_counter.inc() + with latency_histogram.time(): + # Process request + ... + queue_size_gauge.set(tts_queue.qsize()) +``` + +**Implementation Complexity:** Medium +**User Value:** High for production deployments + +--- + +### Scalability Considerations + +**Horizontal Scaling:** +- Use Redis for shared queue across multiple server instances +- Implement distributed locking for audio device access +- Load balance requests across multiple servers + +**Vertical Scaling:** +- Increase queue size for higher throughput +- Use GPU acceleration for TTS (CUDA support in Piper) +- Optimize voice model loading (keep models in memory) + +**Architecture Evolution:** +- Separate TTS generation and audio playback into microservices +- Use message queue (RabbitMQ, Kafka) for request distribution +- Implement worker pool for parallel TTS generation + +--- + +## Appendix: References + +### Technical Documentation +- [FastAPI Official Documentation](https://fastapi.tiangolo.com/) +- [Piper TTS GitHub Repository](https://github.com/rhasspy/piper) +- [PyAudio Documentation](https://people.csail.mit.edu/hubert/pyaudio/docs/) +- [Uvicorn Documentation](https://www.uvicorn.org/) + +### Research & Comparisons +- [FastAPI vs Flask Performance Comparison - Strapi](https://strapi.io/blog/fastapi-vs-flask-python-framework-comparison) +- [Flask vs FastAPI - Better Stack](https://betterstack.com/community/guides/scaling-python/flask-vs-fastapi/) +- [Python TTS Engines Comparison - Smallest AI](https://smallest.ai/blog/python-packages-realistic-text-to-speech) +- [TTS 
Converters for Raspberry Pi - Circuit Digest](https://circuitdigest.com/microcontroller-projects/best-text-to-speech-tts-converter-for-raspberry-pi-espeak-festival-google-tts-pico-and-pyttsx3) +- [Piper TTS Tutorial - RMauro Dev](https://rmauro.dev/how-to-run-piper-tts-on-your-raspberry-pi-offline-voice-zero-internet-needed/) +- [Python Audio Playback - simpleaudio Docs](https://simpleaudio.readthedocs.io/) + +### Tools & Libraries +- [uv - Fast Python Package Manager](https://github.com/astral-sh/uv) +- [pytest - Testing Framework](https://docs.pytest.org/) +- [locust - Load Testing](https://locust.io/) + +--- + +## Document History + +| Version | Date | Author | Changes | +|---------|------|--------|---------| +| 1.0 | 2025-12-18 | Atlas | Initial PRD creation | + +--- + +**Document Status:** ✅ Complete - Ready for Implementation + +**Next Steps:** +1. Review PRD with stakeholders +2. Approve technical stack decisions +3. Begin Phase 1 implementation +4. Set up project tracking (GitHub Issues, Jira, etc.) +5. Assign development resources + +**Questions or Feedback:** Contact Atlas at [atlas@manticorum.com] diff --git a/PROJECT_ROADMAP.json b/PROJECT_ROADMAP.json new file mode 100644 index 0000000..3e397f3 --- /dev/null +++ b/PROJECT_ROADMAP.json @@ -0,0 +1,1012 @@ +{ + "project": { + "name": "voice-server", + "description": "Local HTTP service for text-to-speech playback", + "version": "1.0.0", + "created": "2025-12-18", + "last_updated": "2025-12-18" + }, + "methodology": { + "approach": "hybrid-tdd", + "description": "TDD for API contracts, validation, and queue logic. 
Implementation-first for hardware integrations.", + "tdd_components": [ + "request_validation", + "queue_behavior", + "error_responses", + "health_check_logic" + ], + "implementation_first_components": [ + "piper_tts_integration", + "sounddevice_playback", + "end_to_end_flow" + ] + }, + "phases": [ + { + "id": "phase_1", + "name": "Core Infrastructure", + "description": "Project setup, FastAPI skeleton, and configuration management", + "estimated_days": "1-2" + }, + { + "id": "phase_2", + "name": "TTS Integration", + "description": "Piper TTS setup, voice model management, and parameter support", + "estimated_days": "1-2" + }, + { + "id": "phase_3", + "name": "Audio Playback", + "description": "sounddevice integration and audio error handling", + "estimated_days": "1" + }, + { + "id": "phase_4", + "name": "Queue Management", + "description": "Async queue implementation and request processing pipeline", + "estimated_days": "1" + }, + { + "id": "phase_5", + "name": "Error Handling", + "description": "Exception handlers, logging infrastructure, and health monitoring", + "estimated_days": "1" + }, + { + "id": "phase_6", + "name": "Testing", + "description": "Unit tests, integration tests, performance tests, and system tests", + "estimated_days": "1-2" + }, + { + "id": "phase_7", + "name": "Documentation & Deployment", + "description": "README, systemd service, and production hardening", + "estimated_days": "1" + } + ], + "tasks": [ + { + "id": "1.1.1", + "phase": "phase_1", + "name": "Initialize project directory", + "description": "Create project directory structure at /mnt/NV2/Development/voice-server with app/, tests/, models/ subdirectories", + "dependencies": [], + "completed": false, + "tested": false, + "test_approach": null, + "notes": "Directory already exists from PRD creation" + }, + { + "id": "1.1.2", + "phase": "phase_1", + "name": "Create Python virtual environment", + "description": "Initialize virtual environment using uv (uv venv)", + "dependencies": 
["1.1.1"], + "completed": false, + "tested": false, + "test_approach": null, + "notes": "Use uv as per user preference" + }, + { + "id": "1.1.3", + "phase": "phase_1", + "name": "Install core dependencies", + "description": "Install fastapi, uvicorn[standard], piper-tts, sounddevice, numpy, pydantic, python-dotenv", + "dependencies": ["1.1.2"], + "completed": false, + "tested": false, + "test_approach": null, + "notes": "Also install pytest, pytest-asyncio, httpx for testing" + }, + { + "id": "1.1.4", + "phase": "phase_1", + "name": "Create pyproject.toml", + "description": "Create pyproject.toml with pinned dependency versions and project metadata", + "dependencies": ["1.1.3"], + "completed": false, + "tested": false, + "test_approach": null, + "notes": "Use uv's native pyproject.toml support instead of requirements.txt" + }, + { + "id": "1.1.5", + "phase": "phase_1", + "name": "Create environment configuration", + "description": "Create .env.example with all configurable environment variables", + "dependencies": ["1.1.1"], + "completed": false, + "tested": false, + "test_approach": null, + "notes": "Include HOST, PORT, QUEUE_SIZE, LOG_LEVEL, MODEL_DIR, DEFAULT_VOICE" + }, + { + "id": "1.1.6", + "phase": "phase_1", + "name": "Initialize git repository", + "description": "Initialize git repo with .gitignore for Python, IDEs, .env, voice models, __pycache__", + "dependencies": ["1.1.1"], + "completed": false, + "tested": false, + "test_approach": null, + "notes": "Add models/*.onnx and models/*.json to .gitignore (large files)" + }, + { + "id": "1.2.1", + "phase": "phase_1", + "name": "Write tests for Pydantic request/response models", + "description": "TDD: Write tests for NotifyRequest, NotifyResponse, HealthResponse, ErrorResponse models with validation rules", + "dependencies": ["1.1.4"], + "completed": false, + "tested": false, + "test_approach": "tdd", + "notes": "Test message length limits (1-10000), rate range (50-400), voice pattern validation" + }, + { + 
"id": "1.2.2", + "phase": "phase_1", + "name": "Implement Pydantic models", + "description": "Create app/models.py with NotifyRequest, NotifyResponse, HealthResponse, ErrorResponse models", + "dependencies": ["1.2.1"], + "completed": false, + "tested": true, + "test_approach": "tdd", + "notes": "Implementation to make tests from 1.2.1 pass" + }, + { + "id": "1.2.3", + "phase": "phase_1", + "name": "Create FastAPI application skeleton", + "description": "Create app/main.py with FastAPI app, lifespan handler, and CORS middleware", + "dependencies": ["1.2.2"], + "completed": false, + "tested": false, + "test_approach": null, + "notes": "Use lifespan context manager (not deprecated on_event)" + }, + { + "id": "1.2.4", + "phase": "phase_1", + "name": "Write tests for /notify endpoint contract", + "description": "TDD: Write tests for POST /notify - valid requests return 202, invalid return 422, missing fields return 400", + "dependencies": ["1.2.3"], + "completed": false, + "tested": false, + "test_approach": "tdd", + "notes": "Use httpx.AsyncClient for async endpoint testing" + }, + { + "id": "1.2.5", + "phase": "phase_1", + "name": "Implement /notify endpoint skeleton", + "description": "Create POST /notify endpoint that validates request and returns 202 (queue integration later)", + "dependencies": ["1.2.4"], + "completed": false, + "tested": true, + "test_approach": "tdd", + "notes": "Initially just validate and return success; queue integration in phase 4" + }, + { + "id": "1.2.6", + "phase": "phase_1", + "name": "Write tests for /health endpoint", + "description": "TDD: Write tests for GET /health - returns status, uptime, queue info structure", + "dependencies": ["1.2.3"], + "completed": false, + "tested": false, + "test_approach": "tdd", + "notes": "Test both healthy and unhealthy response structures" + }, + { + "id": "1.2.7", + "phase": "phase_1", + "name": "Implement /health endpoint skeleton", + "description": "Create GET /health endpoint returning basic 
health status", + "dependencies": ["1.2.6"], + "completed": false, + "tested": true, + "test_approach": "tdd", + "notes": "Full health checks (TTS, audio) added in phase 5" + }, + { + "id": "1.2.8", + "phase": "phase_1", + "name": "Implement /voices endpoint skeleton", + "description": "Create GET /voices endpoint returning list of available voice models", + "dependencies": ["1.2.3"], + "completed": false, + "tested": false, + "test_approach": null, + "notes": "Initially return empty list; populate after TTS integration" + }, + { + "id": "1.2.9", + "phase": "phase_1", + "name": "Configure JSON logging middleware", + "description": "Add structured JSON logging for all requests with timestamp, request_id, path, status_code", + "dependencies": ["1.2.3"], + "completed": false, + "tested": false, + "test_approach": null, + "notes": "Use Python's logging with JSON formatter" + }, + { + "id": "1.2.10", + "phase": "phase_1", + "name": "Verify server startup", + "description": "Test server starts successfully with uvicorn app.main:app --reload", + "dependencies": ["1.2.5", "1.2.7", "1.2.8"], + "completed": false, + "tested": false, + "test_approach": null, + "notes": "Verify /docs (Swagger UI) is accessible" + }, + { + "id": "1.3.1", + "phase": "phase_1", + "name": "Write tests for configuration loading", + "description": "TDD: Write tests for config loading from env vars with defaults, validation of values", + "dependencies": ["1.1.5"], + "completed": false, + "tested": false, + "test_approach": "tdd", + "notes": "Test default values, env var override, invalid value handling" + }, + { + "id": "1.3.2", + "phase": "phase_1", + "name": "Implement configuration module", + "description": "Create app/config.py with Settings class using pydantic-settings for env var loading", + "dependencies": ["1.3.1"], + "completed": false, + "tested": true, + "test_approach": "tdd", + "notes": "Include host, port, queue_size, log_level, model_dir, default_voice settings" + }, + { + "id": 
"1.3.3", + "phase": "phase_1", + "name": "Add CLI argument parsing", + "description": "Add CLI argument support for --host, --port, --log-level to override env vars", + "dependencies": ["1.3.2"], + "completed": false, + "tested": false, + "test_approach": null, + "notes": "Use argparse or typer; CLI args take precedence over env vars" + }, + { + "id": "2.1.1", + "phase": "phase_2", + "name": "Create TTS engine module structure", + "description": "Create app/tts_engine.py with TTSEngine abstract base and PiperTTSEngine class skeleton", + "dependencies": ["1.2.10"], + "completed": false, + "tested": false, + "test_approach": null, + "notes": "Abstract base allows for future TTS engine swapping" + }, + { + "id": "2.1.2", + "phase": "phase_2", + "name": "Download default voice model", + "description": "Download en_US-lessac-medium.onnx and .json to models/ directory", + "dependencies": ["1.1.1"], + "completed": false, + "tested": false, + "test_approach": null, + "notes": "Can use piper --download-dir or manual download from GitHub releases" + }, + { + "id": "2.1.3", + "phase": "phase_2", + "name": "Implement Piper TTS voice loading", + "description": "Implement PiperTTSEngine.load_voice() to load .onnx model with caching", + "dependencies": ["2.1.1", "2.1.2"], + "completed": false, + "tested": false, + "test_approach": "implementation_first", + "notes": "Cache loaded models in memory to avoid reload on each request" + }, + { + "id": "2.1.4", + "phase": "phase_2", + "name": "Implement text-to-audio synthesis", + "description": "Implement PiperTTSEngine.synthesize() returning NumPy array of audio samples", + "dependencies": ["2.1.3"], + "completed": false, + "tested": false, + "test_approach": "implementation_first", + "notes": "Return float32 NumPy array compatible with sounddevice" + }, + { + "id": "2.1.5", + "phase": "phase_2", + "name": "Write integration tests for TTS synthesis", + "description": "Write tests verifying TTS generates valid audio array for sample 
text", + "dependencies": ["2.1.4"], + "completed": false, + "tested": false, + "test_approach": "implementation_first", + "notes": "Test output is non-empty NumPy array with expected sample rate" + }, + { + "id": "2.1.6", + "phase": "phase_2", + "name": "Measure TTS latency benchmarks", + "description": "Benchmark TTS generation time for various text lengths (10, 100, 500, 1000, 5000 chars)", + "dependencies": ["2.1.5"], + "completed": false, + "tested": false, + "test_approach": null, + "notes": "Document results for performance baseline" + }, + { + "id": "2.2.1", + "phase": "phase_2", + "name": "Create models directory structure", + "description": "Create models/ directory for voice model storage with README explaining model installation", + "dependencies": ["1.1.1"], + "completed": false, + "tested": false, + "test_approach": null, + "notes": "Include instructions for downloading additional voices" + }, + { + "id": "2.2.2", + "phase": "phase_2", + "name": "Implement voice model discovery", + "description": "Implement function to scan models/ directory and return available voice models", + "dependencies": ["2.2.1", "2.1.3"], + "completed": false, + "tested": false, + "test_approach": null, + "notes": "Parse .json config files for model metadata (language, quality)" + }, + { + "id": "2.2.3", + "phase": "phase_2", + "name": "Implement /voices endpoint fully", + "description": "Update /voices endpoint to return discovered models with metadata", + "dependencies": ["2.2.2", "1.2.8"], + "completed": false, + "tested": false, + "test_approach": null, + "notes": "Include name, language, quality, size_mb, installed status" + }, + { + "id": "2.2.4", + "phase": "phase_2", + "name": "Add voice validation to /notify", + "description": "Validate requested voice exists before queuing; return 422 if not found", + "dependencies": ["2.2.2", "1.2.5"], + "completed": false, + "tested": false, + "test_approach": null, + "notes": "Provide helpful error message listing available 
voices" + }, + { + "id": "2.3.1", + "phase": "phase_2", + "name": "Implement speech rate adjustment", + "description": "Add rate parameter support to TTS synthesis (50-400 WPM range)", + "dependencies": ["2.1.4"], + "completed": false, + "tested": false, + "test_approach": null, + "notes": "Check if Piper supports rate adjustment natively or needs post-processing" + }, + { + "id": "2.3.2", + "phase": "phase_2", + "name": "Test rate adjustment across range", + "description": "Test TTS output at rate=50, 100, 170 (default), 300, 400 WPM", + "dependencies": ["2.3.1"], + "completed": false, + "tested": false, + "test_approach": null, + "notes": "Verify audio sounds correct at extremes" + }, + { + "id": "2.3.3", + "phase": "phase_2", + "name": "Implement voice_enabled flag", + "description": "Add voice_enabled parameter to skip TTS for debugging/testing", + "dependencies": ["2.1.4"], + "completed": false, + "tested": false, + "test_approach": null, + "notes": "When false, skip TTS and audio playback but still process request" + }, + { + "id": "3.1.1", + "phase": "phase_3", + "name": "Create audio player module", + "description": "Create app/audio_player.py with AudioPlayer class skeleton", + "dependencies": ["1.2.10"], + "completed": false, + "tested": false, + "test_approach": null, + "notes": "Use sounddevice for non-blocking playback" + }, + { + "id": "3.1.2", + "phase": "phase_3", + "name": "Implement audio device verification", + "description": "Implement verify_audio_devices() to check for available output devices at startup", + "dependencies": ["3.1.1"], + "completed": false, + "tested": false, + "test_approach": "implementation_first", + "notes": "Use sd.query_devices() to enumerate devices" + }, + { + "id": "3.1.3", + "phase": "phase_3", + "name": "Implement non-blocking playback", + "description": "Implement AudioPlayer.play() using sd.play() for non-blocking audio output", + "dependencies": ["3.1.2"], + "completed": false, + "tested": false, + 
"test_approach": "implementation_first", + "notes": "sd.play() returns immediately; audio plays in background thread" + }, + { + "id": "3.1.4", + "phase": "phase_3", + "name": "Implement async wait method", + "description": "Implement AudioPlayer.wait_async() for async-friendly waiting on playback completion", + "dependencies": ["3.1.3"], + "completed": false, + "tested": false, + "test_approach": "implementation_first", + "notes": "Poll sd.get_stream().active with asyncio.sleep()" + }, + { + "id": "3.1.5", + "phase": "phase_3", + "name": "Test audio playback with sample data", + "description": "Test AudioPlayer with synthesized sine wave to verify audio output works", + "dependencies": ["3.1.4"], + "completed": false, + "tested": false, + "test_approach": "implementation_first", + "notes": "Use numpy to generate test tone; verify sound is heard" + }, + { + "id": "3.1.6", + "phase": "phase_3", + "name": "Verify non-blocking behavior", + "description": "Test that play() returns immediately and server can handle requests during playback", + "dependencies": ["3.1.5"], + "completed": false, + "tested": false, + "test_approach": null, + "notes": "Send request, verify 202 returned before audio finishes" + }, + { + "id": "3.2.1", + "phase": "phase_3", + "name": "Implement retry logic for device failures", + "description": "Implement RobustAudioPlayer with automatic retry on sd.PortAudioError", + "dependencies": ["3.1.4"], + "completed": false, + "tested": false, + "test_approach": null, + "notes": "Retry up to 3 times with 0.5s delay between attempts" + }, + { + "id": "3.2.2", + "phase": "phase_3", + "name": "Handle device disconnection", + "description": "Gracefully handle audio device disconnection during playback", + "dependencies": ["3.2.1"], + "completed": false, + "tested": false, + "test_approach": null, + "notes": "Log error, skip playback, continue processing queue" + }, + { + "id": "3.2.3", + "phase": "phase_3", + "name": "Implement audio diagnostics", + 
"description": "Implement get_audio_diagnostics() for health check reporting", + "dependencies": ["3.1.2"], + "completed": false, + "tested": false, + "test_approach": null, + "notes": "Return device count, default output name, sample rate" + }, + { + "id": "3.2.4", + "phase": "phase_3", + "name": "Add audio error logging", + "description": "Add detailed logging for all audio errors with device context", + "dependencies": ["3.2.2"], + "completed": false, + "tested": false, + "test_approach": null, + "notes": "Include device name, error type, retry count in logs" + }, + { + "id": "4.1.1", + "phase": "phase_4", + "name": "Write tests for queue behavior", + "description": "TDD: Write tests for queue enqueue, dequeue, overflow, ordering (FIFO)", + "dependencies": ["1.2.10"], + "completed": false, + "tested": false, + "test_approach": "tdd", + "notes": "Test max size enforcement, QueueFullError raising" + }, + { + "id": "4.1.2", + "phase": "phase_4", + "name": "Create queue manager module", + "description": "Create app/queue_manager.py with TTSQueue class using asyncio.Queue", + "dependencies": ["4.1.1"], + "completed": false, + "tested": true, + "test_approach": "tdd", + "notes": "Implement to pass tests from 4.1.1" + }, + { + "id": "4.1.3", + "phase": "phase_4", + "name": "Implement queue enqueue with timeout", + "description": "Implement TTSQueue.enqueue() with 1s timeout, raising QueueFullError on timeout", + "dependencies": ["4.1.2"], + "completed": false, + "tested": true, + "test_approach": "tdd", + "notes": "Return queue position on success" + }, + { + "id": "4.1.4", + "phase": "phase_4", + "name": "Implement queue metrics", + "description": "Add stats tracking: processed count, error count, current size", + "dependencies": ["4.1.2"], + "completed": false, + "tested": false, + "test_approach": null, + "notes": "Expose via TTSQueue.stats property" + }, + { + "id": "4.1.5", + "phase": "phase_4", + "name": "Implement graceful queue shutdown", + "description": 
"Implement TTSQueue.shutdown() to wait for current item, reject new items", + "dependencies": ["4.1.2"], + "completed": false, + "tested": false, + "test_approach": null, + "notes": "Called during application shutdown via lifespan" + }, + { + "id": "4.2.1", + "phase": "phase_4", + "name": "Implement background queue processor", + "description": "Create async background task to process queue items sequentially", + "dependencies": ["4.1.2", "2.1.4", "3.1.4"], + "completed": false, + "tested": false, + "test_approach": null, + "notes": "Generate TTS, play audio, wait for completion, then next item" + }, + { + "id": "4.2.2", + "phase": "phase_4", + "name": "Integrate queue with /notify endpoint", + "description": "Update /notify to enqueue validated requests to TTSQueue", + "dependencies": ["4.2.1", "1.2.5"], + "completed": false, + "tested": false, + "test_approach": null, + "notes": "Return queue_position in response" + }, + { + "id": "4.2.3", + "phase": "phase_4", + "name": "Add request timeout handling", + "description": "Add 60s timeout for individual request processing in queue worker", + "dependencies": ["4.2.1"], + "completed": false, + "tested": false, + "test_approach": null, + "notes": "Cancel TTS generation if exceeds timeout; log and continue" + }, + { + "id": "4.2.4", + "phase": "phase_4", + "name": "Implement CPU-bound TTS in thread pool", + "description": "Run TTS synthesis in thread pool executor to avoid blocking event loop", + "dependencies": ["4.2.1"], + "completed": false, + "tested": false, + "test_approach": null, + "notes": "Use loop.run_in_executor() for TTS generation" + }, + { + "id": "4.2.5", + "phase": "phase_4", + "name": "Test queue with concurrent requests", + "description": "Test sending 20+ concurrent requests and verify sequential playback", + "dependencies": ["4.2.2"], + "completed": false, + "tested": false, + "test_approach": null, + "notes": "All requests should be processed in order received" + }, + { + "id": "4.3.1", + "phase": 
"phase_4", + "name": "Add queue status to /health", + "description": "Update /health to include queue size, capacity, utilization percentage", + "dependencies": ["4.1.4", "1.2.7"], + "completed": false, + "tested": false, + "test_approach": null, + "notes": "Include processed and error counts" + }, + { + "id": "4.3.2", + "phase": "phase_4", + "name": "Add queue event logging", + "description": "Log queue events: enqueue, process start, process complete, errors", + "dependencies": ["4.2.1"], + "completed": false, + "tested": false, + "test_approach": null, + "notes": "Include request_id for correlation" + }, + { + "id": "4.3.3", + "phase": "phase_4", + "name": "Test queue overflow scenarios", + "description": "Test behavior when queue reaches max size; verify 503 returned", + "dependencies": ["4.2.2"], + "completed": false, + "tested": false, + "test_approach": null, + "notes": "Fill queue with slow requests, send additional request" + }, + { + "id": "5.1.1", + "phase": "phase_5", + "name": "Write tests for error responses", + "description": "TDD: Write tests for all error response formats (400, 422, 500, 503)", + "dependencies": ["1.2.2"], + "completed": false, + "tested": false, + "test_approach": "tdd", + "notes": "Verify error response structure matches API spec" + }, + { + "id": "5.1.2", + "phase": "phase_5", + "name": "Create custom exception classes", + "description": "Create app/exceptions.py with QueueFullError, TTSEngineError, AudioPlaybackError", + "dependencies": ["5.1.1"], + "completed": false, + "tested": true, + "test_approach": "tdd", + "notes": "Include error codes and details for each exception type" + }, + { + "id": "5.1.3", + "phase": "phase_5", + "name": "Implement exception handlers", + "description": "Add FastAPI exception handlers for each custom exception type", + "dependencies": ["5.1.2"], + "completed": false, + "tested": true, + "test_approach": "tdd", + "notes": "Map exceptions to appropriate HTTP status codes" + }, + { + "id": "5.1.4", 
+ "phase": "phase_5", + "name": "Implement generic exception handler", + "description": "Add catch-all exception handler for unexpected errors (500)", + "dependencies": ["5.1.3"], + "completed": false, + "tested": false, + "test_approach": null, + "notes": "Log full traceback but return sanitized error to client" + }, + { + "id": "5.2.1", + "phase": "phase_5", + "name": "Configure structured JSON logging", + "description": "Set up logging with JSON formatter including timestamp, level, message, context", + "dependencies": ["1.2.9"], + "completed": false, + "tested": false, + "test_approach": null, + "notes": "Use python-json-logger or custom formatter" + }, + { + "id": "5.2.2", + "phase": "phase_5", + "name": "Implement rotating file handler", + "description": "Configure RotatingFileHandler with 10MB max size, 5 backups", + "dependencies": ["5.2.1"], + "completed": false, + "tested": false, + "test_approach": null, + "notes": "Write to voice-server.log in project directory" + }, + { + "id": "5.2.3", + "phase": "phase_5", + "name": "Add request ID tracking", + "description": "Generate unique request_id for each request; include in all related logs", + "dependencies": ["5.2.1"], + "completed": false, + "tested": false, + "test_approach": null, + "notes": "Use UUID4; add to response headers for client correlation" + }, + { + "id": "5.2.4", + "phase": "phase_5", + "name": "Test log rotation", + "description": "Verify log files rotate correctly when size limit reached", + "dependencies": ["5.2.2"], + "completed": false, + "tested": false, + "test_approach": null, + "notes": "Generate enough log entries to trigger rotation" + }, + { + "id": "5.3.1", + "phase": "phase_5", + "name": "Implement comprehensive /health endpoint", + "description": "Update /health with TTS engine, audio system, queue, and system resource checks", + "dependencies": ["4.3.1", "3.2.3"], + "completed": false, + "tested": false, + "test_approach": null, + "notes": "Return 200 if all healthy, 503 if 
any component unhealthy" + }, + { + "id": "5.3.2", + "phase": "phase_5", + "name": "Add TTS engine health check", + "description": "Test TTS engine can synthesize short test phrase", + "dependencies": ["2.1.4"], + "completed": false, + "tested": false, + "test_approach": null, + "notes": "Use minimal text to minimize overhead" + }, + { + "id": "5.3.3", + "phase": "phase_5", + "name": "Add system resource monitoring", + "description": "Include CPU and memory usage in health check response", + "dependencies": ["5.3.1"], + "completed": false, + "tested": false, + "test_approach": null, + "notes": "Use psutil for system metrics" + }, + { + "id": "5.3.4", + "phase": "phase_5", + "name": "Test health endpoint under load", + "description": "Verify /health responds correctly while queue is processing", + "dependencies": ["5.3.1"], + "completed": false, + "tested": false, + "test_approach": null, + "notes": "Health check should not block or be blocked by TTS processing" + }, + { + "id": "6.1.1", + "phase": "phase_6", + "name": "Set up pytest infrastructure", + "description": "Create tests/ directory with conftest.py, pytest.ini, test fixtures", + "dependencies": ["1.1.4"], + "completed": false, + "tested": false, + "test_approach": null, + "notes": "Include async fixtures for FastAPI testing" + }, + { + "id": "6.1.2", + "phase": "phase_6", + "name": "Write remaining unit tests", + "description": "Complete any missing unit tests to achieve 80%+ coverage", + "dependencies": ["6.1.1", "5.1.3"], + "completed": false, + "tested": false, + "test_approach": null, + "notes": "Use pytest-cov for coverage reporting" + }, + { + "id": "6.2.1", + "phase": "phase_6", + "name": "Write end-to-end integration tests", + "description": "Test complete request flow from HTTP request to audio playback", + "dependencies": ["4.2.5"], + "completed": false, + "tested": false, + "test_approach": null, + "notes": "May need to mock audio output for CI environments" + }, + { + "id": "6.2.2", + "phase": 
"phase_6", + "name": "Write error scenario tests", + "description": "Test TTS failure, audio failure, queue overflow scenarios end-to-end", + "dependencies": ["6.2.1"], + "completed": false, + "tested": false, + "test_approach": null, + "notes": "Use dependency injection to simulate failures" + }, + { + "id": "6.3.1", + "phase": "phase_6", + "name": "Create load testing script", + "description": "Create load test using locust or wrk for performance benchmarking", + "dependencies": ["4.2.5"], + "completed": false, + "tested": false, + "test_approach": null, + "notes": "Target: 50+ req/s, <50ms API response time" + }, + { + "id": "6.3.2", + "phase": "phase_6", + "name": "Measure performance benchmarks", + "description": "Record p50, p95, p99 latency; TTS generation time; memory usage", + "dependencies": ["6.3.1"], + "completed": false, + "tested": false, + "test_approach": null, + "notes": "Document baseline performance for future comparison" + }, + { + "id": "6.4.1", + "phase": "phase_6", + "name": "Test on target environment", + "description": "Run full test suite on Nobara/Fedora 42 with real audio hardware", + "dependencies": ["6.2.2"], + "completed": false, + "tested": false, + "test_approach": null, + "notes": "Test with both PulseAudio and ALSA if possible" + }, + { + "id": "6.4.2", + "phase": "phase_6", + "name": "Test long-running stability", + "description": "Run server for 24+ hours with periodic requests; check for memory leaks", + "dependencies": ["6.4.1"], + "completed": false, + "tested": false, + "test_approach": null, + "notes": "Monitor memory usage over time" + }, + { + "id": "7.1.1", + "phase": "phase_7", + "name": "Create comprehensive README", + "description": "Write README.md with overview, installation, configuration, usage, API docs, troubleshooting", + "dependencies": ["6.4.1"], + "completed": false, + "tested": false, + "test_approach": null, + "notes": "Include example curl commands and Python client code" + }, + { + "id": "7.1.2", + 
"phase": "phase_7", + "name": "Document voice model installation", + "description": "Create guide for downloading and installing additional Piper voice models", + "dependencies": ["2.2.1"], + "completed": false, + "tested": false, + "test_approach": null, + "notes": "Link to Piper voice model repository" + }, + { + "id": "7.1.3", + "phase": "phase_7", + "name": "Create example client scripts", + "description": "Create examples/ directory with curl, Python, and JavaScript client examples", + "dependencies": ["7.1.1"], + "completed": false, + "tested": false, + "test_approach": null, + "notes": "Include async Python example using httpx" + }, + { + "id": "7.2.1", + "phase": "phase_7", + "name": "Create systemd service file", + "description": "Create voice-server.service for systemd deployment", + "dependencies": ["6.4.1"], + "completed": false, + "tested": false, + "test_approach": null, + "notes": "Include restart on failure, proper user/group, working directory" + }, + { + "id": "7.2.2", + "phase": "phase_7", + "name": "Test systemd service", + "description": "Test service installation, start, stop, restart, and auto-restart on failure", + "dependencies": ["7.2.1"], + "completed": false, + "tested": false, + "test_approach": null, + "notes": "Document installation steps in README" + }, + { + "id": "7.2.3", + "phase": "phase_7", + "name": "Create deployment script", + "description": "Create deploy.sh script for automated deployment", + "dependencies": ["7.2.1"], + "completed": false, + "tested": false, + "test_approach": null, + "notes": "Include venv setup, dependency install, service restart" + }, + { + "id": "7.3.1", + "phase": "phase_7", + "name": "Configure production logging", + "description": "Set up production log levels (INFO default, DEBUG disabled)", + "dependencies": ["5.2.2"], + "completed": false, + "tested": false, + "test_approach": null, + "notes": "Configurable via LOG_LEVEL environment variable" + }, + { + "id": "7.3.2", + "phase": "phase_7", + 
"name": "Implement graceful shutdown", + "description": "Handle SIGTERM/SIGINT for graceful shutdown with queue draining", + "dependencies": ["4.1.5"], + "completed": false, + "tested": false, + "test_approach": null, + "notes": "Wait for current playback to complete before exit" + }, + { + "id": "7.3.3", + "phase": "phase_7", + "name": "Security audit", + "description": "Review input sanitization, resource limits, error message exposure", + "dependencies": ["6.2.2"], + "completed": false, + "tested": false, + "test_approach": null, + "notes": "Verify no internal details leaked in error responses" + }, + { + "id": "7.3.4", + "phase": "phase_7", + "name": "Performance tuning", + "description": "Tune queue size, worker count, timeouts based on benchmark results", + "dependencies": ["6.3.2"], + "completed": false, + "tested": false, + "test_approach": null, + "notes": "Document recommended settings for different use cases" + } + ], + "summary": { + "total_tasks": 78, + "tdd_tasks": 12, + "implementation_first_tasks": 8, + "phases": 7, + "estimated_days": "4-7" + } +} diff --git a/README.md b/README.md new file mode 100644 index 0000000..7375bca --- /dev/null +++ b/README.md @@ -0,0 +1,41 @@ +# Voice Server + +Local HTTP service for text-to-speech playback using Piper TTS. 
+ +## Features + +- HTTP POST endpoint for text-to-speech requests +- High-quality neural TTS using Piper +- Non-blocking audio playback with sounddevice +- Async request queue for concurrent handling +- Automatic OpenAPI documentation + +## Quick Start + +```bash +# Install dependencies +uv pip install -e ".[dev]" + +# Run server +uvicorn app.main:app --reload + +# Test endpoint +curl -X POST http://localhost:8000/notify \ + -H "Content-Type: application/json" \ + -d '{"message": "Hello, world!"}' +``` + +## API Endpoints + +- `POST /notify` - Submit text for TTS playback +- `GET /health` - Health check endpoint +- `GET /voices` - List available voice models +- `GET /docs` - OpenAPI documentation + +## Configuration + +See `.env.example` for configuration options. + +## License + +MIT diff --git a/app/__init__.py b/app/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app/audio_player.py b/app/audio_player.py new file mode 100644 index 0000000..0551377 --- /dev/null +++ b/app/audio_player.py @@ -0,0 +1,192 @@ +""" +Audio playback module for voice-server. + +Provides non-blocking audio playback using sounddevice. +""" + +import asyncio +import logging +import time +from typing import Protocol + +import numpy as np + +logger = logging.getLogger(__name__) + + +class AudioPlayer(Protocol): + """Protocol defining the audio player interface.""" + + def play(self, audio_data: np.ndarray, sample_rate: int) -> None: + """Play audio data (non-blocking).""" + ... + + def is_playing(self) -> bool: + """Check if audio is currently playing.""" + ... + + def stop(self) -> None: + """Stop current playback.""" + ... + + async def wait_async(self) -> None: + """Wait asynchronously for playback to complete.""" + ... + + +class SounddevicePlayer: + """ + Audio player implementation using sounddevice. + + Provides non-blocking playback with async wait support. 
+ """ + + def __init__(self, default_sample_rate: int = 22050, retry_attempts: int = 3): + """ + Initialize the audio player. + + Args: + default_sample_rate: Default sample rate if not specified in play() + retry_attempts: Number of retry attempts on playback failure + """ + self.default_sample_rate = default_sample_rate + self.retry_attempts = retry_attempts + self._initialized = False + + # Lazy import sounddevice to defer PortAudio initialization + self._sd = None + + def _ensure_initialized(self): + """Ensure sounddevice is imported and initialized.""" + if self._sd is None: + try: + import sounddevice as sd + + self._sd = sd + self._initialized = True + logger.info("SounddevicePlayer initialized successfully") + except OSError as e: + logger.error(f"Failed to initialize sounddevice: {e}") + raise RuntimeError(f"Audio system unavailable: {e}") from e + + def play(self, audio_data: np.ndarray, sample_rate: int | None = None) -> None: + """ + Play audio data (non-blocking). + + The audio plays in a background thread. Use is_playing() to check status + or wait_async() to wait for completion. 
+ + Args: + audio_data: NumPy array of audio samples (float32 or int16) + sample_rate: Sample rate in Hz (uses default if not specified) + """ + self._ensure_initialized() + + if len(audio_data) == 0: + logger.debug("Skipping playback of empty audio") + return + + rate = sample_rate or self.default_sample_rate + + # Stop any currently playing audio + self.stop() + + for attempt in range(self.retry_attempts): + try: + # Play audio - returns immediately, audio plays in background + self._sd.play(audio_data, rate) + logger.debug(f"Started playback: {len(audio_data)} samples at {rate}Hz") + return + except self._sd.PortAudioError as e: + logger.warning(f"Playback attempt {attempt + 1} failed: {e}") + if attempt < self.retry_attempts - 1: + time.sleep(0.5) + else: + raise RuntimeError(f"Audio playback failed after {self.retry_attempts} attempts: {e}") + + def is_playing(self) -> bool: + """Check if audio is currently playing.""" + if self._sd is None: + return False + + try: + stream = self._sd.get_stream() + return stream is not None and stream.active + except Exception: + return False + + def stop(self) -> None: + """Stop current playback.""" + if self._sd is not None: + try: + self._sd.stop() + except Exception as e: + logger.warning(f"Error stopping playback: {e}") + + def wait(self) -> None: + """Block until current playback completes.""" + if self._sd is not None: + try: + self._sd.wait() + except Exception as e: + logger.warning(f"Error waiting for playback: {e}") + + async def wait_async(self, poll_interval: float = 0.05) -> None: + """ + Wait asynchronously for playback to complete. + + Uses polling to avoid blocking the event loop. + + Args: + poll_interval: How often to check playback status (seconds) + """ + while self.is_playing(): + await asyncio.sleep(poll_interval) + + def get_diagnostics(self) -> dict: + """ + Get audio system diagnostics for health checks. 
+ + Returns: + Dictionary with audio device information and status + """ + try: + self._ensure_initialized() + + devices = self._sd.query_devices() + output_devices = [d for d in devices if d["max_output_channels"] > 0] + + if not output_devices: + return { + "status": "unavailable", + "error": "No audio output devices found", + } + + default_output = self._sd.query_devices(kind="output") + + return { + "status": "available", + "device_count": len(output_devices), + "default_output": default_output["name"], + "default_sample_rate": int(default_output["default_samplerate"]), + } + + except Exception as e: + return { + "status": "unavailable", + "error": str(e), + } + + def health_check(self) -> dict: + """ + Perform a health check on the audio system. + + Returns: + Dictionary with status and any error messages + """ + diagnostics = self.get_diagnostics() + + if diagnostics["status"] == "available": + return {"status": "healthy", "details": diagnostics} + else: + return {"status": "unhealthy", "error": diagnostics.get("error", "Unknown error")} diff --git a/app/config.py b/app/config.py new file mode 100644 index 0000000..4b488ea --- /dev/null +++ b/app/config.py @@ -0,0 +1,98 @@ +""" +Configuration management for voice-server. + +Loads configuration from environment variables with sensible defaults. +Uses pydantic-settings for type-safe configuration loading and validation. +""" + +from functools import lru_cache +from typing import Annotated, Literal + +from pydantic import Field, field_validator +from pydantic_settings import BaseSettings, SettingsConfigDict + + +# Valid log levels +LogLevel = Literal["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"] + + +class Settings(BaseSettings): + """ + Application settings loaded from environment variables. + + All settings have sensible defaults and can be overridden via environment + variables or a .env file. 
+ """ + + model_config = SettingsConfigDict( + env_file=".env", + env_file_encoding="utf-8", + case_sensitive=False, + extra="ignore", + ) + + # Server settings + host: Annotated[str, Field(default="0.0.0.0", description="Host to bind to")] + port: Annotated[ + int, + Field(default=8888, ge=1, le=65535, description="Port to listen on"), + ] + + # TTS settings + model_dir: Annotated[ + str, + Field(default="./models", description="Directory containing voice models"), + ] + default_voice: Annotated[ + str, + Field(default="en_US-lessac-medium", description="Default voice model"), + ] + default_rate: Annotated[ + int, + Field(default=170, ge=50, le=400, description="Default speech rate (WPM)"), + ] + + # Queue settings + queue_max_size: Annotated[ + int, + Field(default=50, gt=0, description="Maximum TTS queue size"), + ] + request_timeout_seconds: Annotated[ + int, + Field(default=60, gt=0, description="Request processing timeout"), + ] + + # Logging + log_level: Annotated[ + LogLevel, + Field(default="INFO", description="Logging level"), + ] + log_file: Annotated[ + str | None, + Field(default=None, description="Log file path (None for stdout only)"), + ] + + # Debug + voice_enabled: Annotated[ + bool, + Field(default=True, description="Enable/disable TTS playback"), + ] + + @field_validator("log_level", mode="before") + @classmethod + def uppercase_log_level(cls, v: str) -> str: + """Ensure log level is uppercase.""" + if isinstance(v, str): + return v.upper() + return v + + +@lru_cache +def get_settings() -> Settings: + """ + Get cached application settings. + + Returns the same Settings instance on subsequent calls for efficiency. + The cache can be cleared by calling get_settings.cache_clear(). + """ + return Settings() diff --git a/app/main.py b/app/main.py new file mode 100644 index 0000000..4ebc8b5 --- /dev/null +++ b/app/main.py @@ -0,0 +1,140 @@ +""" +Voice Server - Local HTTP service for text-to-speech playback. 
import logging
import time
from contextlib import asynccontextmanager

from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware

from app.audio_player import SounddevicePlayer
from app.config import get_settings
from app.queue_manager import TTSQueueManager
from app.tts_engine import PiperTTSEngine

logger = logging.getLogger(__name__)

# Track server start time for uptime calculation
_start_time: float = 0.0

# Global instances (initialized in lifespan)
tts_engine: PiperTTSEngine | None = None
audio_player: SounddevicePlayer | None = None
queue_manager: TTSQueueManager | None = None


@asynccontextmanager
async def lifespan(app: FastAPI):
    """
    Application lifespan handler.

    Handles startup and shutdown events:
    - Startup: build the TTS engine, audio player and queue manager, store
      them in the module-level globals and start the queue processor.
    - Shutdown: stop the queue processor, then stop audio playback.

    The shutdown steps run in a ``finally`` block so they still execute when
    an exception is thrown into the context manager at the ``yield``.
    """
    global _start_time, tts_engine, audio_player, queue_manager

    settings = get_settings()
    _start_time = time.time()

    # Initialize TTS engine
    logger.info("Initializing TTS engine...")
    tts_engine = PiperTTSEngine(
        model_dir=settings.model_dir,
        default_voice=settings.default_voice,
    )

    # Initialize audio player
    logger.info("Initializing audio player...")
    audio_player = SounddevicePlayer(
        default_sample_rate=tts_engine.get_sample_rate(),
    )

    # Initialize and start queue manager
    logger.info("Starting queue manager...")
    queue_manager = TTSQueueManager(
        tts_engine=tts_engine,
        audio_player=audio_player,
        max_size=settings.queue_max_size,
        request_timeout=settings.request_timeout_seconds,
    )
    await queue_manager.start()

    logger.info("Voice server started successfully")

    try:
        yield
    finally:
        # Shutdown cleanup
        logger.info("Shutting down voice server...")
        if queue_manager:
            await queue_manager.stop()
        if audio_player:
            audio_player.stop()
        logger.info("Voice server stopped")


def create_app() -> FastAPI:
    """
    Create and configure the FastAPI application.

    Returns a configured FastAPI instance with all routes and middleware.
    """
    # Called for its side effect only: builds (and caches) Settings now, so
    # an invalid configuration still surfaces at app creation time.
    get_settings()

    app = FastAPI(
        title="Voice Server",
        description="Local HTTP service for text-to-speech playback using Piper TTS",
        version="1.0.0",
        lifespan=lifespan,
    )

    # Configure CORS. Wildcard origins must not be combined with
    # allow_credentials=True; the API defines no cookie/auth based access,
    # so credentials are disabled while any origin may still call it.
    app.add_middleware(
        CORSMiddleware,
        allow_origins=["*"],
        allow_credentials=False,
        allow_methods=["*"],
        allow_headers=["*"],
    )

    # Register routes (imported here because app.routes imports this module
    # lazily to reach the global component instances).
    from app.routes import router

    app.include_router(router)

    return app


def get_uptime_seconds() -> int:
    """Get server uptime in whole seconds (0 until lifespan startup has run)."""
    if _start_time == 0.0:
        return 0
    return int(time.time() - _start_time)


# Create the application instance
app = create_app()


def run(reload: bool = True):
    """
    Run the server using uvicorn (for CLI entry point).

    Args:
        reload: Passed to uvicorn's ``reload`` option. Defaults to True to
            keep the existing behaviour; pass False for service deployments.
    """
    import uvicorn

    settings = get_settings()
    uvicorn.run(
        "app.main:app",
        host=settings.host,
        port=settings.port,
        reload=reload,
    )


if __name__ == "__main__":
    run()
+ """ + + message: Annotated[ + str, + Field( + min_length=1, + max_length=10000, + description="Text to convert to speech (1-10000 characters)", + ), + ] + voice: Annotated[ + str, + Field( + default="en_US-lessac-medium", + pattern=r"^[\w-]+$", + description="Piper voice model name", + ), + ] + rate: Annotated[ + int, + Field( + default=170, + ge=50, + le=400, + description="Speech rate in words per minute (50-400)", + ), + ] + voice_enabled: Annotated[ + bool, + Field( + default=True, + description="Enable/disable TTS playback (for debugging)", + ), + ] + + @field_validator("message", mode="before") + @classmethod + def strip_message_whitespace(cls, v: str) -> str: + """Strip leading and trailing whitespace from message.""" + if isinstance(v, str): + return v.strip() + return v + + +class NotifyResponse(BaseModel): + """ + Response model for successful POST /notify requests. + + Returned when a TTS request is successfully queued for processing. + """ + + status: Annotated[str, Field(description="Request status (e.g., 'queued')")] + message_length: Annotated[int, Field(description="Length of the message in characters")] + queue_position: Annotated[int, Field(description="Position in the TTS queue")] + voice_model: Annotated[str, Field(description="Voice model being used")] + estimated_duration: Annotated[ + float | None, + Field(default=None, description="Estimated playback duration in seconds"), + ] + + +class QueueStatus(BaseModel): + """Queue status information for health checks.""" + + size: Annotated[int, Field(description="Current number of items in queue")] + capacity: Annotated[int, Field(description="Maximum queue capacity")] + utilization: Annotated[float, Field(description="Queue utilization percentage")] + + +class HealthResponse(BaseModel): + """ + Response model for GET /health endpoint. + + Provides comprehensive health status including TTS engine, audio, and queue status. 
+ """ + + status: Annotated[str, Field(description="Overall health status ('healthy' or 'unhealthy')")] + uptime_seconds: Annotated[int, Field(description="Server uptime in seconds")] + queue: Annotated[QueueStatus, Field(description="Queue status information")] + tts_engine: Annotated[str, Field(description="TTS engine name")] + audio_output: Annotated[str, Field(description="Audio output status")] + voice_models_loaded: Annotated[ + list[str] | None, + Field(default=None, description="List of loaded voice models"), + ] + total_requests: Annotated[ + int | None, + Field(default=None, description="Total requests processed"), + ] + failed_requests: Annotated[ + int | None, + Field(default=None, description="Number of failed requests"), + ] + errors: Annotated[ + list[str] | None, + Field(default=None, description="List of error messages if unhealthy"), + ] + timestamp: Annotated[ + datetime, + Field(default_factory=lambda: datetime.now(timezone.utc), description="Response timestamp"), + ] + + +class ErrorResponse(BaseModel): + """ + Response model for error conditions. + + Used for 4xx and 5xx error responses with consistent structure. 
+ """ + + error: Annotated[str, Field(description="Error type identifier")] + detail: Annotated[str, Field(description="Human-readable error description")] + timestamp: Annotated[ + datetime, + Field(default_factory=lambda: datetime.now(timezone.utc), description="Error timestamp"), + ] + queue_size: Annotated[ + int | None, + Field(default=None, description="Current queue size (for queue_full errors)"), + ] + + +class VoiceInfo(BaseModel): + """Information about a single voice model.""" + + name: Annotated[str, Field(description="Voice model name")] + language: Annotated[str, Field(description="Language code (e.g., 'en_US')")] + quality: Annotated[str, Field(description="Quality level (low, medium, high)")] + size_mb: Annotated[float, Field(description="Model size in megabytes")] + installed: Annotated[bool, Field(description="Whether the model is installed locally")] + + +class VoicesResponse(BaseModel): + """ + Response model for GET /voices endpoint. + + Lists available voice models and the default voice. + """ + + voices: Annotated[list[VoiceInfo], Field(description="List of available voices")] + default_voice: Annotated[str, Field(description="Default voice model name")] diff --git a/app/queue_manager.py b/app/queue_manager.py new file mode 100644 index 0000000..83e15e8 --- /dev/null +++ b/app/queue_manager.py @@ -0,0 +1,236 @@ +""" +TTS Queue Manager for voice-server. + +Manages an async queue of TTS requests and processes them sequentially. 
+""" + +import asyncio +import logging +import time +from dataclasses import dataclass, field +from typing import Any + +logger = logging.getLogger(__name__) + + +class QueueFullError(Exception): + """Raised when the TTS queue is full.""" + + pass + + +@dataclass +class TTSRequest: + """A TTS request in the queue.""" + + message: str + voice: str + rate: int + voice_enabled: bool + timestamp: float = field(default_factory=time.time) + request_id: str | None = None + + +@dataclass +class QueueStats: + """Statistics about queue processing.""" + + processed: int = 0 + errors: int = 0 + total_audio_seconds: float = 0.0 + + +class TTSQueueManager: + """ + Manages the TTS request queue and processes requests sequentially. + + Ensures audio doesn't overlap by processing one request at a time. + """ + + def __init__( + self, + tts_engine: Any, + audio_player: Any, + max_size: int = 50, + request_timeout: float = 60.0, + ): + """ + Initialize the queue manager. + + Args: + tts_engine: TTS engine instance for synthesis + audio_player: Audio player instance for playback + max_size: Maximum queue size + request_timeout: Timeout for processing each request (seconds) + """ + self.tts_engine = tts_engine + self.audio_player = audio_player + self.max_size = max_size + self.request_timeout = request_timeout + + self._queue: asyncio.Queue[TTSRequest] = asyncio.Queue(maxsize=max_size) + self._stats = QueueStats() + self._running = False + self._processor_task: asyncio.Task | None = None + + async def start(self) -> None: + """Start the queue processor background task.""" + if self._running: + return + + self._running = True + self._processor_task = asyncio.create_task(self._process_queue()) + logger.info("TTS queue processor started") + + async def stop(self) -> None: + """Stop the queue processor and wait for current item to complete.""" + self._running = False + + if self._processor_task: + # Cancel the task + self._processor_task.cancel() + try: + await self._processor_task + 
except asyncio.CancelledError: + pass + self._processor_task = None + + # Stop any playing audio + self.audio_player.stop() + logger.info("TTS queue processor stopped") + + async def enqueue(self, request: TTSRequest) -> int: + """ + Add a TTS request to the queue. + + Args: + request: The TTS request to queue + + Returns: + Queue position (1-indexed) + + Raises: + QueueFullError: If the queue is full + """ + try: + # Use a short timeout to avoid blocking + await asyncio.wait_for( + self._queue.put(request), + timeout=1.0, + ) + position = self._queue.qsize() + logger.debug(f"Enqueued request: {request.message[:50]}... (position={position})") + return position + except asyncio.TimeoutError: + raise QueueFullError(f"TTS queue is full (max_size={self.max_size})") + + async def _process_queue(self) -> None: + """Background task that processes queued requests.""" + while self._running: + try: + # Wait for a request (with timeout to allow checking _running) + try: + request = await asyncio.wait_for( + self._queue.get(), + timeout=1.0, + ) + except asyncio.TimeoutError: + continue + + await self._process_request(request) + self._queue.task_done() + + except asyncio.CancelledError: + raise + except Exception as e: + logger.error(f"Error in queue processor: {e}") + self._stats.errors += 1 + + async def _process_request(self, request: TTSRequest) -> None: + """ + Process a single TTS request. 
+ + Args: + request: The TTS request to process + """ + start_time = time.time() + + try: + logger.debug(f"Processing TTS request: {request.message[:50]}...") + + if not request.voice_enabled: + logger.debug("Voice disabled, skipping TTS") + self._stats.processed += 1 + return + + # Synthesize audio (run in thread pool to avoid blocking) + loop = asyncio.get_event_loop() + audio_data = await asyncio.wait_for( + loop.run_in_executor( + None, + self.tts_engine.synthesize_to_float32, + request.message, + request.voice, + ), + timeout=self.request_timeout, + ) + + if len(audio_data) == 0: + logger.warning("TTS generated empty audio") + self._stats.processed += 1 + return + + # Play audio + self.audio_player.play(audio_data, self.tts_engine.get_sample_rate()) + + # Wait for playback to complete + await self.audio_player.wait_async() + + # Update stats + duration = len(audio_data) / self.tts_engine.get_sample_rate() + self._stats.processed += 1 + self._stats.total_audio_seconds += duration + + elapsed = time.time() - start_time + logger.debug(f"Request processed: {duration:.2f}s audio in {elapsed:.2f}s") + + except asyncio.TimeoutError: + logger.error(f"Request timed out after {self.request_timeout}s") + self._stats.errors += 1 + except Exception as e: + logger.error(f"Error processing request: {e}") + self._stats.errors += 1 + + @property + def size(self) -> int: + """Get current queue size.""" + return self._queue.qsize() + + @property + def capacity(self) -> int: + """Get queue capacity.""" + return self.max_size + + @property + def utilization(self) -> float: + """Get queue utilization percentage.""" + if self.max_size == 0: + return 0.0 + return (self.size / self.max_size) * 100.0 + + @property + def stats(self) -> QueueStats: + """Get queue statistics.""" + return self._stats + + def get_status(self) -> dict: + """Get queue status for health checks.""" + return { + "size": self.size, + "capacity": self.capacity, + "utilization": round(self.utilization, 1), + 
"processed": self._stats.processed, + "errors": self._stats.errors, + "total_audio_seconds": round(self._stats.total_audio_seconds, 1), + "running": self._running, + } diff --git a/app/routes.py b/app/routes.py new file mode 100644 index 0000000..5791f22 --- /dev/null +++ b/app/routes.py @@ -0,0 +1,198 @@ +""" +API routes for voice-server. + +Defines all HTTP endpoints: +- POST /notify: Submit text for TTS playback +- GET /health: Health check endpoint +- GET /voices: List available voice models +""" + +from fastapi import APIRouter, HTTPException, Response, status + +from app.config import get_settings +from app.models import ( + ErrorResponse, + HealthResponse, + NotifyRequest, + NotifyResponse, + QueueStatus, + VoiceInfo, + VoicesResponse, +) +from app.queue_manager import QueueFullError, TTSRequest + +router = APIRouter() + + +def _get_components(): + """Get the TTS components from main module.""" + from app import main + + return main.tts_engine, main.audio_player, main.queue_manager, main.get_uptime_seconds + + +@router.post( + "/notify", + response_model=NotifyResponse, + status_code=status.HTTP_202_ACCEPTED, + responses={ + 422: {"model": ErrorResponse, "description": "Validation error"}, + 503: {"model": ErrorResponse, "description": "Queue full"}, + }, +) +async def notify(request: NotifyRequest) -> NotifyResponse: + """ + Submit text for TTS playback. + + Accepts a text message and queues it for text-to-speech conversion + and playback through the system speakers. + + Returns immediately with queue position; audio plays asynchronously. + """ + tts_engine, audio_player, queue_manager, _ = _get_components() + settings = get_settings() + + # Validate voice exists if specified + voice = request.voice or settings.default_voice + if tts_engine and not tts_engine.is_voice_available(voice): + available = [v["name"] for v in tts_engine.list_voices()] + raise HTTPException( + status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, + detail=f"Voice '{voice}' not found. 
Available: {available}", + ) + + # Create TTS request + tts_request = TTSRequest( + message=request.message, + voice=voice, + rate=request.rate, + voice_enabled=request.voice_enabled and settings.voice_enabled, + ) + + # Enqueue request + try: + if queue_manager: + position = await queue_manager.enqueue(tts_request) + else: + position = 1 # Fallback for testing + except QueueFullError as e: + raise HTTPException( + status_code=status.HTTP_503_SERVICE_UNAVAILABLE, + detail=str(e), + ) + + return NotifyResponse( + status="queued", + message_length=len(request.message), + queue_position=position, + voice_model=voice, + ) + + +@router.get( + "/health", + response_model=HealthResponse, + responses={ + 503: {"model": HealthResponse, "description": "Service unhealthy"}, + }, +) +async def health(response: Response) -> HealthResponse: + """ + Health check endpoint. + + Returns comprehensive health status including: + - TTS engine status + - Audio output status + - Queue status + - System metrics + """ + tts_engine, audio_player, queue_manager, get_uptime_seconds = _get_components() + settings = get_settings() + + errors = [] + + # Check TTS engine health + tts_status = "unknown" + if tts_engine: + tts_health = tts_engine.health_check() + tts_status = tts_health.get("status", "unknown") + if tts_status != "healthy": + errors.append(f"TTS: {tts_health.get('error', 'Unknown error')}") + + # Check audio health + audio_status = "unknown" + if audio_player: + audio_health = audio_player.health_check() + audio_status = audio_health.get("status", "unknown") + if audio_status != "healthy": + errors.append(f"Audio: {audio_health.get('error', 'Unknown error')}") + + # Get queue status + if queue_manager: + queue_info = queue_manager.get_status() + queue_status = QueueStatus( + size=queue_info["size"], + capacity=queue_info["capacity"], + utilization=queue_info["utilization"], + ) + total_requests = queue_info["processed"] + failed_requests = queue_info["errors"] + else: + 
queue_status = QueueStatus( + size=0, + capacity=settings.queue_max_size, + utilization=0.0, + ) + total_requests = None + failed_requests = None + + # Determine overall status + overall_status = "healthy" if not errors else "unhealthy" + + # Set response status code for unhealthy + if overall_status == "unhealthy": + response.status_code = status.HTTP_503_SERVICE_UNAVAILABLE + + return HealthResponse( + status=overall_status, + uptime_seconds=get_uptime_seconds(), + queue=queue_status, + tts_engine="piper", + audio_output="available" if audio_status == "healthy" else "unavailable", + total_requests=total_requests, + failed_requests=failed_requests, + errors=errors if errors else None, + ) + + +@router.get( + "/voices", + response_model=VoicesResponse, +) +async def list_voices() -> VoicesResponse: + """ + List available voice models. + + Returns a list of installed voice models with their metadata + and the current default voice. + """ + tts_engine, _, _, _ = _get_components() + settings = get_settings() + + voices = [] + if tts_engine: + for voice_data in tts_engine.list_voices(): + voices.append( + VoiceInfo( + name=voice_data["name"], + language=voice_data["language"], + quality=voice_data["quality"], + size_mb=voice_data["size_mb"], + installed=voice_data["installed"], + ) + ) + + return VoicesResponse( + voices=voices, + default_voice=settings.default_voice, + ) diff --git a/app/tts_engine.py b/app/tts_engine.py new file mode 100644 index 0000000..a0a868a --- /dev/null +++ b/app/tts_engine.py @@ -0,0 +1,287 @@ +""" +TTS Engine module for voice-server. + +Provides text-to-speech synthesis using Piper TTS. +Supports multiple voice models with lazy loading and caching. 
+""" + +import logging +from pathlib import Path +from typing import Protocol + +import numpy as np + +logger = logging.getLogger(__name__) + + +class TTSEngine(Protocol): + """Protocol defining the TTS engine interface.""" + + def synthesize(self, text: str, voice: str | None = None) -> np.ndarray: + """Convert text to audio samples.""" + ... + + def get_sample_rate(self) -> int: + """Get the audio sample rate.""" + ... + + def list_voices(self) -> list[dict]: + """List available voice models.""" + ... + + +class PiperTTSEngine: + """ + Piper TTS engine implementation. + + Provides high-quality neural text-to-speech using Piper's ONNX models. + Voice models are loaded lazily and cached for performance. + """ + + def __init__(self, model_dir: str = "./models", default_voice: str = "en_US-lessac-medium"): + """ + Initialize the Piper TTS engine. + + Args: + model_dir: Directory containing voice model files (.onnx + .onnx.json) + default_voice: Default voice model name to use + """ + self.model_dir = Path(model_dir) + self.default_voice = default_voice + self._voices: dict = {} # Cache of loaded PiperVoice instances + self._voice_metadata: dict = {} # Cache of voice metadata + self._sample_rate: int = 22050 # Piper default sample rate + + # Ensure model directory exists + self.model_dir.mkdir(parents=True, exist_ok=True) + + logger.info(f"PiperTTSEngine initialized with model_dir={model_dir}") + + def _get_voice_path(self, voice_name: str) -> tuple[Path, Path]: + """ + Get paths to voice model files. + + Args: + voice_name: Name of the voice model + + Returns: + Tuple of (onnx_path, json_path) + """ + onnx_path = self.model_dir / f"{voice_name}.onnx" + json_path = self.model_dir / f"{voice_name}.onnx.json" + return onnx_path, json_path + + def _load_voice(self, voice_name: str): + """ + Load a voice model (lazy loading with caching). 
+ + Args: + voice_name: Name of the voice model to load + + Returns: + Loaded PiperVoice instance + + Raises: + FileNotFoundError: If voice model files don't exist + RuntimeError: If voice model fails to load + """ + if voice_name in self._voices: + return self._voices[voice_name] + + onnx_path, json_path = self._get_voice_path(voice_name) + + if not onnx_path.exists(): + raise FileNotFoundError( + f"Voice model not found: {voice_name}. " + f"Expected file: {onnx_path}" + ) + + try: + from piper import PiperVoice + + logger.info(f"Loading voice model: {voice_name}") + voice = PiperVoice.load(str(onnx_path), config_path=str(json_path) if json_path.exists() else None) + self._voices[voice_name] = voice + + # Update sample rate from loaded voice + if hasattr(voice, 'config') and voice.config: + self._sample_rate = voice.config.sample_rate + + logger.info(f"Voice model loaded: {voice_name} (sample_rate={self._sample_rate})") + return voice + + except Exception as e: + logger.error(f"Failed to load voice model {voice_name}: {e}") + raise RuntimeError(f"Failed to load voice model: {e}") from e + + def synthesize(self, text: str, voice: str | None = None) -> np.ndarray: + """ + Convert text to audio samples. 
+ + Args: + text: Text to convert to speech + voice: Voice model name (uses default if None) + + Returns: + NumPy array of audio samples (int16) + + Raises: + FileNotFoundError: If voice model not found + RuntimeError: If synthesis fails + """ + voice_name = voice or self.default_voice + + if not text or not text.strip(): + # Return empty audio for empty text + return np.array([], dtype=np.int16) + + try: + piper_voice = self._load_voice(voice_name) + + # Synthesize audio - piper returns an iterator of AudioChunk objects + audio_chunks = [] + for chunk in piper_voice.synthesize(text): + # Each chunk has audio_int16_array property + audio_chunks.append(chunk.audio_int16_array) + + if not audio_chunks: + return np.array([], dtype=np.int16) + + # Concatenate all chunks + audio_array = np.concatenate(audio_chunks) + + logger.debug(f"Synthesized {len(text)} chars -> {len(audio_array)} samples") + return audio_array + + except FileNotFoundError: + raise + except Exception as e: + logger.error(f"TTS synthesis failed: {e}") + raise RuntimeError(f"TTS synthesis failed: {e}") from e + + def synthesize_to_float32(self, text: str, voice: str | None = None) -> np.ndarray: + """ + Convert text to float32 audio samples (normalized -1.0 to 1.0). + + This format is preferred by sounddevice for playback. + + Args: + text: Text to convert to speech + voice: Voice model name (uses default if None) + + Returns: + NumPy array of float32 audio samples + """ + int16_audio = self.synthesize(text, voice) + + if len(int16_audio) == 0: + return np.array([], dtype=np.float32) + + # Convert int16 to float32 normalized + float32_audio = int16_audio.astype(np.float32) / 32768.0 + return float32_audio + + def get_sample_rate(self) -> int: + """Get the audio sample rate for the current voice.""" + return self._sample_rate + + def list_voices(self) -> list[dict]: + """ + List available voice models in the model directory. 
+ + Returns: + List of voice info dictionaries with name, language, quality, etc. + """ + voices = [] + + if not self.model_dir.exists(): + return voices + + # Find all .onnx files + for onnx_file in self.model_dir.glob("*.onnx"): + voice_name = onnx_file.stem + json_file = onnx_file.with_suffix(".onnx.json") + + voice_info = { + "name": voice_name, + "language": self._extract_language(voice_name), + "quality": self._extract_quality(voice_name), + "size_mb": round(onnx_file.stat().st_size / (1024 * 1024), 1), + "installed": True, + } + + # Try to load additional metadata from JSON config + if json_file.exists(): + try: + import json + with open(json_file) as f: + config = json.load(f) + if "language" in config: + voice_info["language"] = config["language"].get("code", voice_info["language"]) + except Exception: + pass # Use extracted values if JSON parsing fails + + voices.append(voice_info) + + return sorted(voices, key=lambda v: v["name"]) + + def _extract_language(self, voice_name: str) -> str: + """Extract language code from voice name (e.g., 'en_US' from 'en_US-lessac-medium').""" + parts = voice_name.split("-") + if parts: + return parts[0] + return "unknown" + + def _extract_quality(self, voice_name: str) -> str: + """Extract quality level from voice name (e.g., 'medium' from 'en_US-lessac-medium').""" + parts = voice_name.split("-") + if len(parts) >= 3: + quality = parts[-1].lower() + if quality in ("low", "medium", "high", "x_low", "x_high"): + return quality + return "medium" + + def is_voice_available(self, voice_name: str) -> bool: + """Check if a voice model is installed.""" + onnx_path, _ = self._get_voice_path(voice_name) + return onnx_path.exists() + + def health_check(self) -> dict: + """ + Perform a health check on the TTS engine. 
+ + Returns: + Dict with status and any error messages + """ + try: + # Check if piper is importable + from piper import PiperVoice # noqa: F401 + + # Check if model directory exists + if not self.model_dir.exists(): + return { + "status": "degraded", + "error": f"Model directory does not exist: {self.model_dir}", + } + + # Check if default voice is available + if not self.is_voice_available(self.default_voice): + available = [v["name"] for v in self.list_voices()] + return { + "status": "degraded", + "error": f"Default voice not found: {self.default_voice}", + "available_voices": available, + } + + return {"status": "healthy"} + + except ImportError as e: + return { + "status": "unhealthy", + "error": f"Piper TTS not installed: {e}", + } + except Exception as e: + return { + "status": "unhealthy", + "error": str(e), + } diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..f75ec8f --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,69 @@ +[project] +name = "voice-server" +version = "1.0.0" +description = "Local HTTP service for text-to-speech playback" +readme = "README.md" +requires-python = ">=3.10" +license = {text = "MIT"} +authors = [ + {name = "Cal Corum", email = "cal.corum@gmail.com"} +] +keywords = ["tts", "text-to-speech", "piper", "fastapi", "voice"] + +dependencies = [ + "fastapi>=0.115.0", + "uvicorn[standard]>=0.32.0", + "piper-tts>=1.2.0", + "sounddevice>=0.5.0", + "numpy>=1.26.0", + "pydantic>=2.10.0", + "pydantic-settings>=2.6.0", + "python-dotenv>=1.0.0", + "psutil>=6.0.0", +] + +[project.optional-dependencies] +dev = [ + "pytest>=8.3.0", + "pytest-asyncio>=0.24.0", + "pytest-cov>=6.0.0", + "httpx>=0.28.0", + "ruff>=0.8.0", +] + +[project.scripts] +voice-server = "app.main:run" + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.hatch.build.targets.wheel] +packages = ["app"] + +[tool.pytest.ini_options] +asyncio_mode = "auto" +asyncio_default_fixture_loop_scope = "function" +testpaths = 
["tests"] +python_files = ["test_*.py"] +python_functions = ["test_*"] +addopts = "-v --tb=short" + +[tool.ruff] +line-length = 100 +target-version = "py310" + +[tool.ruff.lint] +select = ["E", "F", "I", "N", "W", "UP"] +ignore = ["E501"] + +[tool.coverage.run] +source = ["app"] +omit = ["tests/*"] + +[tool.coverage.report] +exclude_lines = [ + "pragma: no cover", + "if TYPE_CHECKING:", + "raise NotImplementedError", +] diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_api.py b/tests/test_api.py new file mode 100644 index 0000000..02e804c --- /dev/null +++ b/tests/test_api.py @@ -0,0 +1,324 @@ +""" +TDD Tests for API endpoints. + +These tests verify the API contract for all voice-server endpoints: +- POST /notify: TTS request submission +- GET /health: Health check +- GET /voices: Voice model listing + +Uses httpx.AsyncClient for async endpoint testing with FastAPI's TestClient. +""" + +import pytest +from httpx import AsyncClient, ASGITransport + +from app.main import app + + +@pytest.fixture +async def client(): + """Create an async test client for the FastAPI app.""" + async with AsyncClient( + transport=ASGITransport(app=app), + base_url="http://test", + ) as client: + yield client + + +class TestNotifyEndpoint: + """Tests for POST /notify endpoint.""" + + async def test_valid_request_returns_202(self, client: AsyncClient): + """ + A valid request with just a message should return 202 Accepted. + """ + response = await client.post( + "/notify", + json={"message": "Hello, world!"}, + ) + + assert response.status_code == 202 + + async def test_valid_request_returns_queued_status(self, client: AsyncClient): + """ + Response should include status='queued' for successful requests. 
+ """ + response = await client.post( + "/notify", + json={"message": "Test message"}, + ) + + data = response.json() + assert data["status"] == "queued" + + async def test_response_includes_message_length(self, client: AsyncClient): + """ + Response should include the length of the submitted message. + """ + message = "This is a test message" + response = await client.post( + "/notify", + json={"message": message}, + ) + + data = response.json() + assert data["message_length"] == len(message) + + async def test_response_includes_queue_position(self, client: AsyncClient): + """ + Response should include the queue position. + """ + response = await client.post( + "/notify", + json={"message": "Test"}, + ) + + data = response.json() + assert "queue_position" in data + assert isinstance(data["queue_position"], int) + assert data["queue_position"] >= 1 + + async def test_response_includes_voice_model(self, client: AsyncClient): + """ + Response should include the voice model being used. + """ + response = await client.post( + "/notify", + json={"message": "Test"}, + ) + + data = response.json() + assert "voice_model" in data + assert data["voice_model"] == "en_US-lessac-medium" # default + + async def test_custom_voice_is_preserved(self, client: AsyncClient): + """ + Custom voice selection should be reflected in response. + """ + response = await client.post( + "/notify", + json={"message": "Test", "voice": "en_US-libritts-high"}, + ) + + data = response.json() + assert data["voice_model"] == "en_US-libritts-high" + + async def test_missing_message_returns_422(self, client: AsyncClient): + """ + Request without message should return 422 Unprocessable Entity. + """ + response = await client.post( + "/notify", + json={}, + ) + + assert response.status_code == 422 + + async def test_empty_message_returns_422(self, client: AsyncClient): + """ + Empty message string should return 422 Unprocessable Entity. 
+ """ + response = await client.post( + "/notify", + json={"message": ""}, + ) + + assert response.status_code == 422 + + async def test_message_too_long_returns_422(self, client: AsyncClient): + """ + Message over 10000 characters should return 422. + """ + response = await client.post( + "/notify", + json={"message": "a" * 10001}, + ) + + assert response.status_code == 422 + + async def test_invalid_rate_returns_422(self, client: AsyncClient): + """ + Rate outside valid range should return 422. + """ + response = await client.post( + "/notify", + json={"message": "Test", "rate": 500}, + ) + + assert response.status_code == 422 + + async def test_invalid_voice_pattern_returns_422(self, client: AsyncClient): + """ + Voice with invalid characters should return 422. + """ + response = await client.post( + "/notify", + json={"message": "Test", "voice": "invalid/voice"}, + ) + + assert response.status_code == 422 + + async def test_malformed_json_returns_422(self, client: AsyncClient): + """ + Malformed JSON should return 422. + """ + response = await client.post( + "/notify", + content="not valid json", + headers={"Content-Type": "application/json"}, + ) + + assert response.status_code == 422 + + async def test_whitespace_message_is_stripped(self, client: AsyncClient): + """ + Whitespace in message should be stripped. + """ + response = await client.post( + "/notify", + json={"message": " Hello "}, + ) + + assert response.status_code == 202 + data = response.json() + assert data["message_length"] == 5 # "Hello" without whitespace + + +class TestHealthEndpoint: + """Tests for GET /health endpoint.""" + + async def test_health_returns_200(self, client: AsyncClient): + """ + Health endpoint should return 200 when healthy. + """ + response = await client.get("/health") + + assert response.status_code == 200 + + async def test_health_returns_status(self, client: AsyncClient): + """ + Health response should include status field. 
+ """ + response = await client.get("/health") + + data = response.json() + assert "status" in data + assert data["status"] in ["healthy", "unhealthy"] + + async def test_health_returns_uptime(self, client: AsyncClient): + """ + Health response should include uptime in seconds. + """ + response = await client.get("/health") + + data = response.json() + assert "uptime_seconds" in data + assert isinstance(data["uptime_seconds"], int) + assert data["uptime_seconds"] >= 0 + + async def test_health_returns_queue_status(self, client: AsyncClient): + """ + Health response should include queue status. + """ + response = await client.get("/health") + + data = response.json() + assert "queue" in data + assert "size" in data["queue"] + assert "capacity" in data["queue"] + assert "utilization" in data["queue"] + + async def test_health_returns_tts_engine(self, client: AsyncClient): + """ + Health response should include TTS engine info. + """ + response = await client.get("/health") + + data = response.json() + assert "tts_engine" in data + assert data["tts_engine"] == "piper" + + async def test_health_returns_audio_output(self, client: AsyncClient): + """ + Health response should include audio output status. + """ + response = await client.get("/health") + + data = response.json() + assert "audio_output" in data + + +class TestVoicesEndpoint: + """Tests for GET /voices endpoint.""" + + async def test_voices_returns_200(self, client: AsyncClient): + """ + Voices endpoint should return 200. + """ + response = await client.get("/voices") + + assert response.status_code == 200 + + async def test_voices_returns_list(self, client: AsyncClient): + """ + Voices response should include a list of voices. + """ + response = await client.get("/voices") + + data = response.json() + assert "voices" in data + assert isinstance(data["voices"], list) + + async def test_voices_returns_default_voice(self, client: AsyncClient): + """ + Voices response should include the default voice. 
+ """ + response = await client.get("/voices") + + data = response.json() + assert "default_voice" in data + assert data["default_voice"] == "en_US-lessac-medium" + + +class TestOpenAPIDocumentation: + """Tests for API documentation endpoints.""" + + async def test_openapi_json_available(self, client: AsyncClient): + """ + OpenAPI JSON should be available at /openapi.json. + """ + response = await client.get("/openapi.json") + + assert response.status_code == 200 + data = response.json() + assert "openapi" in data + assert "paths" in data + + async def test_docs_endpoint_available(self, client: AsyncClient): + """ + Swagger UI should be available at /docs. + """ + response = await client.get("/docs") + + assert response.status_code == 200 + assert "text/html" in response.headers.get("content-type", "") + + +class TestCORS: + """Tests for CORS middleware.""" + + async def test_cors_headers_present(self, client: AsyncClient): + """ + CORS headers should be present in responses. + """ + response = await client.options( + "/notify", + headers={ + "Origin": "http://localhost:3000", + "Access-Control-Request-Method": "POST", + }, + ) + + # FastAPI returns 200 for OPTIONS with CORS + assert response.status_code == 200 + assert "access-control-allow-origin" in response.headers diff --git a/tests/test_config.py b/tests/test_config.py new file mode 100644 index 0000000..0046e7a --- /dev/null +++ b/tests/test_config.py @@ -0,0 +1,300 @@ +""" +TDD Tests for configuration loading. + +These tests define the expected behavior for the Settings class which loads +configuration from environment variables with sensible defaults. 
+ +Test Coverage: +- Default values when no environment variables are set +- Environment variable overrides +- Validation of configuration values +- Path handling for model directory +""" + +import os +import pytest + + +class TestSettingsDefaults: + """Tests for default configuration values.""" + + def test_default_host(self, monkeypatch): + """ + Default host should be 0.0.0.0 (listen on all interfaces). + """ + # Clear any existing env vars + monkeypatch.delenv("HOST", raising=False) + + from app.config import Settings + + settings = Settings() + assert settings.host == "0.0.0.0" + + def test_default_port(self, monkeypatch): + """ + Default port should be 8888. + """ + monkeypatch.delenv("PORT", raising=False) + + from app.config import Settings + + settings = Settings() + assert settings.port == 8888 + + def test_default_model_dir(self, monkeypatch): + """ + Default model directory should be ./models. + """ + monkeypatch.delenv("MODEL_DIR", raising=False) + + from app.config import Settings + + settings = Settings() + assert settings.model_dir == "./models" + + def test_default_voice(self, monkeypatch): + """ + Default voice should be en_US-lessac-medium. + """ + monkeypatch.delenv("DEFAULT_VOICE", raising=False) + + from app.config import Settings + + settings = Settings() + assert settings.default_voice == "en_US-lessac-medium" + + def test_default_rate(self, monkeypatch): + """ + Default speech rate should be 170 WPM. + """ + monkeypatch.delenv("DEFAULT_RATE", raising=False) + + from app.config import Settings + + settings = Settings() + assert settings.default_rate == 170 + + def test_default_queue_max_size(self, monkeypatch): + """ + Default queue max size should be 50. + """ + monkeypatch.delenv("QUEUE_MAX_SIZE", raising=False) + + from app.config import Settings + + settings = Settings() + assert settings.queue_max_size == 50 + + def test_default_request_timeout(self, monkeypatch): + """ + Default request timeout should be 60 seconds. 
+ """ + monkeypatch.delenv("REQUEST_TIMEOUT_SECONDS", raising=False) + + from app.config import Settings + + settings = Settings() + assert settings.request_timeout_seconds == 60 + + def test_default_log_level(self, monkeypatch): + """ + Default log level should be INFO. + """ + monkeypatch.delenv("LOG_LEVEL", raising=False) + + from app.config import Settings + + settings = Settings() + assert settings.log_level == "INFO" + + def test_default_voice_enabled(self, monkeypatch): + """ + Voice should be enabled by default. + """ + monkeypatch.delenv("VOICE_ENABLED", raising=False) + + from app.config import Settings + + settings = Settings() + assert settings.voice_enabled is True + + +class TestSettingsEnvOverrides: + """Tests for environment variable overrides.""" + + def test_host_override(self, monkeypatch): + """ + HOST environment variable should override default. + """ + monkeypatch.setenv("HOST", "127.0.0.1") + + from app.config import Settings + + settings = Settings() + assert settings.host == "127.0.0.1" + + def test_port_override(self, monkeypatch): + """ + PORT environment variable should override default. + """ + monkeypatch.setenv("PORT", "9000") + + from app.config import Settings + + settings = Settings() + assert settings.port == 9000 + + def test_model_dir_override(self, monkeypatch): + """ + MODEL_DIR environment variable should override default. + """ + monkeypatch.setenv("MODEL_DIR", "/opt/voice-models") + + from app.config import Settings + + settings = Settings() + assert settings.model_dir == "/opt/voice-models" + + def test_default_voice_override(self, monkeypatch): + """ + DEFAULT_VOICE environment variable should override default. + """ + monkeypatch.setenv("DEFAULT_VOICE", "en_US-libritts-high") + + from app.config import Settings + + settings = Settings() + assert settings.default_voice == "en_US-libritts-high" + + def test_queue_max_size_override(self, monkeypatch): + """ + QUEUE_MAX_SIZE environment variable should override default. 
+ """ + monkeypatch.setenv("QUEUE_MAX_SIZE", "100") + + from app.config import Settings + + settings = Settings() + assert settings.queue_max_size == 100 + + def test_log_level_override(self, monkeypatch): + """ + LOG_LEVEL environment variable should override default. + """ + monkeypatch.setenv("LOG_LEVEL", "DEBUG") + + from app.config import Settings + + settings = Settings() + assert settings.log_level == "DEBUG" + + def test_voice_enabled_false(self, monkeypatch): + """ + VOICE_ENABLED=false should disable voice output. + """ + monkeypatch.setenv("VOICE_ENABLED", "false") + + from app.config import Settings + + settings = Settings() + assert settings.voice_enabled is False + + +class TestSettingsValidation: + """Tests for configuration validation.""" + + def test_port_must_be_positive(self, monkeypatch): + """ + Port must be a positive integer. + """ + monkeypatch.setenv("PORT", "-1") + + from pydantic import ValidationError + from app.config import Settings + + with pytest.raises(ValidationError): + Settings() + + def test_port_must_be_valid_range(self, monkeypatch): + """ + Port must be in valid range (1-65535). + """ + monkeypatch.setenv("PORT", "70000") + + from pydantic import ValidationError + from app.config import Settings + + with pytest.raises(ValidationError): + Settings() + + def test_queue_max_size_must_be_positive(self, monkeypatch): + """ + Queue max size must be positive. + """ + monkeypatch.setenv("QUEUE_MAX_SIZE", "0") + + from pydantic import ValidationError + from app.config import Settings + + with pytest.raises(ValidationError): + Settings() + + def test_request_timeout_must_be_positive(self, monkeypatch): + """ + Request timeout must be positive. 
+ """ + monkeypatch.setenv("REQUEST_TIMEOUT_SECONDS", "0") + + from pydantic import ValidationError + from app.config import Settings + + with pytest.raises(ValidationError): + Settings() + + def test_default_rate_must_be_in_range(self, monkeypatch): + """ + Default rate must be between 50 and 400. + """ + monkeypatch.setenv("DEFAULT_RATE", "500") + + from pydantic import ValidationError + from app.config import Settings + + with pytest.raises(ValidationError): + Settings() + + def test_log_level_must_be_valid(self, monkeypatch): + """ + Log level must be a valid Python logging level. + """ + monkeypatch.setenv("LOG_LEVEL", "INVALID") + + from pydantic import ValidationError + from app.config import Settings + + with pytest.raises(ValidationError): + Settings() + + +class TestGetSettings: + """Tests for the get_settings function.""" + + def test_get_settings_returns_settings_instance(self, monkeypatch): + """ + get_settings should return a Settings instance. + """ + # Clear cache to ensure fresh settings + from app.config import get_settings, Settings + + settings = get_settings() + assert isinstance(settings, Settings) + + def test_get_settings_is_cached(self, monkeypatch): + """ + get_settings should return the same cached instance. + """ + from app.config import get_settings + + settings1 = get_settings() + settings2 = get_settings() + assert settings1 is settings2 diff --git a/tests/test_models.py b/tests/test_models.py new file mode 100644 index 0000000..c9edc76 --- /dev/null +++ b/tests/test_models.py @@ -0,0 +1,388 @@ +""" +TDD Tests for Pydantic request/response models. + +These tests define the API contract for the voice server's request and response models. +Tests are written BEFORE implementation to drive the design. 
+ +Test Coverage: +- NotifyRequest: Validates incoming TTS requests with message, voice, rate, voice_enabled +- NotifyResponse: Validates successful queue responses +- HealthResponse: Validates health check responses +- ErrorResponse: Validates error response format +- VoiceInfo/VoicesResponse: Validates voice listing responses +""" + +import pytest +from datetime import datetime +from pydantic import ValidationError + + +class TestNotifyRequest: + """Tests for the NotifyRequest model - validates incoming TTS requests.""" + + def test_valid_request_with_message_only(self): + """ + A minimal valid request should only require the message field. + All other fields should use sensible defaults. + """ + from app.models import NotifyRequest + + request = NotifyRequest(message="Hello, world!") + + assert request.message == "Hello, world!" + assert request.voice == "en_US-lessac-medium" # default voice + assert request.rate == 170 # default rate + assert request.voice_enabled is True # default enabled + + def test_valid_request_with_all_fields(self): + """ + A request with all fields specified should preserve those values. + """ + from app.models import NotifyRequest + + request = NotifyRequest( + message="Test message", + voice="en_US-libritts-high", + rate=200, + voice_enabled=False, + ) + + assert request.message == "Test message" + assert request.voice == "en_US-libritts-high" + assert request.rate == 200 + assert request.voice_enabled is False + + def test_message_is_required(self): + """ + The message field is required - omitting it should raise ValidationError. + """ + from app.models import NotifyRequest + + with pytest.raises(ValidationError) as exc_info: + NotifyRequest() + + errors = exc_info.value.errors() + assert any(e["loc"] == ("message",) and e["type"] == "missing" for e in errors) + + def test_message_cannot_be_empty(self): + """ + An empty message string should be rejected. 
+ """ + from app.models import NotifyRequest + + with pytest.raises(ValidationError) as exc_info: + NotifyRequest(message="") + + errors = exc_info.value.errors() + assert any("message" in str(e["loc"]) for e in errors) + + def test_message_minimum_length_is_1(self): + """ + A single character message should be valid. + """ + from app.models import NotifyRequest + + request = NotifyRequest(message="X") + assert request.message == "X" + + def test_message_maximum_length_is_10000(self): + """ + Messages up to 10,000 characters should be accepted. + """ + from app.models import NotifyRequest + + long_message = "a" * 10000 + request = NotifyRequest(message=long_message) + assert len(request.message) == 10000 + + def test_message_over_10000_characters_rejected(self): + """ + Messages over 10,000 characters should be rejected. + """ + from app.models import NotifyRequest + + too_long = "a" * 10001 + with pytest.raises(ValidationError) as exc_info: + NotifyRequest(message=too_long) + + errors = exc_info.value.errors() + assert any("message" in str(e["loc"]) for e in errors) + + def test_message_whitespace_is_stripped(self): + """ + Leading and trailing whitespace should be stripped from messages. + """ + from app.models import NotifyRequest + + request = NotifyRequest(message=" Hello, world! ") + assert request.message == "Hello, world!" + + def test_rate_minimum_is_50(self): + """ + Rate below 50 should be rejected. + """ + from app.models import NotifyRequest + + with pytest.raises(ValidationError) as exc_info: + NotifyRequest(message="Test", rate=49) + + errors = exc_info.value.errors() + assert any("rate" in str(e["loc"]) for e in errors) + + def test_rate_maximum_is_400(self): + """ + Rate above 400 should be rejected. 
+ """ + from app.models import NotifyRequest + + with pytest.raises(ValidationError) as exc_info: + NotifyRequest(message="Test", rate=401) + + errors = exc_info.value.errors() + assert any("rate" in str(e["loc"]) for e in errors) + + def test_rate_at_boundaries(self): + """ + Rate values at exact boundaries (50, 400) should be valid. + """ + from app.models import NotifyRequest + + request_min = NotifyRequest(message="Test", rate=50) + assert request_min.rate == 50 + + request_max = NotifyRequest(message="Test", rate=400) + assert request_max.rate == 400 + + def test_voice_pattern_validation(self): + """ + Voice names should match expected pattern (alphanumeric, underscores, hyphens). + """ + from app.models import NotifyRequest + + # Valid patterns + request = NotifyRequest(message="Test", voice="en_US-lessac-medium") + assert request.voice == "en_US-lessac-medium" + + request2 = NotifyRequest(message="Test", voice="voice_123") + assert request2.voice == "voice_123" + + def test_invalid_voice_pattern_rejected(self): + """ + Voice names with invalid characters should be rejected. + """ + from app.models import NotifyRequest + + with pytest.raises(ValidationError): + NotifyRequest(message="Test", voice="invalid/voice") + + with pytest.raises(ValidationError): + NotifyRequest(message="Test", voice="invalid voice") + + +class TestNotifyResponse: + """Tests for the NotifyResponse model - returned when request is queued.""" + + def test_successful_response_structure(self): + """ + A successful response should contain status, message_length, queue_position. 
+ """ + from app.models import NotifyResponse + + response = NotifyResponse( + status="queued", + message_length=42, + queue_position=3, + voice_model="en_US-lessac-medium", + ) + + assert response.status == "queued" + assert response.message_length == 42 + assert response.queue_position == 3 + assert response.voice_model == "en_US-lessac-medium" + + def test_estimated_duration_is_optional(self): + """ + Estimated duration can be omitted. + """ + from app.models import NotifyResponse + + response = NotifyResponse( + status="queued", + message_length=42, + queue_position=1, + voice_model="en_US-lessac-medium", + ) + + assert response.estimated_duration is None + + def test_estimated_duration_when_provided(self): + """ + Estimated duration should be preserved when provided. + """ + from app.models import NotifyResponse + + response = NotifyResponse( + status="queued", + message_length=42, + queue_position=1, + voice_model="en_US-lessac-medium", + estimated_duration=2.5, + ) + + assert response.estimated_duration == 2.5 + + +class TestHealthResponse: + """Tests for the HealthResponse model - returned by /health endpoint.""" + + def test_healthy_response_structure(self): + """ + A healthy response should contain all required fields. + """ + from app.models import HealthResponse, QueueStatus + + queue_status = QueueStatus(size=2, capacity=50, utilization=4.0) + response = HealthResponse( + status="healthy", + uptime_seconds=3600, + queue=queue_status, + tts_engine="piper", + audio_output="available", + ) + + assert response.status == "healthy" + assert response.uptime_seconds == 3600 + assert response.queue.size == 2 + assert response.queue.capacity == 50 + assert response.tts_engine == "piper" + assert response.audio_output == "available" + + def test_unhealthy_response_with_errors(self): + """ + An unhealthy response can include error messages. 
+ """ + from app.models import HealthResponse, QueueStatus + + queue_status = QueueStatus(size=0, capacity=50, utilization=0.0) + response = HealthResponse( + status="unhealthy", + uptime_seconds=100, + queue=queue_status, + tts_engine="piper", + audio_output="unavailable", + errors=["Audio device not found", "TTS engine failed to initialize"], + ) + + assert response.status == "unhealthy" + assert len(response.errors) == 2 + assert "Audio device not found" in response.errors + + def test_statistics_fields_are_optional(self): + """ + Statistics like total_requests and failed_requests are optional. + """ + from app.models import HealthResponse, QueueStatus + + queue_status = QueueStatus(size=0, capacity=50, utilization=0.0) + response = HealthResponse( + status="healthy", + uptime_seconds=0, + queue=queue_status, + tts_engine="piper", + audio_output="available", + ) + + assert response.total_requests is None + assert response.failed_requests is None + + +class TestErrorResponse: + """Tests for the ErrorResponse model - returned for error conditions.""" + + def test_error_response_structure(self): + """ + An error response should contain error type, detail, and timestamp. + """ + from app.models import ErrorResponse + + response = ErrorResponse( + error="validation_error", + detail="message field is required", + ) + + assert response.error == "validation_error" + assert response.detail == "message field is required" + assert response.timestamp is not None + + def test_timestamp_auto_generated(self): + """ + Timestamp should be auto-generated if not provided. + """ + from app.models import ErrorResponse + + response = ErrorResponse( + error="queue_full", + detail="TTS queue is full", + ) + + assert isinstance(response.timestamp, datetime) + + def test_queue_full_error_includes_queue_size(self): + """ + Queue full errors can include the current queue size. 
+ """ + from app.models import ErrorResponse + + response = ErrorResponse( + error="queue_full", + detail="TTS queue is full, please retry later", + queue_size=50, + ) + + assert response.queue_size == 50 + + +class TestVoiceModels: + """Tests for voice-related models.""" + + def test_voice_info_structure(self): + """ + VoiceInfo should contain name, language, quality, and installation status. + """ + from app.models import VoiceInfo + + voice = VoiceInfo( + name="en_US-lessac-medium", + language="en_US", + quality="medium", + size_mb=63.5, + installed=True, + ) + + assert voice.name == "en_US-lessac-medium" + assert voice.language == "en_US" + assert voice.quality == "medium" + assert voice.size_mb == 63.5 + assert voice.installed is True + + def test_voices_response_structure(self): + """ + VoicesResponse should contain a list of voices and the default voice. + """ + from app.models import VoiceInfo, VoicesResponse + + voice = VoiceInfo( + name="en_US-lessac-medium", + language="en_US", + quality="medium", + size_mb=63.5, + installed=True, + ) + + response = VoicesResponse( + voices=[voice], + default_voice="en_US-lessac-medium", + ) + + assert len(response.voices) == 1 + assert response.default_voice == "en_US-lessac-medium"