| | """Pydantic schemas for Speech-to-Text and Text-to-Speech endpoints""" |
| |
|
| | from pydantic import BaseModel, Field, ConfigDict |
| | from typing import Optional |
| |
|
| |
|
| | |
| | |
| | |
| |
|
class STTResponse(BaseModel):
    """Response model for Whisper speech-to-text transcription.

    Returned after the uploaded audio has been transcribed by the
    configured STT model.
    """

    # Example payload surfaced in the generated OpenAPI schema.
    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "text": "hello how are you",
                "model_name": "openai/whisper-large-v3",
                "language": "en",
                "duration_seconds": 3.2,
            }
        }
    )

    # Transcribed text produced from the input audio.
    text: str = Field(..., description="Transcribed text from the input audio")
    # Identifier of the STT model that performed inference.
    model_name: str = Field(..., description="STT model used for inference")
    # Detected language code (e.g. "en"); None when not detected/reported.
    language: Optional[str] = Field(None, description="Detected language")
    # Approximate input audio length; None when unknown.
    duration_seconds: Optional[float] = Field(
        None,
        description="Approximate audio duration in seconds",
    )
| |
|
| |
|
| | |
| | |
| | |
| |
|
class TTSRequest(BaseModel):
    """Request body carrying the text to be converted to speech."""

    # Example payload surfaced in the generated OpenAPI schema.
    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "text": "Hello, welcome to our AI system."
            }
        }
    )

    # Bounded to 1-500 characters: rejects empty input and oversized payloads.
    text: str = Field(
        ..., min_length=1, max_length=500,
        description="Text that will be converted into speech",
    )
| |
|
| |
|
class TTSResponse(BaseModel):
    """Metadata response describing a completed TTS generation."""

    # Example payload surfaced in the generated OpenAPI schema.
    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "message": "Audio generated successfully",
                "audio_format": "wav",
                "length_seconds": 2.5,
                "model_name": "suno/bark",
            }
        }
    )

    # Field descriptions added for parity with STTResponse/TTSRequest,
    # so the OpenAPI schema documents every attribute.
    # Human-readable status message.
    message: str = Field(..., description="Human-readable generation status message")
    # Container/encoding of the generated audio (e.g. "wav").
    audio_format: str = Field(..., description="Audio format of the generated output")
    # Generated audio duration; None when not measured.
    length_seconds: Optional[float] = Field(
        None, description="Approximate generated audio duration in seconds"
    )
    # Identifier of the TTS model that performed synthesis.
    model_name: str = Field(..., description="TTS model used for inference")
| |
|