zeroclaw_channels/
tts.rs

1//! Multi-provider Text-to-Speech (TTS) subsystem.
2//!
3//! Supports OpenAI, ElevenLabs, Google Cloud TTS, Edge TTS (free, subprocess-based),
4//! and Piper TTS (local GPU-accelerated, OpenAI-compatible endpoint).
5//!
//! Per-instance configs live under `[tts_providers.<type>.<alias>]`; agents
//! pick which instance to use via the `tts_provider` dotted alias reference.
8//! Global runtime knobs (default_voice, max_text_length, etc.) live on `[tts]`.
9
10use std::collections::HashMap;
11
12use anyhow::{Context, Result, bail};
13
14use zeroclaw_config::schema::{Config, TtsProviderConfig};
15
/// Fallback cap on input length, in characters. `TtsManager::from_config`
/// applies it when `config.tts.max_text_length` is `0`.
const DEFAULT_MAX_TEXT_LENGTH: usize = 4_096;

/// Deadline for a single TTS operation: used as the request timeout of every
/// HTTP client built in this file and as the Edge TTS subprocess timeout.
const TTS_HTTP_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(60);
21
22// ── TtsProvider trait ────────────────────────────────────────────
23
/// Trait for pluggable TTS backends.
///
/// Implementors must be `Send + Sync` and must also implement
/// `Attributable`; `TtsManager` stores them as `Box<dyn TtsProvider>`.
#[async_trait::async_trait]
pub trait TtsProvider: Send + Sync + ::zeroclaw_api::attribution::Attributable {
    /// Provider family identifier (e.g. `"openai"`, `"elevenlabs"`).
    fn name(&self) -> &str;

    /// Synthesize `text` using the given `voice`, returning raw audio bytes.
    ///
    /// # Errors
    /// Returns an error when the backend call fails or its output cannot be
    /// read or decoded.
    async fn synthesize(&self, text: &str, voice: &str) -> Result<Vec<u8>>;

    /// Voices supported by this provider. Providers whose voices cannot be
    /// enumerated statically (ElevenLabs, Piper) return an empty list.
    fn supported_voices(&self) -> Vec<String>;

    /// Audio output formats supported by this provider.
    fn supported_formats(&self) -> Vec<String>;
}
39
40// ── OpenAI TTS ───────────────────────────────────────────────────
41
/// OpenAI TTS provider (`POST /v1/audio/speech`).
pub struct OpenAiTtsProvider {
    /// Instance alias given to `new`; reported through `Attributable::alias`.
    alias: String,
    /// API key, sent as a bearer token on every request.
    api_key: String,
    /// Model name sent in the request body. Defaults to `"tts-1"`.
    model: String,
    /// `speed` value sent in the request body. Defaults to `1.0`.
    speed: f64,
    /// Full endpoint URL. Defaults to the OpenAI production endpoint; can be
    /// overridden via `[providers.tts.openai.<alias>].uri` to point at any
    /// OpenAI-compatible TTS backend (Groq, Azure, self-hosted proxies).
    base_url: String,
    /// Audio response format. Defaults to `"opus"`; override to `"wav"` for
    /// Orpheus-class models or `"mp3"` for broader compatibility.
    response_format: String,
    /// HTTP client whose request timeout is `TTS_HTTP_TIMEOUT`.
    client: reqwest::Client,
}
57
58impl OpenAiTtsProvider {
59    /// Create a new OpenAI TTS model_provider from config. Reads
60    /// `[tts_providers.openai.<alias>].api_key` (or via the schema-mirror
61    /// env grammar). Legacy `OPENAI_API_KEY` env-var fallback eradicated
62    /// in V0.8.0.
63    pub fn new(alias: &str, config: &TtsProviderConfig) -> Result<Self> {
64        let api_key = config
65            .api_key
66            .as_deref()
67            .map(str::trim)
68            .filter(|k| !k.is_empty())
69            .map(ToOwned::to_owned)
70            .context(
71                "Missing OpenAI TTS API key: set `[tts_providers.openai.<alias>].api_key` (or via \
72                 `ZEROCLAW_providers__tts__openai__<alias>__api_key=...`).",
73            )?;
74
75        Ok(Self {
76            alias: alias.to_string(),
77            api_key,
78            model: config
79                .model
80                .clone()
81                .filter(|m| !m.trim().is_empty())
82                .unwrap_or_else(|| "tts-1".to_string()),
83            speed: config.speed.unwrap_or(1.0),
84            base_url: config
85                .uri
86                .clone()
87                .filter(|u| !u.trim().is_empty())
88                .unwrap_or_else(|| "https://api.openai.com/v1/audio/speech".to_string()),
89            response_format: config
90                .response_format
91                .clone()
92                .filter(|f| !f.trim().is_empty())
93                .unwrap_or_else(|| "opus".to_string()),
94            client: reqwest::Client::builder()
95                .timeout(TTS_HTTP_TIMEOUT)
96                .build()
97                .context("Failed to build HTTP client for OpenAI TTS")?,
98        })
99    }
100}
101
102#[async_trait::async_trait]
103impl TtsProvider for OpenAiTtsProvider {
104    fn name(&self) -> &str {
105        "openai"
106    }
107
108    async fn synthesize(&self, text: &str, voice: &str) -> Result<Vec<u8>> {
109        let body = serde_json::json!({
110            "model": self.model,
111            "input": text,
112            "voice": voice,
113            "speed": self.speed,
114            "response_format": self.response_format,
115        });
116
117        let resp = self
118            .client
119            .post(&self.base_url)
120            .bearer_auth(&self.api_key)
121            .json(&body)
122            .send()
123            .await
124            .context("Failed to send OpenAI TTS request")?;
125
126        let status = resp.status();
127        if !status.is_success() {
128            let error_body: serde_json::Value = resp
129                .json()
130                .await
131                .unwrap_or_else(|_| serde_json::json!({"error": "unknown"}));
132            let msg = error_body["error"]["message"]
133                .as_str()
134                .unwrap_or("unknown error");
135            bail!("OpenAI TTS API error ({}): {}", status, msg);
136        }
137
138        let bytes = resp
139            .bytes()
140            .await
141            .context("Failed to read OpenAI TTS response body")?;
142        Ok(bytes.to_vec())
143    }
144
145    fn supported_voices(&self) -> Vec<String> {
146        ["alloy", "echo", "fable", "onyx", "nova", "shimmer"]
147            .iter()
148            .map(|s| (*s).to_string())
149            .collect()
150    }
151
152    fn supported_formats(&self) -> Vec<String> {
153        ["mp3", "opus", "aac", "flac", "wav", "pcm"]
154            .iter()
155            .map(|s| (*s).to_string())
156            .collect()
157    }
158}
159
160// ── ElevenLabs TTS ───────────────────────────────────────────────
161
/// ElevenLabs TTS provider (`POST /v1/text-to-speech/{voice_id}`).
pub struct ElevenLabsTtsProvider {
    /// Instance alias given to `new`; reported through `Attributable::alias`.
    alias: String,
    /// API key, sent in the `xi-api-key` request header.
    api_key: String,
    /// Value sent as `model_id`. Defaults to `"eleven_monolingual_v1"`.
    model_id: String,
    /// Value sent as `voice_settings.stability`. Defaults to `0.5`.
    stability: f64,
    /// Value sent as `voice_settings.similarity_boost`. Defaults to `0.5`.
    similarity_boost: f64,
    /// HTTP client whose request timeout is `TTS_HTTP_TIMEOUT`.
    client: reqwest::Client,
}
171
172impl ElevenLabsTtsProvider {
173    /// Create a new ElevenLabs TTS model_provider from config. Reads
174    /// `[tts_providers.elevenlabs.<alias>].api_key`. Legacy
175    /// `ELEVENLABS_API_KEY` env-var fallback eradicated in V0.8.0.
176    pub fn new(alias: &str, config: &TtsProviderConfig) -> Result<Self> {
177        let api_key = config
178            .api_key
179            .as_deref()
180            .map(str::trim)
181            .filter(|k| !k.is_empty())
182            .map(ToOwned::to_owned)
183            .context(
184                "Missing ElevenLabs API key: set `[tts_providers.elevenlabs.<alias>].api_key` (or \
185                 via `ZEROCLAW_providers__tts__elevenlabs__<alias>__api_key=...`).",
186            )?;
187
188        Ok(Self {
189            alias: alias.to_string(),
190            api_key,
191            model_id: config
192                .model
193                .clone()
194                .filter(|m| !m.trim().is_empty())
195                .unwrap_or_else(|| "eleven_monolingual_v1".to_string()),
196            stability: config.stability.unwrap_or(0.5),
197            similarity_boost: config.similarity_boost.unwrap_or(0.5),
198            client: reqwest::Client::builder()
199                .timeout(TTS_HTTP_TIMEOUT)
200                .build()
201                .context("Failed to build HTTP client for ElevenLabs TTS")?,
202        })
203    }
204}
205
206#[async_trait::async_trait]
207impl TtsProvider for ElevenLabsTtsProvider {
208    fn name(&self) -> &str {
209        "elevenlabs"
210    }
211
212    async fn synthesize(&self, text: &str, voice: &str) -> Result<Vec<u8>> {
213        if !voice
214            .chars()
215            .all(|c| c.is_ascii_alphanumeric() || c == '-' || c == '_')
216        {
217            bail!("ElevenLabs voice ID contains invalid characters: {voice}");
218        }
219        let url = format!("https://api.elevenlabs.io/v1/text-to-speech/{voice}");
220        let body = serde_json::json!({
221            "text": text,
222            "model_id": self.model_id,
223            "voice_settings": {
224                "stability": self.stability,
225                "similarity_boost": self.similarity_boost,
226            },
227        });
228
229        let resp = self
230            .client
231            .post(&url)
232            .header("xi-api-key", &self.api_key)
233            .json(&body)
234            .send()
235            .await
236            .context("Failed to send ElevenLabs TTS request")?;
237
238        let status = resp.status();
239        if !status.is_success() {
240            let error_body: serde_json::Value = resp
241                .json()
242                .await
243                .unwrap_or_else(|_| serde_json::json!({"error": "unknown"}));
244            let msg = error_body["detail"]["message"]
245                .as_str()
246                .or_else(|| error_body["detail"].as_str())
247                .unwrap_or("unknown error");
248            bail!("ElevenLabs TTS API error ({}): {}", status, msg);
249        }
250
251        let bytes = resp
252            .bytes()
253            .await
254            .context("Failed to read ElevenLabs TTS response body")?;
255        Ok(bytes.to_vec())
256    }
257
258    fn supported_voices(&self) -> Vec<String> {
259        // ElevenLabs voices are user-specific; return empty (dynamic lookup).
260        Vec::new()
261    }
262
263    fn supported_formats(&self) -> Vec<String> {
264        ["mp3", "pcm", "ulaw"]
265            .iter()
266            .map(|s| (*s).to_string())
267            .collect()
268    }
269}
270
271// ── Google Cloud TTS ─────────────────────────────────────────────
272
/// Google Cloud TTS provider (`POST /v1/text:synthesize`).
pub struct GoogleTtsProvider {
    /// Instance alias given to `new`; reported through `Attributable::alias`.
    alias: String,
    /// API key, sent in the `x-goog-api-key` request header.
    api_key: String,
    /// Value sent as `voice.languageCode`. Defaults to `"en-US"`.
    language_code: String,
    /// HTTP client whose request timeout is `TTS_HTTP_TIMEOUT`.
    client: reqwest::Client,
}
280
281impl GoogleTtsProvider {
282    /// Create a new Google Cloud TTS model_provider from config, resolving the API key
283    /// from `[tts_providers.google.<alias>].api_key`. Legacy
284    /// `GOOGLE_TTS_API_KEY` env-var fallback eradicated in V0.8.0.
285    pub fn new(alias: &str, config: &TtsProviderConfig) -> Result<Self> {
286        let api_key = config
287            .api_key
288            .as_deref()
289            .map(str::trim)
290            .filter(|k| !k.is_empty())
291            .map(ToOwned::to_owned)
292            .context(
293                "Missing Google TTS API key: set `[tts_providers.google.<alias>].api_key` (or via \
294                 `ZEROCLAW_providers__tts__google__<alias>__api_key=...`).",
295            )?;
296
297        Ok(Self {
298            alias: alias.to_string(),
299            api_key,
300            language_code: config
301                .language_code
302                .clone()
303                .filter(|c| !c.trim().is_empty())
304                .unwrap_or_else(|| "en-US".to_string()),
305            client: reqwest::Client::builder()
306                .timeout(TTS_HTTP_TIMEOUT)
307                .build()
308                .context("Failed to build HTTP client for Google TTS")?,
309        })
310    }
311}
312
313#[async_trait::async_trait]
314impl TtsProvider for GoogleTtsProvider {
315    fn name(&self) -> &str {
316        "google"
317    }
318
319    async fn synthesize(&self, text: &str, voice: &str) -> Result<Vec<u8>> {
320        let url = "https://texttospeech.googleapis.com/v1/text:synthesize";
321        let body = serde_json::json!({
322            "input": { "text": text },
323            "voice": {
324                "languageCode": self.language_code,
325                "name": voice,
326            },
327            "audioConfig": {
328                "audioEncoding": "MP3",
329            },
330        });
331
332        let resp = self
333            .client
334            .post(url)
335            .header("x-goog-api-key", &self.api_key)
336            .json(&body)
337            .send()
338            .await
339            .context("Failed to send Google TTS request")?;
340
341        let status = resp.status();
342        let resp_body: serde_json::Value = resp
343            .json()
344            .await
345            .context("Failed to parse Google TTS response")?;
346
347        if !status.is_success() {
348            let msg = resp_body["error"]["message"]
349                .as_str()
350                .unwrap_or("unknown error");
351            bail!("Google TTS API error ({}): {}", status, msg);
352        }
353
354        let audio_b64 = resp_body["audioContent"]
355            .as_str()
356            .context("Google TTS response missing 'audioContent' field")?;
357
358        use base64::Engine;
359        let bytes = base64::engine::general_purpose::STANDARD
360            .decode(audio_b64)
361            .context("Failed to decode Google TTS base64 audio")?;
362        Ok(bytes)
363    }
364
365    fn supported_voices(&self) -> Vec<String> {
366        // Google voices vary by language; return common English defaults.
367        [
368            "en-US-Standard-A",
369            "en-US-Standard-B",
370            "en-US-Standard-C",
371            "en-US-Standard-D",
372        ]
373        .iter()
374        .map(|s| (*s).to_string())
375        .collect()
376    }
377
378    fn supported_formats(&self) -> Vec<String> {
379        ["mp3", "wav", "ogg"]
380            .iter()
381            .map(|s| (*s).to_string())
382            .collect()
383    }
384}
385
386// ── Edge TTS (subprocess) ────────────────────────────────────────
387
/// Edge TTS provider — free, uses the `edge-tts` CLI subprocess.
pub struct EdgeTtsProvider {
    /// Instance alias given to `new`; reported through `Attributable::alias`.
    alias: String,
    /// Command name to execute. `new` only accepts a bare name listed in
    /// `ALLOWED_BINARIES`.
    binary_path: String,
}
393
394impl EdgeTtsProvider {
395    /// Allowed basenames for the Edge TTS binary.
396    const ALLOWED_BINARIES: &[&str] = &["edge-tts", "edge-playback"];
397
398    /// Create a new Edge TTS model_provider from config.
399    ///
400    /// `binary_path` must be a bare command name (no path separators) matching
401    /// one of `ALLOWED_BINARIES`. This prevents arbitrary executable
402    /// paths like `/tmp/malicious/edge-tts` from passing the basename check.
403    pub fn new(alias: &str, config: &TtsProviderConfig) -> Result<Self> {
404        let raw_path = config
405            .binary_path
406            .clone()
407            .filter(|p| !p.trim().is_empty())
408            .unwrap_or_else(|| "edge-tts".to_string());
409        if raw_path.contains('/') || raw_path.contains('\\') {
410            bail!(
411                "Edge TTS binary_path must be a bare command name without path separators, got: {raw_path}"
412            );
413        }
414        if !Self::ALLOWED_BINARIES.contains(&raw_path.as_str()) {
415            bail!(
416                "Edge TTS binary_path must be one of {:?}, got: {raw_path}",
417                Self::ALLOWED_BINARIES,
418            );
419        }
420        Ok(Self {
421            alias: alias.to_string(),
422            binary_path: raw_path,
423        })
424    }
425}
426
427#[async_trait::async_trait]
428impl TtsProvider for EdgeTtsProvider {
429    fn name(&self) -> &str {
430        "edge"
431    }
432
433    async fn synthesize(&self, text: &str, voice: &str) -> Result<Vec<u8>> {
434        let temp_dir = std::env::temp_dir();
435        let output_file = temp_dir.join(format!("zeroclaw_tts_{}.mp3", uuid::Uuid::new_v4()));
436        let output_path = output_file
437            .to_str()
438            .context("Failed to build temp file path for Edge TTS")?;
439
440        let output = tokio::time::timeout(
441            TTS_HTTP_TIMEOUT,
442            tokio::process::Command::new(&self.binary_path)
443                .arg("--text")
444                .arg(text)
445                .arg("--voice")
446                .arg(voice)
447                .arg("--write-media")
448                .arg(output_path)
449                .output(),
450        )
451        .await
452        .context("Edge TTS subprocess timed out")?
453        .context("Failed to spawn edge-tts subprocess")?;
454
455        if !output.status.success() {
456            let stderr = String::from_utf8_lossy(&output.stderr);
457            // Clean up temp file on failure.
458            let _ = tokio::fs::remove_file(&output_file).await;
459            bail!("edge-tts failed (exit {}): {}", output.status, stderr);
460        }
461
462        let bytes = tokio::fs::read(&output_file)
463            .await
464            .context("Failed to read edge-tts output file")?;
465
466        // Clean up temp file.
467        let _ = tokio::fs::remove_file(&output_file).await;
468
469        Ok(bytes)
470    }
471
472    fn supported_voices(&self) -> Vec<String> {
473        // Edge TTS has many voices; return common defaults.
474        [
475            "en-US-AriaNeural",
476            "en-US-GuyNeural",
477            "en-US-JennyNeural",
478            "en-GB-SoniaNeural",
479        ]
480        .iter()
481        .map(|s| (*s).to_string())
482        .collect()
483    }
484
485    fn supported_formats(&self) -> Vec<String> {
486        vec!["mp3".to_string()]
487    }
488}
489
490// ── Piper TTS (local, OpenAI-compatible) ─────────────────────────
491
/// Piper TTS provider — local GPU-accelerated server with an OpenAI-compatible endpoint.
pub struct PiperTtsProvider {
    /// Instance alias given to `new`; reported through `Attributable::alias`.
    alias: String,
    /// HTTP client whose request timeout is `TTS_HTTP_TIMEOUT`.
    client: reqwest::Client,
    /// Full endpoint URL that synthesis requests are POSTed to.
    api_url: String,
}
498
499impl PiperTtsProvider {
500    /// Create a new Piper TTS model_provider from config. Falls back to
501    /// `http://127.0.0.1:5000/v1/audio/speech` when no `api_url` is supplied.
502    pub fn new(alias: &str, config: &TtsProviderConfig) -> Self {
503        let api_url = config
504            .uri
505            .clone()
506            .filter(|u| !u.trim().is_empty())
507            .unwrap_or_else(|| "http://127.0.0.1:5000/v1/audio/speech".to_string());
508        Self {
509            alias: alias.to_string(),
510            client: reqwest::Client::builder()
511                .timeout(TTS_HTTP_TIMEOUT)
512                .build()
513                .expect("Failed to build HTTP client for Piper TTS"),
514            api_url,
515        }
516    }
517}
518
519#[async_trait::async_trait]
520impl TtsProvider for PiperTtsProvider {
521    fn name(&self) -> &str {
522        "piper"
523    }
524
525    async fn synthesize(&self, text: &str, voice: &str) -> Result<Vec<u8>> {
526        let body = serde_json::json!({
527            "model": "tts-1",
528            "input": text,
529            "voice": voice,
530        });
531
532        let resp = self
533            .client
534            .post(&self.api_url)
535            .json(&body)
536            .send()
537            .await
538            .context("Failed to send Piper TTS request")?;
539
540        let status = resp.status();
541        if !status.is_success() {
542            let error_body: serde_json::Value = resp
543                .json()
544                .await
545                .unwrap_or_else(|_| serde_json::json!({"error": "unknown"}));
546            let msg = error_body["error"]["message"]
547                .as_str()
548                .unwrap_or("unknown error");
549            bail!("Piper TTS API error ({}): {}", status, msg);
550        }
551
552        let bytes = resp
553            .bytes()
554            .await
555            .context("Failed to read Piper TTS response body")?;
556        Ok(bytes.to_vec())
557    }
558
559    fn supported_voices(&self) -> Vec<String> {
560        // Piper voices depend on installed models; return empty (dynamic).
561        Vec::new()
562    }
563
564    fn supported_formats(&self) -> Vec<String> {
565        ["mp3", "wav", "opus"]
566            .iter()
567            .map(|s| (*s).to_string())
568            .collect()
569    }
570}
571
572// ── TtsManager ───────────────────────────────────────────────────
573
/// Central manager for per-agent TTS synthesis.
///
/// `tts_providers` are keyed by their dotted alias (`<type>.<alias>`).
/// Per-instance voice overrides come from the `voice` field on each
/// `TtsProviderConfig`. The `agent_tts_provider` field carries the
/// resolved alias for the agent that owns this manager instance — empty
/// means the agent doesn't want TTS, and `synthesize` fails loud rather
/// than silently pick a default.
pub struct TtsManager {
    /// Successfully constructed providers, keyed by dotted alias.
    tts_providers: HashMap<String, Box<dyn TtsProvider>>,
    /// Voice override per dotted alias; present only for instances whose
    /// `voice` is set and non-blank.
    voice_by_alias: HashMap<String, String>,
    /// Resolved alias for the agent that owns this manager. Empty when
    /// the agent has no TTS preference (opt-out).
    agent_tts_provider: String,
    /// Voice used by `synthesize` when the selected alias has no override.
    default_voice: String,
    /// Maximum accepted input length, counted in `char`s.
    max_text_length: usize,
}
591
592impl TtsManager {
593    /// Build a `TtsManager` from `[tts_providers.<type>.<alias>]` instances
594    /// in `Config`. Each instance is registered under its dotted alias key
595    /// (`<type>.<alias>`). Failures to construct a particular instance are
596    /// logged at warn but do not abort the manager.
597    /// Build a `TtsManager` from `[tts_providers.<type>.<alias>]` instances.
598    /// The manager's resolved alias comes from the runtime-active agent's
599    /// `tts_provider` field — there is no global default-provider concept,
600    /// so when no agent-bound resolution is available the manager refuses
601    /// to silently pick a provider (`synthesize` fails loud).
602    pub fn from_config(config: &Config) -> Result<Self> {
603        let mut tts_providers: HashMap<String, Box<dyn TtsProvider>> = HashMap::new();
604        let mut voice_by_alias: HashMap<String, String> = HashMap::new();
605
606        // Typed dispatch over the TtsProviders container's named slots. The
607        // unknown-type warn-and-skip arm is gone — the typed container can't
608        // hold an unrecognized family.
609        for (family, alias, instance) in config.providers.tts.iter_entries() {
610            let dotted = format!("{family}.{alias}");
611            let result: Result<Box<dyn TtsProvider>> = match family {
612                "openai" => OpenAiTtsProvider::new(alias, instance).map(|p| Box::new(p) as _),
613                "elevenlabs" => {
614                    ElevenLabsTtsProvider::new(alias, instance).map(|p| Box::new(p) as _)
615                }
616                "google" => GoogleTtsProvider::new(alias, instance).map(|p| Box::new(p) as _),
617                "edge" => EdgeTtsProvider::new(alias, instance).map(|p| Box::new(p) as _),
618                "piper" => Ok(Box::new(PiperTtsProvider::new(alias, instance)) as _),
619                _ => unreachable!("TtsProviders typed slots cover all 5 families"),
620            };
621            match result {
622                Ok(p) => {
623                    tts_providers.insert(dotted.clone(), p);
624                    if let Some(voice) = instance
625                        .voice
626                        .as_deref()
627                        .map(str::trim)
628                        .filter(|v| !v.is_empty())
629                    {
630                        voice_by_alias.insert(dotted, voice.to_string());
631                    }
632                }
633                Err(e) => {
634                    ::zeroclaw_log::record!(
635                        WARN,
636                        ::zeroclaw_log::Event::new(module_path!(), ::zeroclaw_log::Action::Note)
637                            .with_outcome(::zeroclaw_log::EventOutcome::Unknown)
638                            .with_attrs(
639                                ::serde_json::json!({"error": format!("{}", e), "dotted": dotted})
640                            ),
641                        "Skipping TTS provider"
642                    );
643                }
644            }
645        }
646
647        let max_text_length = if config.tts.max_text_length == 0 {
648            DEFAULT_MAX_TEXT_LENGTH
649        } else {
650            config.tts.max_text_length
651        };
652
653        // Per-agent join: the runtime-active agent's `tts_provider` is the
654        // resolved alias for this manager instance. Empty (or no resolved
655        // agent) = no TTS; `synthesize` fails loud rather than silently
656        // pick a provider.
657        let agent_tts_provider = config
658            .resolved_runtime_agent_alias()
659            .and_then(|alias| config.agents.get(alias))
660            .map(|a| a.tts_provider.as_str().to_string())
661            .unwrap_or_default();
662
663        Ok(Self {
664            tts_providers,
665            voice_by_alias,
666            agent_tts_provider,
667            default_voice: config.tts.default_voice.clone(),
668            max_text_length,
669        })
670    }
671
672    /// Synthesize text using the runtime-active agent's resolved
673    /// `tts_provider` reference and the per-instance voice override (or
674    /// `default_voice` as the per-instance fallback). Fails loud when the
675    /// agent has no `tts_provider` configured — there is no global
676    /// default-provider concept and this manager refuses to silently pick
677    /// one.
678    pub async fn synthesize(&self, text: &str) -> Result<Vec<u8>> {
679        let provider_alias = self.agent_tts_provider.as_str();
680        if provider_alias.is_empty() {
681            bail!(
682                "Agent has no tts_provider configured. Set \
683                 `agent.<alias>.tts_provider = \"<type>.<alias>\"` referencing a \
684                 [tts_providers.<type>.<alias>] entry."
685            );
686        }
687        let voice = self
688            .voice_by_alias
689            .get(provider_alias)
690            .map_or(self.default_voice.as_str(), String::as_str);
691        self.synthesize_with_provider(text, provider_alias, voice)
692            .await
693    }
694
695    /// Synthesize text using a specific dotted-alias model_provider and voice.
696    pub async fn synthesize_with_provider(
697        &self,
698        text: &str,
699        provider_alias: &str,
700        voice: &str,
701    ) -> Result<Vec<u8>> {
702        if text.is_empty() {
703            bail!("TTS text must not be empty");
704        }
705        let char_count = text.chars().count();
706        if char_count > self.max_text_length {
707            bail!(
708                "TTS text too long ({} chars, max {})",
709                char_count,
710                self.max_text_length
711            );
712        }
713
714        let tts = self.tts_providers.get(provider_alias).ok_or_else(|| {
715            let available = self.available_providers().join(", ");
716            ::zeroclaw_log::record!(
717                ERROR,
718                ::zeroclaw_log::Event::new(module_path!(), ::zeroclaw_log::Action::Reject)
719                    .with_outcome(::zeroclaw_log::EventOutcome::Failure)
720                    .with_attrs(::serde_json::json!({
721                        "tts_provider": provider_alias,
722                        "available": available,
723                    })),
724                "tts: provider not configured"
725            );
726            anyhow::Error::msg(format!(
727                "TTS model_provider '{}' not configured (available: {})",
728                provider_alias, available
729            ))
730        })?;
731
732        use ::zeroclaw_log::Instrument;
733        let span = ::zeroclaw_log::attribution_span!(tts.as_ref());
734        ::zeroclaw_log::scope!(voice: voice, => tts.synthesize(text, voice))
735            .instrument(span)
736            .await
737    }
738
739    /// List dotted aliases of all initialized tts_providers.
740    pub fn available_providers(&self) -> Vec<String> {
741        let mut names: Vec<_> = self.tts_providers.keys().cloned().collect();
742        names.sort();
743        names
744    }
745}
746
// ── Attribution ──────────────────────────────────────────────────
748
749impl ::zeroclaw_api::attribution::Attributable for OpenAiTtsProvider {
750    fn role(&self) -> ::zeroclaw_api::attribution::Role {
751        ::zeroclaw_api::attribution::Role::Provider(::zeroclaw_api::attribution::ProviderKind::Tts(
752            ::zeroclaw_api::attribution::TtsProviderKind::OpenAi,
753        ))
754    }
755    fn alias(&self) -> &str {
756        &self.alias
757    }
758}
759
760impl ::zeroclaw_api::attribution::Attributable for ElevenLabsTtsProvider {
761    fn role(&self) -> ::zeroclaw_api::attribution::Role {
762        ::zeroclaw_api::attribution::Role::Provider(::zeroclaw_api::attribution::ProviderKind::Tts(
763            ::zeroclaw_api::attribution::TtsProviderKind::ElevenLabs,
764        ))
765    }
766    fn alias(&self) -> &str {
767        &self.alias
768    }
769}
770
771impl ::zeroclaw_api::attribution::Attributable for GoogleTtsProvider {
772    fn role(&self) -> ::zeroclaw_api::attribution::Role {
773        ::zeroclaw_api::attribution::Role::Provider(::zeroclaw_api::attribution::ProviderKind::Tts(
774            ::zeroclaw_api::attribution::TtsProviderKind::Google,
775        ))
776    }
777    fn alias(&self) -> &str {
778        &self.alias
779    }
780}
781
782impl ::zeroclaw_api::attribution::Attributable for EdgeTtsProvider {
783    fn role(&self) -> ::zeroclaw_api::attribution::Role {
784        ::zeroclaw_api::attribution::Role::Provider(::zeroclaw_api::attribution::ProviderKind::Tts(
785            ::zeroclaw_api::attribution::TtsProviderKind::Edge,
786        ))
787    }
788    fn alias(&self) -> &str {
789        &self.alias
790    }
791}
792
793impl ::zeroclaw_api::attribution::Attributable for PiperTtsProvider {
794    fn role(&self) -> ::zeroclaw_api::attribution::Role {
795        ::zeroclaw_api::attribution::Role::Provider(::zeroclaw_api::attribution::ProviderKind::Tts(
796            ::zeroclaw_api::attribution::TtsProviderKind::Piper,
797        ))
798    }
799    fn alias(&self) -> &str {
800        &self.alias
801    }
802}
803
804#[cfg(test)]
805mod tests {
806    use super::*;
807
808    fn config_with_edge_alias() -> Config {
809        let mut cfg = Config::default();
810        cfg.agents.insert(
811            "default".into(),
812            zeroclaw_config::schema::AliasedAgentConfig {
813                tts_provider: "edge.default".into(),
814                ..Default::default()
815            },
816        );
817        cfg.providers.tts.edge.insert(
818            "default".to_string(),
819            zeroclaw_config::schema::EdgeTtsProviderConfig {
820                base: TtsProviderConfig {
821                    binary_path: Some("edge-tts".to_string()),
822                    ..TtsProviderConfig::default()
823                },
824            },
825        );
826        cfg
827    }
828
829    fn config_with_piper_alias() -> Config {
830        let mut cfg = Config::default();
831        cfg.agents.insert(
832            "default".into(),
833            zeroclaw_config::schema::AliasedAgentConfig {
834                tts_provider: "piper.default".into(),
835                ..Default::default()
836            },
837        );
838        cfg.providers.tts.piper.insert(
839            "default".to_string(),
840            zeroclaw_config::schema::PiperTtsProviderConfig {
841                base: TtsProviderConfig {
842                    uri: Some("http://127.0.0.1:5000/v1/audio/speech".to_string()),
843                    ..TtsProviderConfig::default()
844                },
845            },
846        );
847        cfg
848    }
849
850    #[test]
851    fn tts_manager_creation_with_defaults() {
852        let config = Config::default();
853        let manager = TtsManager::from_config(&config).unwrap();
854        assert!(manager.available_providers().is_empty());
855    }
856
857    #[test]
858    fn tts_manager_registers_alias_keyed_provider() {
859        let cfg = config_with_edge_alias();
860        let manager = TtsManager::from_config(&cfg).unwrap();
861        assert_eq!(manager.available_providers(), vec!["edge.default"]);
862    }
863
864    #[tokio::test]
865    async fn tts_rejects_empty_text() {
866        let cfg = config_with_edge_alias();
867        let manager = TtsManager::from_config(&cfg).unwrap();
868        let err = manager
869            .synthesize_with_provider("", "edge.default", "en-US-AriaNeural")
870            .await
871            .unwrap_err();
872        assert!(
873            err.to_string().contains("must not be empty"),
874            "expected empty-text error, got: {err}"
875        );
876    }
877
878    #[tokio::test]
879    async fn tts_rejects_text_exceeding_max_length() {
880        let mut cfg = config_with_edge_alias();
881        cfg.tts.max_text_length = 10;
882        let manager = TtsManager::from_config(&cfg).unwrap();
883        let long_text = "a".repeat(11);
884        let err = manager
885            .synthesize_with_provider(&long_text, "edge.default", "en-US-AriaNeural")
886            .await
887            .unwrap_err();
888        assert!(
889            err.to_string().contains("too long"),
890            "expected too-long error, got: {err}"
891        );
892    }
893
894    #[tokio::test]
895    async fn tts_rejects_unknown_provider() {
896        let cfg = Config::default();
897        let manager = TtsManager::from_config(&cfg).unwrap();
898        let err = manager
899            .synthesize_with_provider("hello", "nonexistent.alias", "voice")
900            .await
901            .unwrap_err();
902        assert!(
903            err.to_string().contains("not configured"),
904            "expected not-configured error, got: {err}"
905        );
906    }
907
908    #[test]
909    fn piper_provider_creation_uses_default_url_when_unset() {
910        let model_provider = PiperTtsProvider::new("test", &TtsProviderConfig::default());
911        assert_eq!(model_provider.name(), "piper");
912        assert_eq!(
913            model_provider.api_url,
914            "http://127.0.0.1:5000/v1/audio/speech"
915        );
916        assert_eq!(
917            model_provider.supported_formats(),
918            vec!["mp3", "wav", "opus"]
919        );
920        assert!(model_provider.supported_voices().is_empty());
921    }
922
923    #[test]
924    fn tts_manager_with_piper_alias() {
925        let cfg = config_with_piper_alias();
926        let manager = TtsManager::from_config(&cfg).unwrap();
927        assert_eq!(manager.available_providers(), vec!["piper.default"]);
928    }
929
930    #[tokio::test]
931    async fn tts_rejects_empty_text_for_piper() {
932        let cfg = config_with_piper_alias();
933        let manager = TtsManager::from_config(&cfg).unwrap();
934        let err = manager
935            .synthesize_with_provider("", "piper.default", "default")
936            .await
937            .unwrap_err();
938        assert!(
939            err.to_string().contains("must not be empty"),
940            "expected empty-text error, got: {err}"
941        );
942    }
943
944    #[test]
945    fn tts_config_defaults() {
946        let config = zeroclaw_config::schema::TtsConfig::default();
947        assert!(!config.enabled);
948        // TtsConfig has no global default-provider field; per-agent
949        // `tts_provider` is the only selector.
950        assert_eq!(config.default_voice, "alloy");
951        assert_eq!(config.default_format, "mp3");
952        assert_eq!(config.max_text_length, DEFAULT_MAX_TEXT_LENGTH);
953    }
954
955    #[test]
956    fn tts_manager_max_text_length_zero_uses_default() {
957        let mut cfg = Config::default();
958        cfg.tts.max_text_length = 0;
959        let manager = TtsManager::from_config(&cfg).unwrap();
960        assert_eq!(manager.max_text_length, DEFAULT_MAX_TEXT_LENGTH);
961    }
962
963    #[tokio::test]
964    async fn synthesize_posts_to_configured_uri_with_response_format() {
965        use wiremock::matchers::{method, path};
966        use wiremock::{Mock, MockServer, ResponseTemplate};
967
968        let server = MockServer::start().await;
969        Mock::given(method("POST"))
970            .and(path("/v1/audio/speech"))
971            .respond_with(ResponseTemplate::new(200).set_body_bytes(b"FAKE_WAV".to_vec()))
972            .mount(&server)
973            .await;
974
975        let cfg = TtsProviderConfig {
976            api_key: Some("sk-test".to_string()),
977            uri: Some(format!("{}/v1/audio/speech", server.uri())),
978            response_format: Some("wav".to_string()),
979            ..TtsProviderConfig::default()
980        };
981        let provider = OpenAiTtsProvider::new("test", &cfg).unwrap();
982
983        let audio = provider.synthesize("hello world", "hannah").await.unwrap();
984        assert_eq!(
985            audio, b"FAKE_WAV",
986            "synthesize should return the bytes served by the configured endpoint"
987        );
988
989        let reqs = server.received_requests().await.unwrap();
990        assert_eq!(
991            reqs.len(),
992            1,
993            "exactly one POST should reach the configured uri"
994        );
995        let body: serde_json::Value = serde_json::from_slice(&reqs[0].body).unwrap();
996        assert_eq!(
997            body["response_format"], "wav",
998            "configured response_format must reach the outgoing request body"
999        );
1000        assert_eq!(body["input"], "hello world");
1001        assert_eq!(body["voice"], "hannah");
1002        assert_eq!(body["model"], "tts-1");
1003    }
1004
1005    #[tokio::test]
1006    async fn synthesize_defaults_response_format_to_opus_when_unset() {
1007        use wiremock::matchers::{method, path};
1008        use wiremock::{Mock, MockServer, ResponseTemplate};
1009
1010        let server = MockServer::start().await;
1011        Mock::given(method("POST"))
1012            .and(path("/v1/audio/speech"))
1013            .respond_with(ResponseTemplate::new(200).set_body_bytes(b"AUDIO".to_vec()))
1014            .mount(&server)
1015            .await;
1016
1017        // uri points at the mock so we can inspect the body; response_format left unset.
1018        let cfg = TtsProviderConfig {
1019            api_key: Some("sk-test".to_string()),
1020            uri: Some(format!("{}/v1/audio/speech", server.uri())),
1021            ..TtsProviderConfig::default()
1022        };
1023        let provider = OpenAiTtsProvider::new("test", &cfg).unwrap();
1024        provider.synthesize("hi", "alloy").await.unwrap();
1025
1026        let reqs = server.received_requests().await.unwrap();
1027        let body: serde_json::Value = serde_json::from_slice(&reqs[0].body).unwrap();
1028        assert_eq!(
1029            body["response_format"], "opus",
1030            "unset response_format must default to opus in the outgoing request"
1031        );
1032    }
1033
1034    #[test]
1035    fn openai_defaults_to_production_endpoint_when_uri_unset() {
1036        let cfg = TtsProviderConfig {
1037            api_key: Some("sk-test".to_string()),
1038            ..TtsProviderConfig::default()
1039        };
1040        let provider = OpenAiTtsProvider::new("test", &cfg).unwrap();
1041        assert_eq!(provider.base_url, "https://api.openai.com/v1/audio/speech");
1042        assert_eq!(provider.response_format, "opus");
1043    }
1044}
zeroclaw_channels/tts.rs

zeroclaw_channels/
tts.rs