zeroclaw_channels/
tts.rs

1//! Multi-provider Text-to-Speech (TTS) subsystem.
2//!
3//! Supports OpenAI, ElevenLabs, Google Cloud TTS, Edge TTS (free, subprocess-based),
4//! and Piper TTS (local GPU-accelerated, OpenAI-compatible endpoint).
5//!
//! Per-instance configs live under `[tts_providers.<type>.<alias>]`; agents
7//! pick which instance to use via the `tts_provider` dotted alias reference.
8//! Global runtime knobs (default_voice, max_text_length, etc.) live on `[tts]`.
9
10use std::collections::HashMap;
11
12use anyhow::{Context, Result, bail};
13
14use zeroclaw_config::schema::{Config, TtsProviderConfig};
15
/// Fallback cap on input length, counted in characters, applied when the
/// configured `max_text_length` is zero.
const DEFAULT_MAX_TEXT_LENGTH: usize = 4_096;

/// Upper bound on a single TTS backend call: used as the HTTP client timeout
/// and as the deadline for the Edge TTS subprocess.
const TTS_HTTP_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(60);
21
22// ── TtsProvider trait ────────────────────────────────────────────
23
24/// Trait for pluggable TTS backends.
25#[async_trait::async_trait]
26pub trait TtsProvider: Send + Sync + ::zeroclaw_api::attribution::Attributable {
27    /// ModelProvider identifier (e.g. `"openai"`, `"elevenlabs"`).
28    fn name(&self) -> &str;
29
30    /// Synthesize `text` using the given `voice`, returning raw audio bytes.
31    async fn synthesize(&self, text: &str, voice: &str) -> Result<Vec<u8>>;
32
33    /// The audio container/codec of the bytes returned by `synthesize`
34    /// (e.g. `"opus"`, `"wav"`, `"mp3"`). Used by `TtsManager::synthesize_opus`
35    /// to decide whether transcoding is necessary — only `"opus"` skips it.
36    fn output_format(&self) -> &str;
37
38    /// Voices supported by this model_provider.
39    fn supported_voices(&self) -> Vec<String>;
40
41    /// Audio output formats supported by this model_provider.
42    fn supported_formats(&self) -> Vec<String>;
43}
44
45// ── OpenAI TTS ───────────────────────────────────────────────────
46
47/// OpenAI TTS model_provider (`POST /v1/audio/speech`).
48pub struct OpenAiTtsProvider {
49    alias: String,
50    api_key: String,
51    model: String,
52    speed: f64,
53    /// Full endpoint URL. Defaults to the OpenAI production endpoint; can be
54    /// overridden via `[providers.tts.openai.<alias>].uri` to point at any
55    /// OpenAI-compatible TTS backend (Groq, Azure, self-hosted proxies).
56    base_url: String,
57    /// Audio response format. Defaults to `"opus"`; override to `"wav"` for
58    /// Orpheus-class models or `"mp3"` for broader compatibility.
59    response_format: String,
60    client: reqwest::Client,
61}
62
63impl OpenAiTtsProvider {
64    /// Create a new OpenAI TTS model_provider from config. Reads
65    /// `[tts_providers.openai.<alias>].api_key` (or via the schema-mirror
66    /// env grammar). Legacy `OPENAI_API_KEY` env-var fallback eradicated
67    /// in V0.8.0.
68    pub fn new(alias: &str, config: &TtsProviderConfig) -> Result<Self> {
69        let api_key = config
70            .api_key
71            .as_deref()
72            .map(str::trim)
73            .filter(|k| !k.is_empty())
74            .map(ToOwned::to_owned)
75            .context(
76                "Missing OpenAI TTS API key: set `[tts_providers.openai.<alias>].api_key` (or via \
77                 `ZEROCLAW_providers__tts__openai__<alias>__api_key=...`).",
78            )?;
79
80        Ok(Self {
81            alias: alias.to_string(),
82            api_key,
83            model: config
84                .model
85                .clone()
86                .filter(|m| !m.trim().is_empty())
87                .unwrap_or_else(|| "tts-1".to_string()),
88            speed: config.speed.unwrap_or(1.0),
89            base_url: config
90                .uri
91                .clone()
92                .filter(|u| !u.trim().is_empty())
93                .unwrap_or_else(|| "https://api.openai.com/v1/audio/speech".to_string()),
94            response_format: config
95                .response_format
96                .clone()
97                .filter(|f| !f.trim().is_empty())
98                .unwrap_or_else(|| "opus".to_string()),
99            client: reqwest::Client::builder()
100                .timeout(TTS_HTTP_TIMEOUT)
101                .build()
102                .context("Failed to build HTTP client for OpenAI TTS")?,
103        })
104    }
105}
106
107#[async_trait::async_trait]
108impl TtsProvider for OpenAiTtsProvider {
109    fn name(&self) -> &str {
110        "openai"
111    }
112
113    fn output_format(&self) -> &str {
114        &self.response_format
115    }
116
117    async fn synthesize(&self, text: &str, voice: &str) -> Result<Vec<u8>> {
118        let body = serde_json::json!({
119            "model": self.model,
120            "input": text,
121            "voice": voice,
122            "speed": self.speed,
123            "response_format": self.response_format,
124        });
125
126        let resp = self
127            .client
128            .post(&self.base_url)
129            .bearer_auth(&self.api_key)
130            .json(&body)
131            .send()
132            .await
133            .context("Failed to send OpenAI TTS request")?;
134
135        let status = resp.status();
136        if !status.is_success() {
137            let error_body: serde_json::Value = resp
138                .json()
139                .await
140                .unwrap_or_else(|_| serde_json::json!({"error": "unknown"}));
141            let msg = error_body["error"]["message"]
142                .as_str()
143                .unwrap_or("unknown error");
144            bail!("OpenAI TTS API error ({}): {}", status, msg);
145        }
146
147        let bytes = resp
148            .bytes()
149            .await
150            .context("Failed to read OpenAI TTS response body")?;
151        Ok(bytes.to_vec())
152    }
153
154    fn supported_voices(&self) -> Vec<String> {
155        ["alloy", "echo", "fable", "onyx", "nova", "shimmer"]
156            .iter()
157            .map(|s| (*s).to_string())
158            .collect()
159    }
160
161    fn supported_formats(&self) -> Vec<String> {
162        ["mp3", "opus", "aac", "flac", "wav", "pcm"]
163            .iter()
164            .map(|s| (*s).to_string())
165            .collect()
166    }
167}
168
169// ── ElevenLabs TTS ───────────────────────────────────────────────
170
171/// ElevenLabs TTS model_provider (`POST /v1/text-to-speech/{voice_id}`).
172pub struct ElevenLabsTtsProvider {
173    alias: String,
174    api_key: String,
175    model_id: String,
176    stability: f64,
177    similarity_boost: f64,
178    client: reqwest::Client,
179}
180
181impl ElevenLabsTtsProvider {
182    /// Create a new ElevenLabs TTS model_provider from config. Reads
183    /// `[tts_providers.elevenlabs.<alias>].api_key`. Legacy
184    /// `ELEVENLABS_API_KEY` env-var fallback eradicated in V0.8.0.
185    pub fn new(alias: &str, config: &TtsProviderConfig) -> Result<Self> {
186        let api_key = config
187            .api_key
188            .as_deref()
189            .map(str::trim)
190            .filter(|k| !k.is_empty())
191            .map(ToOwned::to_owned)
192            .context(
193                "Missing ElevenLabs API key: set `[tts_providers.elevenlabs.<alias>].api_key` (or \
194                 via `ZEROCLAW_providers__tts__elevenlabs__<alias>__api_key=...`).",
195            )?;
196
197        Ok(Self {
198            alias: alias.to_string(),
199            api_key,
200            model_id: config
201                .model
202                .clone()
203                .filter(|m| !m.trim().is_empty())
204                .unwrap_or_else(|| "eleven_monolingual_v1".to_string()),
205            stability: config.stability.unwrap_or(0.5),
206            similarity_boost: config.similarity_boost.unwrap_or(0.5),
207            client: reqwest::Client::builder()
208                .timeout(TTS_HTTP_TIMEOUT)
209                .build()
210                .context("Failed to build HTTP client for ElevenLabs TTS")?,
211        })
212    }
213}
214
215#[async_trait::async_trait]
216impl TtsProvider for ElevenLabsTtsProvider {
217    fn output_format(&self) -> &str {
218        "mp3"
219    }
220    fn name(&self) -> &str {
221        "elevenlabs"
222    }
223
224    async fn synthesize(&self, text: &str, voice: &str) -> Result<Vec<u8>> {
225        if !voice
226            .chars()
227            .all(|c| c.is_ascii_alphanumeric() || c == '-' || c == '_')
228        {
229            bail!("ElevenLabs voice ID contains invalid characters: {voice}");
230        }
231        let url = format!("https://api.elevenlabs.io/v1/text-to-speech/{voice}");
232        let body = serde_json::json!({
233            "text": text,
234            "model_id": self.model_id,
235            "voice_settings": {
236                "stability": self.stability,
237                "similarity_boost": self.similarity_boost,
238            },
239        });
240
241        let resp = self
242            .client
243            .post(&url)
244            .header("xi-api-key", &self.api_key)
245            .json(&body)
246            .send()
247            .await
248            .context("Failed to send ElevenLabs TTS request")?;
249
250        let status = resp.status();
251        if !status.is_success() {
252            let error_body: serde_json::Value = resp
253                .json()
254                .await
255                .unwrap_or_else(|_| serde_json::json!({"error": "unknown"}));
256            let msg = error_body["detail"]["message"]
257                .as_str()
258                .or_else(|| error_body["detail"].as_str())
259                .unwrap_or("unknown error");
260            bail!("ElevenLabs TTS API error ({}): {}", status, msg);
261        }
262
263        let bytes = resp
264            .bytes()
265            .await
266            .context("Failed to read ElevenLabs TTS response body")?;
267        Ok(bytes.to_vec())
268    }
269
270    fn supported_voices(&self) -> Vec<String> {
271        // ElevenLabs voices are user-specific; return empty (dynamic lookup).
272        Vec::new()
273    }
274
275    fn supported_formats(&self) -> Vec<String> {
276        ["mp3", "pcm", "ulaw"]
277            .iter()
278            .map(|s| (*s).to_string())
279            .collect()
280    }
281}
282
283// ── Google Cloud TTS ─────────────────────────────────────────────
284
285/// Google Cloud TTS model_provider (`POST /v1/text:synthesize`).
286pub struct GoogleTtsProvider {
287    alias: String,
288    api_key: String,
289    language_code: String,
290    client: reqwest::Client,
291}
292
293impl GoogleTtsProvider {
294    /// Create a new Google Cloud TTS model_provider from config, resolving the API key
295    /// from `[tts_providers.google.<alias>].api_key`. Legacy
296    /// `GOOGLE_TTS_API_KEY` env-var fallback eradicated in V0.8.0.
297    pub fn new(alias: &str, config: &TtsProviderConfig) -> Result<Self> {
298        let api_key = config
299            .api_key
300            .as_deref()
301            .map(str::trim)
302            .filter(|k| !k.is_empty())
303            .map(ToOwned::to_owned)
304            .context(
305                "Missing Google TTS API key: set `[tts_providers.google.<alias>].api_key` (or via \
306                 `ZEROCLAW_providers__tts__google__<alias>__api_key=...`).",
307            )?;
308
309        Ok(Self {
310            alias: alias.to_string(),
311            api_key,
312            language_code: config
313                .language_code
314                .clone()
315                .filter(|c| !c.trim().is_empty())
316                .unwrap_or_else(|| "en-US".to_string()),
317            client: reqwest::Client::builder()
318                .timeout(TTS_HTTP_TIMEOUT)
319                .build()
320                .context("Failed to build HTTP client for Google TTS")?,
321        })
322    }
323}
324
325#[async_trait::async_trait]
326impl TtsProvider for GoogleTtsProvider {
327    fn output_format(&self) -> &str {
328        "mp3"
329    }
330
331    fn name(&self) -> &str {
332        "google"
333    }
334
335    async fn synthesize(&self, text: &str, voice: &str) -> Result<Vec<u8>> {
336        let url = "https://texttospeech.googleapis.com/v1/text:synthesize";
337        let body = serde_json::json!({
338            "input": { "text": text },
339            "voice": {
340                "languageCode": self.language_code,
341                "name": voice,
342            },
343            "audioConfig": {
344                "audioEncoding": "MP3",
345            },
346        });
347
348        let resp = self
349            .client
350            .post(url)
351            .header("x-goog-api-key", &self.api_key)
352            .json(&body)
353            .send()
354            .await
355            .context("Failed to send Google TTS request")?;
356
357        let status = resp.status();
358        let resp_body: serde_json::Value = resp
359            .json()
360            .await
361            .context("Failed to parse Google TTS response")?;
362
363        if !status.is_success() {
364            let msg = resp_body["error"]["message"]
365                .as_str()
366                .unwrap_or("unknown error");
367            bail!("Google TTS API error ({}): {}", status, msg);
368        }
369
370        let audio_b64 = resp_body["audioContent"]
371            .as_str()
372            .context("Google TTS response missing 'audioContent' field")?;
373
374        use base64::Engine;
375        let bytes = base64::engine::general_purpose::STANDARD
376            .decode(audio_b64)
377            .context("Failed to decode Google TTS base64 audio")?;
378        Ok(bytes)
379    }
380
381    fn supported_voices(&self) -> Vec<String> {
382        // Google voices vary by language; return common English defaults.
383        [
384            "en-US-Standard-A",
385            "en-US-Standard-B",
386            "en-US-Standard-C",
387            "en-US-Standard-D",
388        ]
389        .iter()
390        .map(|s| (*s).to_string())
391        .collect()
392    }
393
394    fn supported_formats(&self) -> Vec<String> {
395        ["mp3", "wav", "ogg"]
396            .iter()
397            .map(|s| (*s).to_string())
398            .collect()
399    }
400}
401
402// ── Edge TTS (subprocess) ────────────────────────────────────────
403
/// Edge TTS backend — free; synthesis is delegated to an `edge-tts` CLI
/// subprocess.
pub struct EdgeTtsProvider {
    /// Instance alias; presumably surfaced through the provider's
    /// `Attributable` impl like the other backends.
    alias: String,
    /// Bare command name of the executable to run.
    binary_path: String,
}
409
410impl EdgeTtsProvider {
411    /// Allowed basenames for the Edge TTS binary.
412    const ALLOWED_BINARIES: &[&str] = &["edge-tts", "edge-playback"];
413
414    /// Create a new Edge TTS model_provider from config.
415    ///
416    /// `binary_path` must be a bare command name (no path separators) matching
417    /// one of `ALLOWED_BINARIES`. This prevents arbitrary executable
418    /// paths like `/tmp/malicious/edge-tts` from passing the basename check.
419    pub fn new(alias: &str, config: &TtsProviderConfig) -> Result<Self> {
420        let raw_path = config
421            .binary_path
422            .clone()
423            .filter(|p| !p.trim().is_empty())
424            .unwrap_or_else(|| "edge-tts".to_string());
425        if raw_path.contains('/') || raw_path.contains('\\') {
426            bail!(
427                "Edge TTS binary_path must be a bare command name without path separators, got: {raw_path}"
428            );
429        }
430        if !Self::ALLOWED_BINARIES.contains(&raw_path.as_str()) {
431            bail!(
432                "Edge TTS binary_path must be one of {:?}, got: {raw_path}",
433                Self::ALLOWED_BINARIES,
434            );
435        }
436        Ok(Self {
437            alias: alias.to_string(),
438            binary_path: raw_path,
439        })
440    }
441}
442
443#[async_trait::async_trait]
444impl TtsProvider for EdgeTtsProvider {
445    fn output_format(&self) -> &str {
446        "mp3"
447    }
448
449    fn name(&self) -> &str {
450        "edge"
451    }
452
453    async fn synthesize(&self, text: &str, voice: &str) -> Result<Vec<u8>> {
454        let temp_dir = std::env::temp_dir();
455        let output_file = temp_dir.join(format!("zeroclaw_tts_{}.mp3", uuid::Uuid::new_v4()));
456        let output_path = output_file
457            .to_str()
458            .context("Failed to build temp file path for Edge TTS")?;
459
460        let output = tokio::time::timeout(
461            TTS_HTTP_TIMEOUT,
462            tokio::process::Command::new(&self.binary_path)
463                .arg("--text")
464                .arg(text)
465                .arg("--voice")
466                .arg(voice)
467                .arg("--write-media")
468                .arg(output_path)
469                .output(),
470        )
471        .await
472        .context("Edge TTS subprocess timed out")?
473        .context("Failed to spawn edge-tts subprocess")?;
474
475        if !output.status.success() {
476            let stderr = String::from_utf8_lossy(&output.stderr);
477            // Clean up temp file on failure.
478            let _ = tokio::fs::remove_file(&output_file).await;
479            bail!("edge-tts failed (exit {}): {}", output.status, stderr);
480        }
481
482        let bytes = tokio::fs::read(&output_file)
483            .await
484            .context("Failed to read edge-tts output file")?;
485
486        // Clean up temp file.
487        let _ = tokio::fs::remove_file(&output_file).await;
488
489        Ok(bytes)
490    }
491
492    fn supported_voices(&self) -> Vec<String> {
493        // Edge TTS has many voices; return common defaults.
494        [
495            "en-US-AriaNeural",
496            "en-US-GuyNeural",
497            "en-US-JennyNeural",
498            "en-GB-SoniaNeural",
499        ]
500        .iter()
501        .map(|s| (*s).to_string())
502        .collect()
503    }
504
505    fn supported_formats(&self) -> Vec<String> {
506        vec!["mp3".to_string()]
507    }
508}
509
510// ── Piper TTS (local, OpenAI-compatible) ─────────────────────────
511
512/// Piper TTS model_provider — local GPU-accelerated server with an OpenAI-compatible endpoint.
513pub struct PiperTtsProvider {
514    alias: String,
515    client: reqwest::Client,
516    api_url: String,
517}
518
519impl PiperTtsProvider {
520    /// Create a new Piper TTS model_provider from config. Falls back to
521    /// `http://127.0.0.1:5000/v1/audio/speech` when no `api_url` is supplied.
522    pub fn new(alias: &str, config: &TtsProviderConfig) -> Self {
523        let api_url = config
524            .uri
525            .clone()
526            .filter(|u| !u.trim().is_empty())
527            .unwrap_or_else(|| "http://127.0.0.1:5000/v1/audio/speech".to_string());
528        Self {
529            alias: alias.to_string(),
530            client: reqwest::Client::builder()
531                .timeout(TTS_HTTP_TIMEOUT)
532                .build()
533                .expect("Failed to build HTTP client for Piper TTS"),
534            api_url,
535        }
536    }
537}
538
539#[async_trait::async_trait]
540impl TtsProvider for PiperTtsProvider {
541    fn output_format(&self) -> &str {
542        "wav"
543    }
544
545    fn name(&self) -> &str {
546        "piper"
547    }
548
549    async fn synthesize(&self, text: &str, voice: &str) -> Result<Vec<u8>> {
550        let body = serde_json::json!({
551            "model": "tts-1",
552            "input": text,
553            "voice": voice,
554        });
555
556        let resp = self
557            .client
558            .post(&self.api_url)
559            .json(&body)
560            .send()
561            .await
562            .context("Failed to send Piper TTS request")?;
563
564        let status = resp.status();
565        if !status.is_success() {
566            let error_body: serde_json::Value = resp
567                .json()
568                .await
569                .unwrap_or_else(|_| serde_json::json!({"error": "unknown"}));
570            let msg = error_body["error"]["message"]
571                .as_str()
572                .unwrap_or("unknown error");
573            bail!("Piper TTS API error ({}): {}", status, msg);
574        }
575
576        let bytes = resp
577            .bytes()
578            .await
579            .context("Failed to read Piper TTS response body")?;
580        Ok(bytes.to_vec())
581    }
582
583    fn supported_voices(&self) -> Vec<String> {
584        // Piper voices depend on installed models; return empty (dynamic).
585        Vec::new()
586    }
587
588    fn supported_formats(&self) -> Vec<String> {
589        ["mp3", "wav", "opus"]
590            .iter()
591            .map(|s| (*s).to_string())
592            .collect()
593    }
594}
595
596// ── TtsManager ───────────────────────────────────────────────────
597
598/// Transcode raw audio bytes to OGG/Opus via an `ffmpeg` subprocess.
599///
600/// Pipes `audio` into ffmpeg's stdin and reads OGG/Opus from stdout.
601/// stdin and stdout are driven concurrently to avoid buffer-deadlocks on
602/// large inputs. Requires `ffmpeg` with `libopus` support installed.
603async fn transcode_to_opus(audio: Vec<u8>) -> Result<Vec<u8>> {
604    use std::process::Stdio;
605    use tokio::io::AsyncWriteExt;
606
607    let mut child = tokio::process::Command::new("ffmpeg")
608        .args([
609            "-hide_banner",
610            "-loglevel",
611            "error",
612            "-i",
613            "pipe:0",
614            "-f",
615            "ogg",
616            "-acodec",
617            "libopus",
618            "-b:a",
619            "32k",
620            "-vbr",
621            "on",
622            "pipe:1",
623        ])
624        .stdin(Stdio::piped())
625        .stdout(Stdio::piped())
626        .stderr(Stdio::piped())
627        .spawn()
628        .context(
629            "failed to spawn ffmpeg — ensure ffmpeg with libopus support is installed \
630             (e.g. `sudo dnf install ffmpeg` / `sudo apt install ffmpeg`)",
631        )?;
632
633    let mut stdin = child.stdin.take().expect("stdin configured above");
634
635    // Drive stdin and wait concurrently: if ffmpeg fills its stdout pipe
636    // before we finish writing stdin, sequential operation would deadlock.
637    let (write_result, output) = tokio::join!(
638        async move {
639            stdin.write_all(&audio).await?;
640            stdin.shutdown().await
641        },
642        child.wait_with_output()
643    );
644
645    write_result.context("failed to write audio to ffmpeg stdin")?;
646    let output = output.context("ffmpeg process error")?;
647
648    if !output.status.success() {
649        let stderr = String::from_utf8_lossy(&output.stderr);
650        bail!("ffmpeg transcode to opus failed: {stderr}");
651    }
652
653    anyhow::ensure!(
654        !output.stdout.is_empty(),
655        "ffmpeg produced empty output — check that libopus is available"
656    );
657
658    Ok(output.stdout)
659}
660
661/// Central manager for per-agent TTS synthesis.
662///
663/// `tts_providers` are keyed by their dotted alias (`<type>.<alias>`).
664/// Per-instance voice overrides come from the `voice` field on each
665/// `TtsProviderConfig`. The `agent_tts_provider` field carries the
666/// resolved alias for the agent that owns this manager instance — empty
667/// means the agent doesn't want TTS, and `synthesize_for_agent` fails
668/// loud rather than silently pick a default.
669pub struct TtsManager {
670    tts_providers: HashMap<String, Box<dyn TtsProvider>>,
671    voice_by_alias: HashMap<String, String>,
672    /// Resolved alias for the agent that owns this manager. Empty when
673    /// the agent has no TTS preference (opt-out).
674    agent_tts_provider: String,
675    default_voice: String,
676    max_text_length: usize,
677}
678
679impl TtsManager {
680    /// Build a `TtsManager` from `[tts_providers.<type>.<alias>]` instances
681    /// in `Config`. Each instance is registered under its dotted alias key
682    /// (`<type>.<alias>`). Failures to construct a particular instance are
683    /// logged at warn but do not abort the manager.
684    /// Build a `TtsManager` from `[tts_providers.<type>.<alias>]` instances.
685    /// The manager's resolved alias comes from the runtime-active agent's
686    /// `tts_provider` field — there is no global default-provider concept,
687    /// so when no agent-bound resolution is available the manager refuses
688    /// to silently pick a provider (`synthesize` fails loud).
689    pub fn from_config(config: &Config) -> Result<Self> {
690        Self::from_config_for_agent(config, None)
691    }
692
693    /// Build a `TtsManager` bound to a specific agent's `tts_provider`.
694    ///
695    /// `agent_alias` is the channel-owning agent (resolve via
696    /// [`Config::agent_for_channel`]). When `None`, falls back to the
697    /// runtime-active agent ([`Config::resolved_runtime_agent_alias`]) for
698    /// callers that cannot determine the owning agent. Binding to the
699    /// owning agent is what lets a channel owned by e.g. `primary` use
700    /// `primary`'s `tts_provider` instead of whichever enabled agent
701    /// happens to sort first.
702    pub fn from_config_for_agent(config: &Config, agent_alias: Option<&str>) -> Result<Self> {
703        let mut tts_providers: HashMap<String, Box<dyn TtsProvider>> = HashMap::new();
704        let mut voice_by_alias: HashMap<String, String> = HashMap::new();
705
706        // Typed dispatch over the TtsProviders container's named slots. The
707        // unknown-type warn-and-skip arm is gone — the typed container can't
708        // hold an unrecognized family.
709        for (family, alias, instance) in config.providers.tts.iter_entries() {
710            let dotted = format!("{family}.{alias}");
711            let result: Result<Box<dyn TtsProvider>> = match family {
712                "openai" => OpenAiTtsProvider::new(alias, instance).map(|p| Box::new(p) as _),
713                "elevenlabs" => {
714                    ElevenLabsTtsProvider::new(alias, instance).map(|p| Box::new(p) as _)
715                }
716                "google" => GoogleTtsProvider::new(alias, instance).map(|p| Box::new(p) as _),
717                "edge" => EdgeTtsProvider::new(alias, instance).map(|p| Box::new(p) as _),
718                "piper" => Ok(Box::new(PiperTtsProvider::new(alias, instance)) as _),
719                _ => unreachable!("TtsProviders typed slots cover all 5 families"),
720            };
721            match result {
722                Ok(p) => {
723                    tts_providers.insert(dotted.clone(), p);
724                    if let Some(voice) = instance
725                        .voice
726                        .as_deref()
727                        .map(str::trim)
728                        .filter(|v| !v.is_empty())
729                    {
730                        voice_by_alias.insert(dotted, voice.to_string());
731                    }
732                }
733                Err(e) => {
734                    ::zeroclaw_log::record!(
735                        WARN,
736                        ::zeroclaw_log::Event::new(module_path!(), ::zeroclaw_log::Action::Note)
737                            .with_outcome(::zeroclaw_log::EventOutcome::Unknown)
738                            .with_attrs(
739                                ::serde_json::json!({"error": format!("{}", e), "dotted": dotted})
740                            ),
741                        "Skipping TTS provider"
742                    );
743                }
744            }
745        }
746
747        let max_text_length = if config.tts.max_text_length == 0 {
748            DEFAULT_MAX_TEXT_LENGTH
749        } else {
750            config.tts.max_text_length
751        };
752
753        // Per-agent join: bind to the channel-owning agent's `tts_provider`
754        // when known, else the runtime-active agent. Empty (or no resolved
755        // agent) = no TTS; `synthesize` fails loud rather than silently
756        // pick a provider.
757        let agent_tts_provider = agent_alias
758            .or_else(|| config.resolved_runtime_agent_alias())
759            .and_then(|alias| config.agents.get(alias))
760            .map(|a| a.tts_provider.as_str().to_string())
761            .unwrap_or_default();
762
763        Ok(Self {
764            tts_providers,
765            voice_by_alias,
766            agent_tts_provider,
767            default_voice: config.tts.default_voice.clone(),
768            max_text_length,
769        })
770    }
771
772    /// Synthesize `text` and return OGG/Opus audio suitable for Telegram
773    /// `sendVoice` and WhatsApp PTT voice notes. If the active provider
774    /// already outputs Opus (e.g. OpenAI with `response_format = "opus"`),
775    /// the bytes pass through unchanged; otherwise they are transcoded via an
776    /// `ffmpeg` subprocess. Requires `ffmpeg` with `libopus` support installed.
777    pub async fn synthesize_opus(&self, text: &str) -> Result<Vec<u8>> {
778        let audio = self.synthesize(text).await?;
779        let provider_alias = self.agent_tts_provider.as_str();
780        let format = self
781            .tts_providers
782            .get(provider_alias)
783            .map(|p| p.output_format())
784            .unwrap_or("unknown");
785        if format == "opus" {
786            return Ok(audio);
787        }
788        transcode_to_opus(audio).await
789    }
790
791    /// Synthesize text using the runtime-active agent's resolved
792    /// `tts_provider` reference and the per-instance voice override (or
793    /// `default_voice` as the per-instance fallback). Fails loud when the
794    /// agent has no `tts_provider` configured — there is no global
795    /// default-provider concept and this manager refuses to silently pick
796    /// one.
797    pub async fn synthesize(&self, text: &str) -> Result<Vec<u8>> {
798        let provider_alias = self.agent_tts_provider.as_str();
799        if provider_alias.is_empty() {
800            bail!(
801                "Agent has no tts_provider configured. Set \
802                 `agent.<alias>.tts_provider = \"<type>.<alias>\"` referencing a \
803                 [tts_providers.<type>.<alias>] entry."
804            );
805        }
806        let voice = self
807            .voice_by_alias
808            .get(provider_alias)
809            .map_or(self.default_voice.as_str(), String::as_str);
810        self.synthesize_with_provider(text, provider_alias, voice)
811            .await
812    }
813
814    /// Synthesize text using a specific dotted-alias model_provider and voice.
815    pub async fn synthesize_with_provider(
816        &self,
817        text: &str,
818        provider_alias: &str,
819        voice: &str,
820    ) -> Result<Vec<u8>> {
821        if text.is_empty() {
822            bail!("TTS text must not be empty");
823        }
824        let char_count = text.chars().count();
825        if char_count > self.max_text_length {
826            bail!(
827                "TTS text too long ({} chars, max {})",
828                char_count,
829                self.max_text_length
830            );
831        }
832
833        let tts = self.tts_providers.get(provider_alias).ok_or_else(|| {
834            let available = self.available_providers().join(", ");
835            ::zeroclaw_log::record!(
836                ERROR,
837                ::zeroclaw_log::Event::new(module_path!(), ::zeroclaw_log::Action::Reject)
838                    .with_outcome(::zeroclaw_log::EventOutcome::Failure)
839                    .with_attrs(::serde_json::json!({
840                        "tts_provider": provider_alias,
841                        "available": available,
842                    })),
843                "tts: provider not configured"
844            );
845            anyhow::Error::msg(format!(
846                "TTS model_provider '{}' not configured (available: {})",
847                provider_alias, available
848            ))
849        })?;
850
851        use ::zeroclaw_log::Instrument;
852        let span = ::zeroclaw_log::attribution_span!(tts.as_ref());
853        ::zeroclaw_log::scope!(voice: voice, => tts.synthesize(text, voice))
854            .instrument(span)
855            .await
856    }
857
858    /// List dotted aliases of all initialized tts_providers.
859    pub fn available_providers(&self) -> Vec<String> {
860        let mut names: Vec<_> = self.tts_providers.keys().cloned().collect();
861        names.sort();
862        names
863    }
864}
865
// ── Attribution impls and tests ──────────────────────────────────
867
868impl ::zeroclaw_api::attribution::Attributable for OpenAiTtsProvider {
869    fn role(&self) -> ::zeroclaw_api::attribution::Role {
870        ::zeroclaw_api::attribution::Role::Provider(::zeroclaw_api::attribution::ProviderKind::Tts(
871            ::zeroclaw_api::attribution::TtsProviderKind::OpenAi,
872        ))
873    }
874    fn alias(&self) -> &str {
875        &self.alias
876    }
877}
878
879impl ::zeroclaw_api::attribution::Attributable for ElevenLabsTtsProvider {
880    fn role(&self) -> ::zeroclaw_api::attribution::Role {
881        ::zeroclaw_api::attribution::Role::Provider(::zeroclaw_api::attribution::ProviderKind::Tts(
882            ::zeroclaw_api::attribution::TtsProviderKind::ElevenLabs,
883        ))
884    }
885    fn alias(&self) -> &str {
886        &self.alias
887    }
888}
889
890impl ::zeroclaw_api::attribution::Attributable for GoogleTtsProvider {
891    fn role(&self) -> ::zeroclaw_api::attribution::Role {
892        ::zeroclaw_api::attribution::Role::Provider(::zeroclaw_api::attribution::ProviderKind::Tts(
893            ::zeroclaw_api::attribution::TtsProviderKind::Google,
894        ))
895    }
896    fn alias(&self) -> &str {
897        &self.alias
898    }
899}
900
901impl ::zeroclaw_api::attribution::Attributable for EdgeTtsProvider {
902    fn role(&self) -> ::zeroclaw_api::attribution::Role {
903        ::zeroclaw_api::attribution::Role::Provider(::zeroclaw_api::attribution::ProviderKind::Tts(
904            ::zeroclaw_api::attribution::TtsProviderKind::Edge,
905        ))
906    }
907    fn alias(&self) -> &str {
908        &self.alias
909    }
910}
911
912impl ::zeroclaw_api::attribution::Attributable for PiperTtsProvider {
913    fn role(&self) -> ::zeroclaw_api::attribution::Role {
914        ::zeroclaw_api::attribution::Role::Provider(::zeroclaw_api::attribution::ProviderKind::Tts(
915            ::zeroclaw_api::attribution::TtsProviderKind::Piper,
916        ))
917    }
918    fn alias(&self) -> &str {
919        &self.alias
920    }
921}
922
923#[cfg(test)]
924mod tests {
925    use super::*;
926
927    fn config_with_edge_alias() -> Config {
928        let mut cfg = Config::default();
929        cfg.agents.insert(
930            "default".into(),
931            zeroclaw_config::schema::AliasedAgentConfig {
932                tts_provider: "edge.default".into(),
933                ..Default::default()
934            },
935        );
936        cfg.providers.tts.edge.insert(
937            "default".to_string(),
938            zeroclaw_config::schema::EdgeTtsProviderConfig {
939                base: TtsProviderConfig {
940                    binary_path: Some("edge-tts".to_string()),
941                    ..TtsProviderConfig::default()
942                },
943            },
944        );
945        cfg
946    }
947
948    fn config_with_piper_alias() -> Config {
949        let mut cfg = Config::default();
950        cfg.agents.insert(
951            "default".into(),
952            zeroclaw_config::schema::AliasedAgentConfig {
953                tts_provider: "piper.default".into(),
954                ..Default::default()
955            },
956        );
957        cfg.providers.tts.piper.insert(
958            "default".to_string(),
959            zeroclaw_config::schema::PiperTtsProviderConfig {
960                base: TtsProviderConfig {
961                    uri: Some("http://127.0.0.1:5000/v1/audio/speech".to_string()),
962                    ..TtsProviderConfig::default()
963                },
964            },
965        );
966        cfg
967    }
968
969    #[test]
970    fn tts_manager_creation_with_defaults() {
971        let config = Config::default();
972        let manager = TtsManager::from_config(&config).unwrap();
973        assert!(manager.available_providers().is_empty());
974    }
975
976    #[test]
977    fn tts_manager_registers_alias_keyed_provider() {
978        let cfg = config_with_edge_alias();
979        let manager = TtsManager::from_config(&cfg).unwrap();
980        assert_eq!(manager.available_providers(), vec!["edge.default"]);
981    }
982
983    /// Regression for #7001: a channel-owning agent's `tts_provider` must win
984    /// over a lexicographically-earlier enabled agent that has none. Binding
985    /// the manager to the owner (`from_config_for_agent(cfg, Some("primary"))`)
986    /// resolves `primary`'s provider, not the first-sorting agent's empty one.
987    #[test]
988    fn tts_manager_binds_owning_agent_provider() {
989        // Reuse the edge.default provider registration, but install two agents:
990        // `primary` (the channel owner, has the provider) and a
991        // lexicographically-earlier `background` agent with no `tts_provider`.
992        let mut cfg = config_with_edge_alias();
993        cfg.agents.clear();
994        cfg.agents.insert(
995            "primary".into(),
996            zeroclaw_config::schema::AliasedAgentConfig {
997                tts_provider: "edge.default".into(),
998                ..Default::default()
999            },
1000        );
1001        cfg.agents.insert(
1002            "background".into(),
1003            zeroclaw_config::schema::AliasedAgentConfig {
1004                ..Default::default()
1005            },
1006        );
1007
1008        // Owner-bound resolution picks primary's provider...
1009        let owner_bound = TtsManager::from_config_for_agent(&cfg, Some("primary")).unwrap();
1010        assert_eq!(
1011            owner_bound.agent_tts_provider, "edge.default",
1012            "owner-bound manager must resolve the channel owner's tts_provider"
1013        );
1014
1015        // ...while binding to the provider-less first-sorting agent stays empty,
1016        // proving the binding is per-agent and not a global/first-sorting pick.
1017        let background_bound = TtsManager::from_config_for_agent(&cfg, Some("background")).unwrap();
1018        assert!(
1019            background_bound.agent_tts_provider.is_empty(),
1020            "an agent with no tts_provider must not inherit another agent's provider"
1021        );
1022    }
1023
1024    #[tokio::test]
1025    async fn tts_rejects_empty_text() {
1026        let cfg = config_with_edge_alias();
1027        let manager = TtsManager::from_config(&cfg).unwrap();
1028        let err = manager
1029            .synthesize_with_provider("", "edge.default", "en-US-AriaNeural")
1030            .await
1031            .unwrap_err();
1032        assert!(
1033            err.to_string().contains("must not be empty"),
1034            "expected empty-text error, got: {err}"
1035        );
1036    }
1037
1038    #[tokio::test]
1039    async fn tts_rejects_text_exceeding_max_length() {
1040        let mut cfg = config_with_edge_alias();
1041        cfg.tts.max_text_length = 10;
1042        let manager = TtsManager::from_config(&cfg).unwrap();
1043        let long_text = "a".repeat(11);
1044        let err = manager
1045            .synthesize_with_provider(&long_text, "edge.default", "en-US-AriaNeural")
1046            .await
1047            .unwrap_err();
1048        assert!(
1049            err.to_string().contains("too long"),
1050            "expected too-long error, got: {err}"
1051        );
1052    }
1053
1054    #[tokio::test]
1055    async fn tts_rejects_unknown_provider() {
1056        let cfg = Config::default();
1057        let manager = TtsManager::from_config(&cfg).unwrap();
1058        let err = manager
1059            .synthesize_with_provider("hello", "nonexistent.alias", "voice")
1060            .await
1061            .unwrap_err();
1062        assert!(
1063            err.to_string().contains("not configured"),
1064            "expected not-configured error, got: {err}"
1065        );
1066    }
1067
1068    #[test]
1069    fn piper_provider_creation_uses_default_url_when_unset() {
1070        let model_provider = PiperTtsProvider::new("test", &TtsProviderConfig::default());
1071        assert_eq!(model_provider.name(), "piper");
1072        assert_eq!(
1073            model_provider.api_url,
1074            "http://127.0.0.1:5000/v1/audio/speech"
1075        );
1076        assert_eq!(
1077            model_provider.supported_formats(),
1078            vec!["mp3", "wav", "opus"]
1079        );
1080        assert!(model_provider.supported_voices().is_empty());
1081    }
1082
1083    #[test]
1084    fn tts_manager_with_piper_alias() {
1085        let cfg = config_with_piper_alias();
1086        let manager = TtsManager::from_config(&cfg).unwrap();
1087        assert_eq!(manager.available_providers(), vec!["piper.default"]);
1088    }
1089
1090    #[tokio::test]
1091    async fn tts_rejects_empty_text_for_piper() {
1092        let cfg = config_with_piper_alias();
1093        let manager = TtsManager::from_config(&cfg).unwrap();
1094        let err = manager
1095            .synthesize_with_provider("", "piper.default", "default")
1096            .await
1097            .unwrap_err();
1098        assert!(
1099            err.to_string().contains("must not be empty"),
1100            "expected empty-text error, got: {err}"
1101        );
1102    }
1103
1104    #[test]
1105    fn tts_config_defaults() {
1106        let config = zeroclaw_config::schema::TtsConfig::default();
1107        assert!(!config.enabled);
1108        // TtsConfig has no global default-provider field; per-agent
1109        // `tts_provider` is the only selector.
1110        assert_eq!(config.default_voice, "alloy");
1111        assert_eq!(config.default_format, "mp3");
1112        assert_eq!(config.max_text_length, DEFAULT_MAX_TEXT_LENGTH);
1113    }
1114
1115    #[test]
1116    fn tts_manager_max_text_length_zero_uses_default() {
1117        let mut cfg = Config::default();
1118        cfg.tts.max_text_length = 0;
1119        let manager = TtsManager::from_config(&cfg).unwrap();
1120        assert_eq!(manager.max_text_length, DEFAULT_MAX_TEXT_LENGTH);
1121    }
1122
1123    #[tokio::test]
1124    async fn synthesize_posts_to_configured_uri_with_response_format() {
1125        use wiremock::matchers::{method, path};
1126        use wiremock::{Mock, MockServer, ResponseTemplate};
1127
1128        let server = MockServer::start().await;
1129        Mock::given(method("POST"))
1130            .and(path("/v1/audio/speech"))
1131            .respond_with(ResponseTemplate::new(200).set_body_bytes(b"FAKE_WAV".to_vec()))
1132            .mount(&server)
1133            .await;
1134
1135        let cfg = TtsProviderConfig {
1136            api_key: Some("sk-test".to_string()),
1137            uri: Some(format!("{}/v1/audio/speech", server.uri())),
1138            response_format: Some("wav".to_string()),
1139            ..TtsProviderConfig::default()
1140        };
1141        let provider = OpenAiTtsProvider::new("test", &cfg).unwrap();
1142
1143        let audio = provider.synthesize("hello world", "hannah").await.unwrap();
1144        assert_eq!(
1145            audio, b"FAKE_WAV",
1146            "synthesize should return the bytes served by the configured endpoint"
1147        );
1148
1149        let reqs = server.received_requests().await.unwrap();
1150        assert_eq!(
1151            reqs.len(),
1152            1,
1153            "exactly one POST should reach the configured uri"
1154        );
1155        let body: serde_json::Value = serde_json::from_slice(&reqs[0].body).unwrap();
1156        assert_eq!(
1157            body["response_format"], "wav",
1158            "configured response_format must reach the outgoing request body"
1159        );
1160        assert_eq!(body["input"], "hello world");
1161        assert_eq!(body["voice"], "hannah");
1162        assert_eq!(body["model"], "tts-1");
1163    }
1164
1165    #[tokio::test]
1166    async fn synthesize_defaults_response_format_to_opus_when_unset() {
1167        use wiremock::matchers::{method, path};
1168        use wiremock::{Mock, MockServer, ResponseTemplate};
1169
1170        let server = MockServer::start().await;
1171        Mock::given(method("POST"))
1172            .and(path("/v1/audio/speech"))
1173            .respond_with(ResponseTemplate::new(200).set_body_bytes(b"AUDIO".to_vec()))
1174            .mount(&server)
1175            .await;
1176
1177        // uri points at the mock so we can inspect the body; response_format left unset.
1178        let cfg = TtsProviderConfig {
1179            api_key: Some("sk-test".to_string()),
1180            uri: Some(format!("{}/v1/audio/speech", server.uri())),
1181            ..TtsProviderConfig::default()
1182        };
1183        let provider = OpenAiTtsProvider::new("test", &cfg).unwrap();
1184        provider.synthesize("hi", "alloy").await.unwrap();
1185
1186        let reqs = server.received_requests().await.unwrap();
1187        let body: serde_json::Value = serde_json::from_slice(&reqs[0].body).unwrap();
1188        assert_eq!(
1189            body["response_format"], "opus",
1190            "unset response_format must default to opus in the outgoing request"
1191        );
1192    }
1193
1194    #[test]
1195    fn openai_defaults_to_production_endpoint_when_uri_unset() {
1196        let cfg = TtsProviderConfig {
1197            api_key: Some("sk-test".to_string()),
1198            ..TtsProviderConfig::default()
1199        };
1200        let provider = OpenAiTtsProvider::new("test", &cfg).unwrap();
1201        assert_eq!(provider.base_url, "https://api.openai.com/v1/audio/speech");
1202        assert_eq!(provider.response_format, "opus");
1203    }
1204}
zeroclaw_channels/tts.rs

zeroclaw_channels/
tts.rs