1use std::collections::HashMap;
11
12use anyhow::{Context, Result, bail};
13
14use zeroclaw_config::schema::{Config, TtsProviderConfig};
15
/// Fallback limit on input text length, in characters, used when the
/// configured `max_text_length` is zero.
const DEFAULT_MAX_TEXT_LENGTH: usize = 4096;

/// Time budget applied to every provider HTTP client and to the Edge TTS
/// subprocess invocation.
const TTS_HTTP_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(60);
21
/// Common interface implemented by every text-to-speech backend in this file.
///
/// Implementors must also implement `Attributable` so calls can be attributed
/// to a concrete provider instance.
#[async_trait::async_trait]
pub trait TtsProvider: Send + Sync + ::zeroclaw_api::attribution::Attributable {
    /// Short identifier of the backend family (implementations here return
    /// values such as `"openai"` or `"edge"`).
    fn name(&self) -> &str;

    /// Synthesizes `text` using `voice` and returns the resulting audio bytes.
    async fn synthesize(&self, text: &str, voice: &str) -> Result<Vec<u8>>;

    /// Format label of the audio produced by `synthesize` (e.g. `"mp3"`, `"opus"`).
    fn output_format(&self) -> &str;

    /// Voice names this backend advertises; may be empty.
    fn supported_voices(&self) -> Vec<String>;

    /// Audio format labels this backend advertises.
    fn supported_formats(&self) -> Vec<String>;
}
44
/// TTS backend that POSTs JSON requests to an OpenAI-style speech endpoint.
pub struct OpenAiTtsProvider {
    /// Alias reported through `Attributable::alias`.
    alias: String,
    /// Bearer token sent with every request.
    api_key: String,
    /// Value of the `model` request field (defaults to `tts-1`).
    model: String,
    /// Value of the `speed` request field (defaults to `1.0`).
    speed: f64,
    /// Full URL the request is posted to.
    base_url: String,
    /// Value of the `response_format` request field; also returned by `output_format`.
    response_format: String,
    /// HTTP client configured with `TTS_HTTP_TIMEOUT`.
    client: reqwest::Client,
}
62
63impl OpenAiTtsProvider {
64 pub fn new(alias: &str, config: &TtsProviderConfig) -> Result<Self> {
69 let api_key = config
70 .api_key
71 .as_deref()
72 .map(str::trim)
73 .filter(|k| !k.is_empty())
74 .map(ToOwned::to_owned)
75 .context(
76 "Missing OpenAI TTS API key: set `[tts_providers.openai.<alias>].api_key` (or via \
77 `ZEROCLAW_providers__tts__openai__<alias>__api_key=...`).",
78 )?;
79
80 Ok(Self {
81 alias: alias.to_string(),
82 api_key,
83 model: config
84 .model
85 .clone()
86 .filter(|m| !m.trim().is_empty())
87 .unwrap_or_else(|| "tts-1".to_string()),
88 speed: config.speed.unwrap_or(1.0),
89 base_url: config
90 .uri
91 .clone()
92 .filter(|u| !u.trim().is_empty())
93 .unwrap_or_else(|| "https://api.openai.com/v1/audio/speech".to_string()),
94 response_format: config
95 .response_format
96 .clone()
97 .filter(|f| !f.trim().is_empty())
98 .unwrap_or_else(|| "opus".to_string()),
99 client: reqwest::Client::builder()
100 .timeout(TTS_HTTP_TIMEOUT)
101 .build()
102 .context("Failed to build HTTP client for OpenAI TTS")?,
103 })
104 }
105}
106
107#[async_trait::async_trait]
108impl TtsProvider for OpenAiTtsProvider {
109 fn name(&self) -> &str {
110 "openai"
111 }
112
113 fn output_format(&self) -> &str {
114 &self.response_format
115 }
116
117 async fn synthesize(&self, text: &str, voice: &str) -> Result<Vec<u8>> {
118 let body = serde_json::json!({
119 "model": self.model,
120 "input": text,
121 "voice": voice,
122 "speed": self.speed,
123 "response_format": self.response_format,
124 });
125
126 let resp = self
127 .client
128 .post(&self.base_url)
129 .bearer_auth(&self.api_key)
130 .json(&body)
131 .send()
132 .await
133 .context("Failed to send OpenAI TTS request")?;
134
135 let status = resp.status();
136 if !status.is_success() {
137 let error_body: serde_json::Value = resp
138 .json()
139 .await
140 .unwrap_or_else(|_| serde_json::json!({"error": "unknown"}));
141 let msg = error_body["error"]["message"]
142 .as_str()
143 .unwrap_or("unknown error");
144 bail!("OpenAI TTS API error ({}): {}", status, msg);
145 }
146
147 let bytes = resp
148 .bytes()
149 .await
150 .context("Failed to read OpenAI TTS response body")?;
151 Ok(bytes.to_vec())
152 }
153
154 fn supported_voices(&self) -> Vec<String> {
155 ["alloy", "echo", "fable", "onyx", "nova", "shimmer"]
156 .iter()
157 .map(|s| (*s).to_string())
158 .collect()
159 }
160
161 fn supported_formats(&self) -> Vec<String> {
162 ["mp3", "opus", "aac", "flac", "wav", "pcm"]
163 .iter()
164 .map(|s| (*s).to_string())
165 .collect()
166 }
167}
168
/// TTS backend for the ElevenLabs text-to-speech HTTP API.
pub struct ElevenLabsTtsProvider {
    /// Alias reported through `Attributable::alias`.
    alias: String,
    /// Key sent in the `xi-api-key` request header.
    api_key: String,
    /// Value of the `model_id` request field (defaults to `eleven_monolingual_v1`).
    model_id: String,
    /// `voice_settings.stability` request value (defaults to `0.5`).
    stability: f64,
    /// `voice_settings.similarity_boost` request value (defaults to `0.5`).
    similarity_boost: f64,
    /// HTTP client configured with `TTS_HTTP_TIMEOUT`.
    client: reqwest::Client,
}
180
181impl ElevenLabsTtsProvider {
182 pub fn new(alias: &str, config: &TtsProviderConfig) -> Result<Self> {
186 let api_key = config
187 .api_key
188 .as_deref()
189 .map(str::trim)
190 .filter(|k| !k.is_empty())
191 .map(ToOwned::to_owned)
192 .context(
193 "Missing ElevenLabs API key: set `[tts_providers.elevenlabs.<alias>].api_key` (or \
194 via `ZEROCLAW_providers__tts__elevenlabs__<alias>__api_key=...`).",
195 )?;
196
197 Ok(Self {
198 alias: alias.to_string(),
199 api_key,
200 model_id: config
201 .model
202 .clone()
203 .filter(|m| !m.trim().is_empty())
204 .unwrap_or_else(|| "eleven_monolingual_v1".to_string()),
205 stability: config.stability.unwrap_or(0.5),
206 similarity_boost: config.similarity_boost.unwrap_or(0.5),
207 client: reqwest::Client::builder()
208 .timeout(TTS_HTTP_TIMEOUT)
209 .build()
210 .context("Failed to build HTTP client for ElevenLabs TTS")?,
211 })
212 }
213}
214
215#[async_trait::async_trait]
216impl TtsProvider for ElevenLabsTtsProvider {
217 fn output_format(&self) -> &str {
218 "mp3"
219 }
220 fn name(&self) -> &str {
221 "elevenlabs"
222 }
223
224 async fn synthesize(&self, text: &str, voice: &str) -> Result<Vec<u8>> {
225 if !voice
226 .chars()
227 .all(|c| c.is_ascii_alphanumeric() || c == '-' || c == '_')
228 {
229 bail!("ElevenLabs voice ID contains invalid characters: {voice}");
230 }
231 let url = format!("https://api.elevenlabs.io/v1/text-to-speech/{voice}");
232 let body = serde_json::json!({
233 "text": text,
234 "model_id": self.model_id,
235 "voice_settings": {
236 "stability": self.stability,
237 "similarity_boost": self.similarity_boost,
238 },
239 });
240
241 let resp = self
242 .client
243 .post(&url)
244 .header("xi-api-key", &self.api_key)
245 .json(&body)
246 .send()
247 .await
248 .context("Failed to send ElevenLabs TTS request")?;
249
250 let status = resp.status();
251 if !status.is_success() {
252 let error_body: serde_json::Value = resp
253 .json()
254 .await
255 .unwrap_or_else(|_| serde_json::json!({"error": "unknown"}));
256 let msg = error_body["detail"]["message"]
257 .as_str()
258 .or_else(|| error_body["detail"].as_str())
259 .unwrap_or("unknown error");
260 bail!("ElevenLabs TTS API error ({}): {}", status, msg);
261 }
262
263 let bytes = resp
264 .bytes()
265 .await
266 .context("Failed to read ElevenLabs TTS response body")?;
267 Ok(bytes.to_vec())
268 }
269
270 fn supported_voices(&self) -> Vec<String> {
271 Vec::new()
273 }
274
275 fn supported_formats(&self) -> Vec<String> {
276 ["mp3", "pcm", "ulaw"]
277 .iter()
278 .map(|s| (*s).to_string())
279 .collect()
280 }
281}
282
/// TTS backend for the Google Cloud Text-to-Speech REST API.
pub struct GoogleTtsProvider {
    /// Alias reported through `Attributable::alias`.
    alias: String,
    /// Key sent in the `x-goog-api-key` request header.
    api_key: String,
    /// `voice.languageCode` request value (defaults to `en-US`).
    language_code: String,
    /// HTTP client configured with `TTS_HTTP_TIMEOUT`.
    client: reqwest::Client,
}
292
293impl GoogleTtsProvider {
294 pub fn new(alias: &str, config: &TtsProviderConfig) -> Result<Self> {
298 let api_key = config
299 .api_key
300 .as_deref()
301 .map(str::trim)
302 .filter(|k| !k.is_empty())
303 .map(ToOwned::to_owned)
304 .context(
305 "Missing Google TTS API key: set `[tts_providers.google.<alias>].api_key` (or via \
306 `ZEROCLAW_providers__tts__google__<alias>__api_key=...`).",
307 )?;
308
309 Ok(Self {
310 alias: alias.to_string(),
311 api_key,
312 language_code: config
313 .language_code
314 .clone()
315 .filter(|c| !c.trim().is_empty())
316 .unwrap_or_else(|| "en-US".to_string()),
317 client: reqwest::Client::builder()
318 .timeout(TTS_HTTP_TIMEOUT)
319 .build()
320 .context("Failed to build HTTP client for Google TTS")?,
321 })
322 }
323}
324
325#[async_trait::async_trait]
326impl TtsProvider for GoogleTtsProvider {
327 fn output_format(&self) -> &str {
328 "mp3"
329 }
330
331 fn name(&self) -> &str {
332 "google"
333 }
334
335 async fn synthesize(&self, text: &str, voice: &str) -> Result<Vec<u8>> {
336 let url = "https://texttospeech.googleapis.com/v1/text:synthesize";
337 let body = serde_json::json!({
338 "input": { "text": text },
339 "voice": {
340 "languageCode": self.language_code,
341 "name": voice,
342 },
343 "audioConfig": {
344 "audioEncoding": "MP3",
345 },
346 });
347
348 let resp = self
349 .client
350 .post(url)
351 .header("x-goog-api-key", &self.api_key)
352 .json(&body)
353 .send()
354 .await
355 .context("Failed to send Google TTS request")?;
356
357 let status = resp.status();
358 let resp_body: serde_json::Value = resp
359 .json()
360 .await
361 .context("Failed to parse Google TTS response")?;
362
363 if !status.is_success() {
364 let msg = resp_body["error"]["message"]
365 .as_str()
366 .unwrap_or("unknown error");
367 bail!("Google TTS API error ({}): {}", status, msg);
368 }
369
370 let audio_b64 = resp_body["audioContent"]
371 .as_str()
372 .context("Google TTS response missing 'audioContent' field")?;
373
374 use base64::Engine;
375 let bytes = base64::engine::general_purpose::STANDARD
376 .decode(audio_b64)
377 .context("Failed to decode Google TTS base64 audio")?;
378 Ok(bytes)
379 }
380
381 fn supported_voices(&self) -> Vec<String> {
382 [
384 "en-US-Standard-A",
385 "en-US-Standard-B",
386 "en-US-Standard-C",
387 "en-US-Standard-D",
388 ]
389 .iter()
390 .map(|s| (*s).to_string())
391 .collect()
392 }
393
394 fn supported_formats(&self) -> Vec<String> {
395 ["mp3", "wav", "ogg"]
396 .iter()
397 .map(|s| (*s).to_string())
398 .collect()
399 }
400}
401
/// TTS backend that shells out to an Edge TTS command-line tool.
pub struct EdgeTtsProvider {
    /// Alias reported through `Attributable::alias`.
    alias: String,
    /// Bare command name to execute; validated against `ALLOWED_BINARIES` in `new`.
    binary_path: String,
}
409
410impl EdgeTtsProvider {
411 const ALLOWED_BINARIES: &[&str] = &["edge-tts", "edge-playback"];
413
414 pub fn new(alias: &str, config: &TtsProviderConfig) -> Result<Self> {
420 let raw_path = config
421 .binary_path
422 .clone()
423 .filter(|p| !p.trim().is_empty())
424 .unwrap_or_else(|| "edge-tts".to_string());
425 if raw_path.contains('/') || raw_path.contains('\\') {
426 bail!(
427 "Edge TTS binary_path must be a bare command name without path separators, got: {raw_path}"
428 );
429 }
430 if !Self::ALLOWED_BINARIES.contains(&raw_path.as_str()) {
431 bail!(
432 "Edge TTS binary_path must be one of {:?}, got: {raw_path}",
433 Self::ALLOWED_BINARIES,
434 );
435 }
436 Ok(Self {
437 alias: alias.to_string(),
438 binary_path: raw_path,
439 })
440 }
441}
442
443#[async_trait::async_trait]
444impl TtsProvider for EdgeTtsProvider {
445 fn output_format(&self) -> &str {
446 "mp3"
447 }
448
449 fn name(&self) -> &str {
450 "edge"
451 }
452
453 async fn synthesize(&self, text: &str, voice: &str) -> Result<Vec<u8>> {
454 let temp_dir = std::env::temp_dir();
455 let output_file = temp_dir.join(format!("zeroclaw_tts_{}.mp3", uuid::Uuid::new_v4()));
456 let output_path = output_file
457 .to_str()
458 .context("Failed to build temp file path for Edge TTS")?;
459
460 let output = tokio::time::timeout(
461 TTS_HTTP_TIMEOUT,
462 tokio::process::Command::new(&self.binary_path)
463 .arg("--text")
464 .arg(text)
465 .arg("--voice")
466 .arg(voice)
467 .arg("--write-media")
468 .arg(output_path)
469 .output(),
470 )
471 .await
472 .context("Edge TTS subprocess timed out")?
473 .context("Failed to spawn edge-tts subprocess")?;
474
475 if !output.status.success() {
476 let stderr = String::from_utf8_lossy(&output.stderr);
477 let _ = tokio::fs::remove_file(&output_file).await;
479 bail!("edge-tts failed (exit {}): {}", output.status, stderr);
480 }
481
482 let bytes = tokio::fs::read(&output_file)
483 .await
484 .context("Failed to read edge-tts output file")?;
485
486 let _ = tokio::fs::remove_file(&output_file).await;
488
489 Ok(bytes)
490 }
491
492 fn supported_voices(&self) -> Vec<String> {
493 [
495 "en-US-AriaNeural",
496 "en-US-GuyNeural",
497 "en-US-JennyNeural",
498 "en-GB-SoniaNeural",
499 ]
500 .iter()
501 .map(|s| (*s).to_string())
502 .collect()
503 }
504
505 fn supported_formats(&self) -> Vec<String> {
506 vec!["mp3".to_string()]
507 }
508}
509
/// TTS backend that POSTs JSON requests to a Piper HTTP endpoint.
pub struct PiperTtsProvider {
    /// Alias reported through `Attributable::alias`.
    alias: String,
    /// HTTP client configured with `TTS_HTTP_TIMEOUT`.
    client: reqwest::Client,
    /// Full URL the request is posted to
    /// (defaults to `http://127.0.0.1:5000/v1/audio/speech`).
    api_url: String,
}
518
519impl PiperTtsProvider {
520 pub fn new(alias: &str, config: &TtsProviderConfig) -> Self {
523 let api_url = config
524 .uri
525 .clone()
526 .filter(|u| !u.trim().is_empty())
527 .unwrap_or_else(|| "http://127.0.0.1:5000/v1/audio/speech".to_string());
528 Self {
529 alias: alias.to_string(),
530 client: reqwest::Client::builder()
531 .timeout(TTS_HTTP_TIMEOUT)
532 .build()
533 .expect("Failed to build HTTP client for Piper TTS"),
534 api_url,
535 }
536 }
537}
538
539#[async_trait::async_trait]
540impl TtsProvider for PiperTtsProvider {
541 fn output_format(&self) -> &str {
542 "wav"
543 }
544
545 fn name(&self) -> &str {
546 "piper"
547 }
548
549 async fn synthesize(&self, text: &str, voice: &str) -> Result<Vec<u8>> {
550 let body = serde_json::json!({
551 "model": "tts-1",
552 "input": text,
553 "voice": voice,
554 });
555
556 let resp = self
557 .client
558 .post(&self.api_url)
559 .json(&body)
560 .send()
561 .await
562 .context("Failed to send Piper TTS request")?;
563
564 let status = resp.status();
565 if !status.is_success() {
566 let error_body: serde_json::Value = resp
567 .json()
568 .await
569 .unwrap_or_else(|_| serde_json::json!({"error": "unknown"}));
570 let msg = error_body["error"]["message"]
571 .as_str()
572 .unwrap_or("unknown error");
573 bail!("Piper TTS API error ({}): {}", status, msg);
574 }
575
576 let bytes = resp
577 .bytes()
578 .await
579 .context("Failed to read Piper TTS response body")?;
580 Ok(bytes.to_vec())
581 }
582
583 fn supported_voices(&self) -> Vec<String> {
584 Vec::new()
586 }
587
588 fn supported_formats(&self) -> Vec<String> {
589 ["mp3", "wav", "opus"]
590 .iter()
591 .map(|s| (*s).to_string())
592 .collect()
593 }
594}
595
596async fn transcode_to_opus(audio: Vec<u8>) -> Result<Vec<u8>> {
604 use std::process::Stdio;
605 use tokio::io::AsyncWriteExt;
606
607 let mut child = tokio::process::Command::new("ffmpeg")
608 .args([
609 "-hide_banner",
610 "-loglevel",
611 "error",
612 "-i",
613 "pipe:0",
614 "-f",
615 "ogg",
616 "-acodec",
617 "libopus",
618 "-b:a",
619 "32k",
620 "-vbr",
621 "on",
622 "pipe:1",
623 ])
624 .stdin(Stdio::piped())
625 .stdout(Stdio::piped())
626 .stderr(Stdio::piped())
627 .spawn()
628 .context(
629 "failed to spawn ffmpeg — ensure ffmpeg with libopus support is installed \
630 (e.g. `sudo dnf install ffmpeg` / `sudo apt install ffmpeg`)",
631 )?;
632
633 let mut stdin = child.stdin.take().expect("stdin configured above");
634
635 let (write_result, output) = tokio::join!(
638 async move {
639 stdin.write_all(&audio).await?;
640 stdin.shutdown().await
641 },
642 child.wait_with_output()
643 );
644
645 write_result.context("failed to write audio to ffmpeg stdin")?;
646 let output = output.context("ffmpeg process error")?;
647
648 if !output.status.success() {
649 let stderr = String::from_utf8_lossy(&output.stderr);
650 bail!("ffmpeg transcode to opus failed: {stderr}");
651 }
652
653 anyhow::ensure!(
654 !output.stdout.is_empty(),
655 "ffmpeg produced empty output — check that libopus is available"
656 );
657
658 Ok(output.stdout)
659}
660
/// Registry of configured TTS providers plus the settings needed to
/// synthesize speech on behalf of one agent.
pub struct TtsManager {
    /// Constructed providers keyed by `"<family>.<alias>"`.
    tts_providers: HashMap<String, Box<dyn TtsProvider>>,
    /// Per-provider configured voice, keyed like `tts_providers`; only
    /// non-blank voices are stored.
    voice_by_alias: HashMap<String, String>,
    /// Provider key taken from the bound agent's `tts_provider`; empty when
    /// no agent was resolved.
    agent_tts_provider: String,
    /// Voice used when the selected provider has no entry in `voice_by_alias`.
    default_voice: String,
    /// Maximum accepted input length, counted in characters.
    max_text_length: usize,
}
678
679impl TtsManager {
680 pub fn from_config(config: &Config) -> Result<Self> {
690 Self::from_config_for_agent(config, None)
691 }
692
693 pub fn from_config_for_agent(config: &Config, agent_alias: Option<&str>) -> Result<Self> {
703 let mut tts_providers: HashMap<String, Box<dyn TtsProvider>> = HashMap::new();
704 let mut voice_by_alias: HashMap<String, String> = HashMap::new();
705
706 for (family, alias, instance) in config.providers.tts.iter_entries() {
710 let dotted = format!("{family}.{alias}");
711 let result: Result<Box<dyn TtsProvider>> = match family {
712 "openai" => OpenAiTtsProvider::new(alias, instance).map(|p| Box::new(p) as _),
713 "elevenlabs" => {
714 ElevenLabsTtsProvider::new(alias, instance).map(|p| Box::new(p) as _)
715 }
716 "google" => GoogleTtsProvider::new(alias, instance).map(|p| Box::new(p) as _),
717 "edge" => EdgeTtsProvider::new(alias, instance).map(|p| Box::new(p) as _),
718 "piper" => Ok(Box::new(PiperTtsProvider::new(alias, instance)) as _),
719 _ => unreachable!("TtsProviders typed slots cover all 5 families"),
720 };
721 match result {
722 Ok(p) => {
723 tts_providers.insert(dotted.clone(), p);
724 if let Some(voice) = instance
725 .voice
726 .as_deref()
727 .map(str::trim)
728 .filter(|v| !v.is_empty())
729 {
730 voice_by_alias.insert(dotted, voice.to_string());
731 }
732 }
733 Err(e) => {
734 ::zeroclaw_log::record!(
735 WARN,
736 ::zeroclaw_log::Event::new(module_path!(), ::zeroclaw_log::Action::Note)
737 .with_outcome(::zeroclaw_log::EventOutcome::Unknown)
738 .with_attrs(
739 ::serde_json::json!({"error": format!("{}", e), "dotted": dotted})
740 ),
741 "Skipping TTS provider"
742 );
743 }
744 }
745 }
746
747 let max_text_length = if config.tts.max_text_length == 0 {
748 DEFAULT_MAX_TEXT_LENGTH
749 } else {
750 config.tts.max_text_length
751 };
752
753 let agent_tts_provider = agent_alias
758 .or_else(|| config.resolved_runtime_agent_alias())
759 .and_then(|alias| config.agents.get(alias))
760 .map(|a| a.tts_provider.as_str().to_string())
761 .unwrap_or_default();
762
763 Ok(Self {
764 tts_providers,
765 voice_by_alias,
766 agent_tts_provider,
767 default_voice: config.tts.default_voice.clone(),
768 max_text_length,
769 })
770 }
771
772 pub async fn synthesize_opus(&self, text: &str) -> Result<Vec<u8>> {
778 let audio = self.synthesize(text).await?;
779 let provider_alias = self.agent_tts_provider.as_str();
780 let format = self
781 .tts_providers
782 .get(provider_alias)
783 .map(|p| p.output_format())
784 .unwrap_or("unknown");
785 if format == "opus" {
786 return Ok(audio);
787 }
788 transcode_to_opus(audio).await
789 }
790
791 pub async fn synthesize(&self, text: &str) -> Result<Vec<u8>> {
798 let provider_alias = self.agent_tts_provider.as_str();
799 if provider_alias.is_empty() {
800 bail!(
801 "Agent has no tts_provider configured. Set \
802 `agent.<alias>.tts_provider = \"<type>.<alias>\"` referencing a \
803 [tts_providers.<type>.<alias>] entry."
804 );
805 }
806 let voice = self
807 .voice_by_alias
808 .get(provider_alias)
809 .map_or(self.default_voice.as_str(), String::as_str);
810 self.synthesize_with_provider(text, provider_alias, voice)
811 .await
812 }
813
814 pub async fn synthesize_with_provider(
816 &self,
817 text: &str,
818 provider_alias: &str,
819 voice: &str,
820 ) -> Result<Vec<u8>> {
821 if text.is_empty() {
822 bail!("TTS text must not be empty");
823 }
824 let char_count = text.chars().count();
825 if char_count > self.max_text_length {
826 bail!(
827 "TTS text too long ({} chars, max {})",
828 char_count,
829 self.max_text_length
830 );
831 }
832
833 let tts = self.tts_providers.get(provider_alias).ok_or_else(|| {
834 let available = self.available_providers().join(", ");
835 ::zeroclaw_log::record!(
836 ERROR,
837 ::zeroclaw_log::Event::new(module_path!(), ::zeroclaw_log::Action::Reject)
838 .with_outcome(::zeroclaw_log::EventOutcome::Failure)
839 .with_attrs(::serde_json::json!({
840 "tts_provider": provider_alias,
841 "available": available,
842 })),
843 "tts: provider not configured"
844 );
845 anyhow::Error::msg(format!(
846 "TTS model_provider '{}' not configured (available: {})",
847 provider_alias, available
848 ))
849 })?;
850
851 use ::zeroclaw_log::Instrument;
852 let span = ::zeroclaw_log::attribution_span!(tts.as_ref());
853 ::zeroclaw_log::scope!(voice: voice, => tts.synthesize(text, voice))
854 .instrument(span)
855 .await
856 }
857
858 pub fn available_providers(&self) -> Vec<String> {
860 let mut names: Vec<_> = self.tts_providers.keys().cloned().collect();
861 names.sort();
862 names
863 }
864}
865
866impl ::zeroclaw_api::attribution::Attributable for OpenAiTtsProvider {
869 fn role(&self) -> ::zeroclaw_api::attribution::Role {
870 ::zeroclaw_api::attribution::Role::Provider(::zeroclaw_api::attribution::ProviderKind::Tts(
871 ::zeroclaw_api::attribution::TtsProviderKind::OpenAi,
872 ))
873 }
874 fn alias(&self) -> &str {
875 &self.alias
876 }
877}
878
879impl ::zeroclaw_api::attribution::Attributable for ElevenLabsTtsProvider {
880 fn role(&self) -> ::zeroclaw_api::attribution::Role {
881 ::zeroclaw_api::attribution::Role::Provider(::zeroclaw_api::attribution::ProviderKind::Tts(
882 ::zeroclaw_api::attribution::TtsProviderKind::ElevenLabs,
883 ))
884 }
885 fn alias(&self) -> &str {
886 &self.alias
887 }
888}
889
890impl ::zeroclaw_api::attribution::Attributable for GoogleTtsProvider {
891 fn role(&self) -> ::zeroclaw_api::attribution::Role {
892 ::zeroclaw_api::attribution::Role::Provider(::zeroclaw_api::attribution::ProviderKind::Tts(
893 ::zeroclaw_api::attribution::TtsProviderKind::Google,
894 ))
895 }
896 fn alias(&self) -> &str {
897 &self.alias
898 }
899}
900
901impl ::zeroclaw_api::attribution::Attributable for EdgeTtsProvider {
902 fn role(&self) -> ::zeroclaw_api::attribution::Role {
903 ::zeroclaw_api::attribution::Role::Provider(::zeroclaw_api::attribution::ProviderKind::Tts(
904 ::zeroclaw_api::attribution::TtsProviderKind::Edge,
905 ))
906 }
907 fn alias(&self) -> &str {
908 &self.alias
909 }
910}
911
912impl ::zeroclaw_api::attribution::Attributable for PiperTtsProvider {
913 fn role(&self) -> ::zeroclaw_api::attribution::Role {
914 ::zeroclaw_api::attribution::Role::Provider(::zeroclaw_api::attribution::ProviderKind::Tts(
915 ::zeroclaw_api::attribution::TtsProviderKind::Piper,
916 ))
917 }
918 fn alias(&self) -> &str {
919 &self.alias
920 }
921}
922
923#[cfg(test)]
924mod tests {
925 use super::*;
926
927 fn config_with_edge_alias() -> Config {
928 let mut cfg = Config::default();
929 cfg.agents.insert(
930 "default".into(),
931 zeroclaw_config::schema::AliasedAgentConfig {
932 tts_provider: "edge.default".into(),
933 ..Default::default()
934 },
935 );
936 cfg.providers.tts.edge.insert(
937 "default".to_string(),
938 zeroclaw_config::schema::EdgeTtsProviderConfig {
939 base: TtsProviderConfig {
940 binary_path: Some("edge-tts".to_string()),
941 ..TtsProviderConfig::default()
942 },
943 },
944 );
945 cfg
946 }
947
948 fn config_with_piper_alias() -> Config {
949 let mut cfg = Config::default();
950 cfg.agents.insert(
951 "default".into(),
952 zeroclaw_config::schema::AliasedAgentConfig {
953 tts_provider: "piper.default".into(),
954 ..Default::default()
955 },
956 );
957 cfg.providers.tts.piper.insert(
958 "default".to_string(),
959 zeroclaw_config::schema::PiperTtsProviderConfig {
960 base: TtsProviderConfig {
961 uri: Some("http://127.0.0.1:5000/v1/audio/speech".to_string()),
962 ..TtsProviderConfig::default()
963 },
964 },
965 );
966 cfg
967 }
968
969 #[test]
970 fn tts_manager_creation_with_defaults() {
971 let config = Config::default();
972 let manager = TtsManager::from_config(&config).unwrap();
973 assert!(manager.available_providers().is_empty());
974 }
975
976 #[test]
977 fn tts_manager_registers_alias_keyed_provider() {
978 let cfg = config_with_edge_alias();
979 let manager = TtsManager::from_config(&cfg).unwrap();
980 assert_eq!(manager.available_providers(), vec!["edge.default"]);
981 }
982
983 #[test]
988 fn tts_manager_binds_owning_agent_provider() {
989 let mut cfg = config_with_edge_alias();
993 cfg.agents.clear();
994 cfg.agents.insert(
995 "primary".into(),
996 zeroclaw_config::schema::AliasedAgentConfig {
997 tts_provider: "edge.default".into(),
998 ..Default::default()
999 },
1000 );
1001 cfg.agents.insert(
1002 "background".into(),
1003 zeroclaw_config::schema::AliasedAgentConfig {
1004 ..Default::default()
1005 },
1006 );
1007
1008 let owner_bound = TtsManager::from_config_for_agent(&cfg, Some("primary")).unwrap();
1010 assert_eq!(
1011 owner_bound.agent_tts_provider, "edge.default",
1012 "owner-bound manager must resolve the channel owner's tts_provider"
1013 );
1014
1015 let background_bound = TtsManager::from_config_for_agent(&cfg, Some("background")).unwrap();
1018 assert!(
1019 background_bound.agent_tts_provider.is_empty(),
1020 "an agent with no tts_provider must not inherit another agent's provider"
1021 );
1022 }
1023
1024 #[tokio::test]
1025 async fn tts_rejects_empty_text() {
1026 let cfg = config_with_edge_alias();
1027 let manager = TtsManager::from_config(&cfg).unwrap();
1028 let err = manager
1029 .synthesize_with_provider("", "edge.default", "en-US-AriaNeural")
1030 .await
1031 .unwrap_err();
1032 assert!(
1033 err.to_string().contains("must not be empty"),
1034 "expected empty-text error, got: {err}"
1035 );
1036 }
1037
1038 #[tokio::test]
1039 async fn tts_rejects_text_exceeding_max_length() {
1040 let mut cfg = config_with_edge_alias();
1041 cfg.tts.max_text_length = 10;
1042 let manager = TtsManager::from_config(&cfg).unwrap();
1043 let long_text = "a".repeat(11);
1044 let err = manager
1045 .synthesize_with_provider(&long_text, "edge.default", "en-US-AriaNeural")
1046 .await
1047 .unwrap_err();
1048 assert!(
1049 err.to_string().contains("too long"),
1050 "expected too-long error, got: {err}"
1051 );
1052 }
1053
1054 #[tokio::test]
1055 async fn tts_rejects_unknown_provider() {
1056 let cfg = Config::default();
1057 let manager = TtsManager::from_config(&cfg).unwrap();
1058 let err = manager
1059 .synthesize_with_provider("hello", "nonexistent.alias", "voice")
1060 .await
1061 .unwrap_err();
1062 assert!(
1063 err.to_string().contains("not configured"),
1064 "expected not-configured error, got: {err}"
1065 );
1066 }
1067
1068 #[test]
1069 fn piper_provider_creation_uses_default_url_when_unset() {
1070 let model_provider = PiperTtsProvider::new("test", &TtsProviderConfig::default());
1071 assert_eq!(model_provider.name(), "piper");
1072 assert_eq!(
1073 model_provider.api_url,
1074 "http://127.0.0.1:5000/v1/audio/speech"
1075 );
1076 assert_eq!(
1077 model_provider.supported_formats(),
1078 vec!["mp3", "wav", "opus"]
1079 );
1080 assert!(model_provider.supported_voices().is_empty());
1081 }
1082
1083 #[test]
1084 fn tts_manager_with_piper_alias() {
1085 let cfg = config_with_piper_alias();
1086 let manager = TtsManager::from_config(&cfg).unwrap();
1087 assert_eq!(manager.available_providers(), vec!["piper.default"]);
1088 }
1089
1090 #[tokio::test]
1091 async fn tts_rejects_empty_text_for_piper() {
1092 let cfg = config_with_piper_alias();
1093 let manager = TtsManager::from_config(&cfg).unwrap();
1094 let err = manager
1095 .synthesize_with_provider("", "piper.default", "default")
1096 .await
1097 .unwrap_err();
1098 assert!(
1099 err.to_string().contains("must not be empty"),
1100 "expected empty-text error, got: {err}"
1101 );
1102 }
1103
1104 #[test]
1105 fn tts_config_defaults() {
1106 let config = zeroclaw_config::schema::TtsConfig::default();
1107 assert!(!config.enabled);
1108 assert_eq!(config.default_voice, "alloy");
1111 assert_eq!(config.default_format, "mp3");
1112 assert_eq!(config.max_text_length, DEFAULT_MAX_TEXT_LENGTH);
1113 }
1114
1115 #[test]
1116 fn tts_manager_max_text_length_zero_uses_default() {
1117 let mut cfg = Config::default();
1118 cfg.tts.max_text_length = 0;
1119 let manager = TtsManager::from_config(&cfg).unwrap();
1120 assert_eq!(manager.max_text_length, DEFAULT_MAX_TEXT_LENGTH);
1121 }
1122
1123 #[tokio::test]
1124 async fn synthesize_posts_to_configured_uri_with_response_format() {
1125 use wiremock::matchers::{method, path};
1126 use wiremock::{Mock, MockServer, ResponseTemplate};
1127
1128 let server = MockServer::start().await;
1129 Mock::given(method("POST"))
1130 .and(path("/v1/audio/speech"))
1131 .respond_with(ResponseTemplate::new(200).set_body_bytes(b"FAKE_WAV".to_vec()))
1132 .mount(&server)
1133 .await;
1134
1135 let cfg = TtsProviderConfig {
1136 api_key: Some("sk-test".to_string()),
1137 uri: Some(format!("{}/v1/audio/speech", server.uri())),
1138 response_format: Some("wav".to_string()),
1139 ..TtsProviderConfig::default()
1140 };
1141 let provider = OpenAiTtsProvider::new("test", &cfg).unwrap();
1142
1143 let audio = provider.synthesize("hello world", "hannah").await.unwrap();
1144 assert_eq!(
1145 audio, b"FAKE_WAV",
1146 "synthesize should return the bytes served by the configured endpoint"
1147 );
1148
1149 let reqs = server.received_requests().await.unwrap();
1150 assert_eq!(
1151 reqs.len(),
1152 1,
1153 "exactly one POST should reach the configured uri"
1154 );
1155 let body: serde_json::Value = serde_json::from_slice(&reqs[0].body).unwrap();
1156 assert_eq!(
1157 body["response_format"], "wav",
1158 "configured response_format must reach the outgoing request body"
1159 );
1160 assert_eq!(body["input"], "hello world");
1161 assert_eq!(body["voice"], "hannah");
1162 assert_eq!(body["model"], "tts-1");
1163 }
1164
1165 #[tokio::test]
1166 async fn synthesize_defaults_response_format_to_opus_when_unset() {
1167 use wiremock::matchers::{method, path};
1168 use wiremock::{Mock, MockServer, ResponseTemplate};
1169
1170 let server = MockServer::start().await;
1171 Mock::given(method("POST"))
1172 .and(path("/v1/audio/speech"))
1173 .respond_with(ResponseTemplate::new(200).set_body_bytes(b"AUDIO".to_vec()))
1174 .mount(&server)
1175 .await;
1176
1177 let cfg = TtsProviderConfig {
1179 api_key: Some("sk-test".to_string()),
1180 uri: Some(format!("{}/v1/audio/speech", server.uri())),
1181 ..TtsProviderConfig::default()
1182 };
1183 let provider = OpenAiTtsProvider::new("test", &cfg).unwrap();
1184 provider.synthesize("hi", "alloy").await.unwrap();
1185
1186 let reqs = server.received_requests().await.unwrap();
1187 let body: serde_json::Value = serde_json::from_slice(&reqs[0].body).unwrap();
1188 assert_eq!(
1189 body["response_format"], "opus",
1190 "unset response_format must default to opus in the outgoing request"
1191 );
1192 }
1193
1194 #[test]
1195 fn openai_defaults_to_production_endpoint_when_uri_unset() {
1196 let cfg = TtsProviderConfig {
1197 api_key: Some("sk-test".to_string()),
1198 ..TtsProviderConfig::default()
1199 };
1200 let provider = OpenAiTtsProvider::new("test", &cfg).unwrap();
1201 assert_eq!(provider.base_url, "https://api.openai.com/v1/audio/speech");
1202 assert_eq!(provider.response_format, "opus");
1203 }
1204}