1use std::collections::HashMap;
11
12use anyhow::{Context, Result, bail};
13
14use zeroclaw_config::schema::{Config, TtsProviderConfig};
15
/// Fallback limit on the length of text sent for synthesis, counted in `char`s.
/// Used when `config.tts.max_text_length` is `0`.
const DEFAULT_MAX_TEXT_LENGTH: usize = 4096;

/// Timeout applied to every provider HTTP client built in this file and to the
/// Edge TTS subprocess invocation.
const TTS_HTTP_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(60);
21
/// Common interface implemented by every text-to-speech backend in this file.
///
/// Implementors must be `Send + Sync` and also implement
/// `::zeroclaw_api::attribution::Attributable`.
#[async_trait::async_trait]
pub trait TtsProvider: Send + Sync + ::zeroclaw_api::attribution::Attributable {
    /// Short family name of the provider (for example `"openai"` or `"edge"`).
    fn name(&self) -> &str;

    /// Synthesizes `text` with the given `voice` and returns the audio bytes.
    ///
    /// # Errors
    /// Returns an error when the backend request (or subprocess) fails or
    /// reports a non-success status.
    async fn synthesize(&self, text: &str, voice: &str) -> Result<Vec<u8>>;

    /// Voice identifiers the provider advertises. Some providers return an
    /// empty list.
    fn supported_voices(&self) -> Vec<String>;

    /// Audio format names the provider advertises.
    fn supported_formats(&self) -> Vec<String>;
}
39
/// TTS provider that POSTs a JSON speech request to an OpenAI-style endpoint.
pub struct OpenAiTtsProvider {
    /// Configured alias; returned from `Attributable::alias`.
    alias: String,
    /// API key, sent as a bearer token on every request.
    api_key: String,
    /// Value of the request's `model` field (defaults to `"tts-1"`).
    model: String,
    /// Value of the request's `speed` field (defaults to `1.0`).
    speed: f64,
    /// Full URL the request is POSTed to (defaults to the OpenAI speech endpoint).
    base_url: String,
    /// Value of the request's `response_format` field (defaults to `"opus"`).
    response_format: String,
    /// HTTP client built with `TTS_HTTP_TIMEOUT`.
    client: reqwest::Client,
}
57
58impl OpenAiTtsProvider {
59 pub fn new(alias: &str, config: &TtsProviderConfig) -> Result<Self> {
64 let api_key = config
65 .api_key
66 .as_deref()
67 .map(str::trim)
68 .filter(|k| !k.is_empty())
69 .map(ToOwned::to_owned)
70 .context(
71 "Missing OpenAI TTS API key: set `[tts_providers.openai.<alias>].api_key` (or via \
72 `ZEROCLAW_providers__tts__openai__<alias>__api_key=...`).",
73 )?;
74
75 Ok(Self {
76 alias: alias.to_string(),
77 api_key,
78 model: config
79 .model
80 .clone()
81 .filter(|m| !m.trim().is_empty())
82 .unwrap_or_else(|| "tts-1".to_string()),
83 speed: config.speed.unwrap_or(1.0),
84 base_url: config
85 .uri
86 .clone()
87 .filter(|u| !u.trim().is_empty())
88 .unwrap_or_else(|| "https://api.openai.com/v1/audio/speech".to_string()),
89 response_format: config
90 .response_format
91 .clone()
92 .filter(|f| !f.trim().is_empty())
93 .unwrap_or_else(|| "opus".to_string()),
94 client: reqwest::Client::builder()
95 .timeout(TTS_HTTP_TIMEOUT)
96 .build()
97 .context("Failed to build HTTP client for OpenAI TTS")?,
98 })
99 }
100}
101
102#[async_trait::async_trait]
103impl TtsProvider for OpenAiTtsProvider {
104 fn name(&self) -> &str {
105 "openai"
106 }
107
108 async fn synthesize(&self, text: &str, voice: &str) -> Result<Vec<u8>> {
109 let body = serde_json::json!({
110 "model": self.model,
111 "input": text,
112 "voice": voice,
113 "speed": self.speed,
114 "response_format": self.response_format,
115 });
116
117 let resp = self
118 .client
119 .post(&self.base_url)
120 .bearer_auth(&self.api_key)
121 .json(&body)
122 .send()
123 .await
124 .context("Failed to send OpenAI TTS request")?;
125
126 let status = resp.status();
127 if !status.is_success() {
128 let error_body: serde_json::Value = resp
129 .json()
130 .await
131 .unwrap_or_else(|_| serde_json::json!({"error": "unknown"}));
132 let msg = error_body["error"]["message"]
133 .as_str()
134 .unwrap_or("unknown error");
135 bail!("OpenAI TTS API error ({}): {}", status, msg);
136 }
137
138 let bytes = resp
139 .bytes()
140 .await
141 .context("Failed to read OpenAI TTS response body")?;
142 Ok(bytes.to_vec())
143 }
144
145 fn supported_voices(&self) -> Vec<String> {
146 ["alloy", "echo", "fable", "onyx", "nova", "shimmer"]
147 .iter()
148 .map(|s| (*s).to_string())
149 .collect()
150 }
151
152 fn supported_formats(&self) -> Vec<String> {
153 ["mp3", "opus", "aac", "flac", "wav", "pcm"]
154 .iter()
155 .map(|s| (*s).to_string())
156 .collect()
157 }
158}
159
/// TTS provider that calls the ElevenLabs text-to-speech HTTP API.
pub struct ElevenLabsTtsProvider {
    /// Configured alias; returned from `Attributable::alias`.
    alias: String,
    /// API key, sent in the `xi-api-key` request header.
    api_key: String,
    /// Value of the request's `model_id` field (defaults to `"eleven_monolingual_v1"`).
    model_id: String,
    /// `voice_settings.stability` sent with each request (defaults to `0.5`).
    stability: f64,
    /// `voice_settings.similarity_boost` sent with each request (defaults to `0.5`).
    similarity_boost: f64,
    /// HTTP client built with `TTS_HTTP_TIMEOUT`.
    client: reqwest::Client,
}
171
172impl ElevenLabsTtsProvider {
173 pub fn new(alias: &str, config: &TtsProviderConfig) -> Result<Self> {
177 let api_key = config
178 .api_key
179 .as_deref()
180 .map(str::trim)
181 .filter(|k| !k.is_empty())
182 .map(ToOwned::to_owned)
183 .context(
184 "Missing ElevenLabs API key: set `[tts_providers.elevenlabs.<alias>].api_key` (or \
185 via `ZEROCLAW_providers__tts__elevenlabs__<alias>__api_key=...`).",
186 )?;
187
188 Ok(Self {
189 alias: alias.to_string(),
190 api_key,
191 model_id: config
192 .model
193 .clone()
194 .filter(|m| !m.trim().is_empty())
195 .unwrap_or_else(|| "eleven_monolingual_v1".to_string()),
196 stability: config.stability.unwrap_or(0.5),
197 similarity_boost: config.similarity_boost.unwrap_or(0.5),
198 client: reqwest::Client::builder()
199 .timeout(TTS_HTTP_TIMEOUT)
200 .build()
201 .context("Failed to build HTTP client for ElevenLabs TTS")?,
202 })
203 }
204}
205
206#[async_trait::async_trait]
207impl TtsProvider for ElevenLabsTtsProvider {
208 fn name(&self) -> &str {
209 "elevenlabs"
210 }
211
212 async fn synthesize(&self, text: &str, voice: &str) -> Result<Vec<u8>> {
213 if !voice
214 .chars()
215 .all(|c| c.is_ascii_alphanumeric() || c == '-' || c == '_')
216 {
217 bail!("ElevenLabs voice ID contains invalid characters: {voice}");
218 }
219 let url = format!("https://api.elevenlabs.io/v1/text-to-speech/{voice}");
220 let body = serde_json::json!({
221 "text": text,
222 "model_id": self.model_id,
223 "voice_settings": {
224 "stability": self.stability,
225 "similarity_boost": self.similarity_boost,
226 },
227 });
228
229 let resp = self
230 .client
231 .post(&url)
232 .header("xi-api-key", &self.api_key)
233 .json(&body)
234 .send()
235 .await
236 .context("Failed to send ElevenLabs TTS request")?;
237
238 let status = resp.status();
239 if !status.is_success() {
240 let error_body: serde_json::Value = resp
241 .json()
242 .await
243 .unwrap_or_else(|_| serde_json::json!({"error": "unknown"}));
244 let msg = error_body["detail"]["message"]
245 .as_str()
246 .or_else(|| error_body["detail"].as_str())
247 .unwrap_or("unknown error");
248 bail!("ElevenLabs TTS API error ({}): {}", status, msg);
249 }
250
251 let bytes = resp
252 .bytes()
253 .await
254 .context("Failed to read ElevenLabs TTS response body")?;
255 Ok(bytes.to_vec())
256 }
257
258 fn supported_voices(&self) -> Vec<String> {
259 Vec::new()
261 }
262
263 fn supported_formats(&self) -> Vec<String> {
264 ["mp3", "pcm", "ulaw"]
265 .iter()
266 .map(|s| (*s).to_string())
267 .collect()
268 }
269}
270
/// TTS provider that calls the Google Cloud Text-to-Speech `text:synthesize` endpoint.
pub struct GoogleTtsProvider {
    /// Configured alias; returned from `Attributable::alias`.
    alias: String,
    /// API key, sent in the `x-goog-api-key` request header.
    api_key: String,
    /// Value of the request's `voice.languageCode` field (defaults to `"en-US"`).
    language_code: String,
    /// HTTP client built with `TTS_HTTP_TIMEOUT`.
    client: reqwest::Client,
}
280
281impl GoogleTtsProvider {
282 pub fn new(alias: &str, config: &TtsProviderConfig) -> Result<Self> {
286 let api_key = config
287 .api_key
288 .as_deref()
289 .map(str::trim)
290 .filter(|k| !k.is_empty())
291 .map(ToOwned::to_owned)
292 .context(
293 "Missing Google TTS API key: set `[tts_providers.google.<alias>].api_key` (or via \
294 `ZEROCLAW_providers__tts__google__<alias>__api_key=...`).",
295 )?;
296
297 Ok(Self {
298 alias: alias.to_string(),
299 api_key,
300 language_code: config
301 .language_code
302 .clone()
303 .filter(|c| !c.trim().is_empty())
304 .unwrap_or_else(|| "en-US".to_string()),
305 client: reqwest::Client::builder()
306 .timeout(TTS_HTTP_TIMEOUT)
307 .build()
308 .context("Failed to build HTTP client for Google TTS")?,
309 })
310 }
311}
312
313#[async_trait::async_trait]
314impl TtsProvider for GoogleTtsProvider {
315 fn name(&self) -> &str {
316 "google"
317 }
318
319 async fn synthesize(&self, text: &str, voice: &str) -> Result<Vec<u8>> {
320 let url = "https://texttospeech.googleapis.com/v1/text:synthesize";
321 let body = serde_json::json!({
322 "input": { "text": text },
323 "voice": {
324 "languageCode": self.language_code,
325 "name": voice,
326 },
327 "audioConfig": {
328 "audioEncoding": "MP3",
329 },
330 });
331
332 let resp = self
333 .client
334 .post(url)
335 .header("x-goog-api-key", &self.api_key)
336 .json(&body)
337 .send()
338 .await
339 .context("Failed to send Google TTS request")?;
340
341 let status = resp.status();
342 let resp_body: serde_json::Value = resp
343 .json()
344 .await
345 .context("Failed to parse Google TTS response")?;
346
347 if !status.is_success() {
348 let msg = resp_body["error"]["message"]
349 .as_str()
350 .unwrap_or("unknown error");
351 bail!("Google TTS API error ({}): {}", status, msg);
352 }
353
354 let audio_b64 = resp_body["audioContent"]
355 .as_str()
356 .context("Google TTS response missing 'audioContent' field")?;
357
358 use base64::Engine;
359 let bytes = base64::engine::general_purpose::STANDARD
360 .decode(audio_b64)
361 .context("Failed to decode Google TTS base64 audio")?;
362 Ok(bytes)
363 }
364
365 fn supported_voices(&self) -> Vec<String> {
366 [
368 "en-US-Standard-A",
369 "en-US-Standard-B",
370 "en-US-Standard-C",
371 "en-US-Standard-D",
372 ]
373 .iter()
374 .map(|s| (*s).to_string())
375 .collect()
376 }
377
378 fn supported_formats(&self) -> Vec<String> {
379 ["mp3", "wav", "ogg"]
380 .iter()
381 .map(|s| (*s).to_string())
382 .collect()
383 }
384}
385
/// TTS provider that shells out to an Edge TTS command-line binary.
pub struct EdgeTtsProvider {
    /// Configured alias; returned from `Attributable::alias`.
    alias: String,
    /// Bare command name to execute; validated against `ALLOWED_BINARIES` in `new`.
    binary_path: String,
}
393
394impl EdgeTtsProvider {
395 const ALLOWED_BINARIES: &[&str] = &["edge-tts", "edge-playback"];
397
398 pub fn new(alias: &str, config: &TtsProviderConfig) -> Result<Self> {
404 let raw_path = config
405 .binary_path
406 .clone()
407 .filter(|p| !p.trim().is_empty())
408 .unwrap_or_else(|| "edge-tts".to_string());
409 if raw_path.contains('/') || raw_path.contains('\\') {
410 bail!(
411 "Edge TTS binary_path must be a bare command name without path separators, got: {raw_path}"
412 );
413 }
414 if !Self::ALLOWED_BINARIES.contains(&raw_path.as_str()) {
415 bail!(
416 "Edge TTS binary_path must be one of {:?}, got: {raw_path}",
417 Self::ALLOWED_BINARIES,
418 );
419 }
420 Ok(Self {
421 alias: alias.to_string(),
422 binary_path: raw_path,
423 })
424 }
425}
426
427#[async_trait::async_trait]
428impl TtsProvider for EdgeTtsProvider {
429 fn name(&self) -> &str {
430 "edge"
431 }
432
433 async fn synthesize(&self, text: &str, voice: &str) -> Result<Vec<u8>> {
434 let temp_dir = std::env::temp_dir();
435 let output_file = temp_dir.join(format!("zeroclaw_tts_{}.mp3", uuid::Uuid::new_v4()));
436 let output_path = output_file
437 .to_str()
438 .context("Failed to build temp file path for Edge TTS")?;
439
440 let output = tokio::time::timeout(
441 TTS_HTTP_TIMEOUT,
442 tokio::process::Command::new(&self.binary_path)
443 .arg("--text")
444 .arg(text)
445 .arg("--voice")
446 .arg(voice)
447 .arg("--write-media")
448 .arg(output_path)
449 .output(),
450 )
451 .await
452 .context("Edge TTS subprocess timed out")?
453 .context("Failed to spawn edge-tts subprocess")?;
454
455 if !output.status.success() {
456 let stderr = String::from_utf8_lossy(&output.stderr);
457 let _ = tokio::fs::remove_file(&output_file).await;
459 bail!("edge-tts failed (exit {}): {}", output.status, stderr);
460 }
461
462 let bytes = tokio::fs::read(&output_file)
463 .await
464 .context("Failed to read edge-tts output file")?;
465
466 let _ = tokio::fs::remove_file(&output_file).await;
468
469 Ok(bytes)
470 }
471
472 fn supported_voices(&self) -> Vec<String> {
473 [
475 "en-US-AriaNeural",
476 "en-US-GuyNeural",
477 "en-US-JennyNeural",
478 "en-GB-SoniaNeural",
479 ]
480 .iter()
481 .map(|s| (*s).to_string())
482 .collect()
483 }
484
485 fn supported_formats(&self) -> Vec<String> {
486 vec!["mp3".to_string()]
487 }
488}
489
/// TTS provider that POSTs a JSON speech request to a Piper HTTP endpoint.
pub struct PiperTtsProvider {
    /// Configured alias; returned from `Attributable::alias`.
    alias: String,
    /// HTTP client built with `TTS_HTTP_TIMEOUT`.
    client: reqwest::Client,
    /// Full URL the request is POSTed to
    /// (defaults to `http://127.0.0.1:5000/v1/audio/speech`).
    api_url: String,
}
498
499impl PiperTtsProvider {
500 pub fn new(alias: &str, config: &TtsProviderConfig) -> Self {
503 let api_url = config
504 .uri
505 .clone()
506 .filter(|u| !u.trim().is_empty())
507 .unwrap_or_else(|| "http://127.0.0.1:5000/v1/audio/speech".to_string());
508 Self {
509 alias: alias.to_string(),
510 client: reqwest::Client::builder()
511 .timeout(TTS_HTTP_TIMEOUT)
512 .build()
513 .expect("Failed to build HTTP client for Piper TTS"),
514 api_url,
515 }
516 }
517}
518
519#[async_trait::async_trait]
520impl TtsProvider for PiperTtsProvider {
521 fn name(&self) -> &str {
522 "piper"
523 }
524
525 async fn synthesize(&self, text: &str, voice: &str) -> Result<Vec<u8>> {
526 let body = serde_json::json!({
527 "model": "tts-1",
528 "input": text,
529 "voice": voice,
530 });
531
532 let resp = self
533 .client
534 .post(&self.api_url)
535 .json(&body)
536 .send()
537 .await
538 .context("Failed to send Piper TTS request")?;
539
540 let status = resp.status();
541 if !status.is_success() {
542 let error_body: serde_json::Value = resp
543 .json()
544 .await
545 .unwrap_or_else(|_| serde_json::json!({"error": "unknown"}));
546 let msg = error_body["error"]["message"]
547 .as_str()
548 .unwrap_or("unknown error");
549 bail!("Piper TTS API error ({}): {}", status, msg);
550 }
551
552 let bytes = resp
553 .bytes()
554 .await
555 .context("Failed to read Piper TTS response body")?;
556 Ok(bytes.to_vec())
557 }
558
559 fn supported_voices(&self) -> Vec<String> {
560 Vec::new()
562 }
563
564 fn supported_formats(&self) -> Vec<String> {
565 ["mp3", "wav", "opus"]
566 .iter()
567 .map(|s| (*s).to_string())
568 .collect()
569 }
570}
571
/// Registry of configured TTS providers plus the settings used to pick a
/// provider and voice for a synthesis request.
pub struct TtsManager {
    /// Successfully constructed providers, keyed by `"<family>.<alias>"`.
    tts_providers: HashMap<String, Box<dyn TtsProvider>>,
    /// Voice configured per provider, using the same `"<family>.<alias>"` key.
    /// Only present for entries whose configured voice is non-blank.
    voice_by_alias: HashMap<String, String>,
    /// Provider key used by `synthesize`; empty when no agent `tts_provider`
    /// could be resolved from the config.
    agent_tts_provider: String,
    /// Voice used by `synthesize` when the provider has no entry in `voice_by_alias`.
    default_voice: String,
    /// Maximum accepted text length, counted in `char`s.
    max_text_length: usize,
}
591
592impl TtsManager {
593 pub fn from_config(config: &Config) -> Result<Self> {
603 let mut tts_providers: HashMap<String, Box<dyn TtsProvider>> = HashMap::new();
604 let mut voice_by_alias: HashMap<String, String> = HashMap::new();
605
606 for (family, alias, instance) in config.providers.tts.iter_entries() {
610 let dotted = format!("{family}.{alias}");
611 let result: Result<Box<dyn TtsProvider>> = match family {
612 "openai" => OpenAiTtsProvider::new(alias, instance).map(|p| Box::new(p) as _),
613 "elevenlabs" => {
614 ElevenLabsTtsProvider::new(alias, instance).map(|p| Box::new(p) as _)
615 }
616 "google" => GoogleTtsProvider::new(alias, instance).map(|p| Box::new(p) as _),
617 "edge" => EdgeTtsProvider::new(alias, instance).map(|p| Box::new(p) as _),
618 "piper" => Ok(Box::new(PiperTtsProvider::new(alias, instance)) as _),
619 _ => unreachable!("TtsProviders typed slots cover all 5 families"),
620 };
621 match result {
622 Ok(p) => {
623 tts_providers.insert(dotted.clone(), p);
624 if let Some(voice) = instance
625 .voice
626 .as_deref()
627 .map(str::trim)
628 .filter(|v| !v.is_empty())
629 {
630 voice_by_alias.insert(dotted, voice.to_string());
631 }
632 }
633 Err(e) => {
634 ::zeroclaw_log::record!(
635 WARN,
636 ::zeroclaw_log::Event::new(module_path!(), ::zeroclaw_log::Action::Note)
637 .with_outcome(::zeroclaw_log::EventOutcome::Unknown)
638 .with_attrs(
639 ::serde_json::json!({"error": format!("{}", e), "dotted": dotted})
640 ),
641 "Skipping TTS provider"
642 );
643 }
644 }
645 }
646
647 let max_text_length = if config.tts.max_text_length == 0 {
648 DEFAULT_MAX_TEXT_LENGTH
649 } else {
650 config.tts.max_text_length
651 };
652
653 let agent_tts_provider = config
658 .resolved_runtime_agent_alias()
659 .and_then(|alias| config.agents.get(alias))
660 .map(|a| a.tts_provider.as_str().to_string())
661 .unwrap_or_default();
662
663 Ok(Self {
664 tts_providers,
665 voice_by_alias,
666 agent_tts_provider,
667 default_voice: config.tts.default_voice.clone(),
668 max_text_length,
669 })
670 }
671
672 pub async fn synthesize(&self, text: &str) -> Result<Vec<u8>> {
679 let provider_alias = self.agent_tts_provider.as_str();
680 if provider_alias.is_empty() {
681 bail!(
682 "Agent has no tts_provider configured. Set \
683 `agent.<alias>.tts_provider = \"<type>.<alias>\"` referencing a \
684 [tts_providers.<type>.<alias>] entry."
685 );
686 }
687 let voice = self
688 .voice_by_alias
689 .get(provider_alias)
690 .map_or(self.default_voice.as_str(), String::as_str);
691 self.synthesize_with_provider(text, provider_alias, voice)
692 .await
693 }
694
695 pub async fn synthesize_with_provider(
697 &self,
698 text: &str,
699 provider_alias: &str,
700 voice: &str,
701 ) -> Result<Vec<u8>> {
702 if text.is_empty() {
703 bail!("TTS text must not be empty");
704 }
705 let char_count = text.chars().count();
706 if char_count > self.max_text_length {
707 bail!(
708 "TTS text too long ({} chars, max {})",
709 char_count,
710 self.max_text_length
711 );
712 }
713
714 let tts = self.tts_providers.get(provider_alias).ok_or_else(|| {
715 let available = self.available_providers().join(", ");
716 ::zeroclaw_log::record!(
717 ERROR,
718 ::zeroclaw_log::Event::new(module_path!(), ::zeroclaw_log::Action::Reject)
719 .with_outcome(::zeroclaw_log::EventOutcome::Failure)
720 .with_attrs(::serde_json::json!({
721 "tts_provider": provider_alias,
722 "available": available,
723 })),
724 "tts: provider not configured"
725 );
726 anyhow::Error::msg(format!(
727 "TTS model_provider '{}' not configured (available: {})",
728 provider_alias, available
729 ))
730 })?;
731
732 use ::zeroclaw_log::Instrument;
733 let span = ::zeroclaw_log::attribution_span!(tts.as_ref());
734 ::zeroclaw_log::scope!(voice: voice, => tts.synthesize(text, voice))
735 .instrument(span)
736 .await
737 }
738
739 pub fn available_providers(&self) -> Vec<String> {
741 let mut names: Vec<_> = self.tts_providers.keys().cloned().collect();
742 names.sort();
743 names
744 }
745}
746
747impl ::zeroclaw_api::attribution::Attributable for OpenAiTtsProvider {
750 fn role(&self) -> ::zeroclaw_api::attribution::Role {
751 ::zeroclaw_api::attribution::Role::Provider(::zeroclaw_api::attribution::ProviderKind::Tts(
752 ::zeroclaw_api::attribution::TtsProviderKind::OpenAi,
753 ))
754 }
755 fn alias(&self) -> &str {
756 &self.alias
757 }
758}
759
760impl ::zeroclaw_api::attribution::Attributable for ElevenLabsTtsProvider {
761 fn role(&self) -> ::zeroclaw_api::attribution::Role {
762 ::zeroclaw_api::attribution::Role::Provider(::zeroclaw_api::attribution::ProviderKind::Tts(
763 ::zeroclaw_api::attribution::TtsProviderKind::ElevenLabs,
764 ))
765 }
766 fn alias(&self) -> &str {
767 &self.alias
768 }
769}
770
771impl ::zeroclaw_api::attribution::Attributable for GoogleTtsProvider {
772 fn role(&self) -> ::zeroclaw_api::attribution::Role {
773 ::zeroclaw_api::attribution::Role::Provider(::zeroclaw_api::attribution::ProviderKind::Tts(
774 ::zeroclaw_api::attribution::TtsProviderKind::Google,
775 ))
776 }
777 fn alias(&self) -> &str {
778 &self.alias
779 }
780}
781
782impl ::zeroclaw_api::attribution::Attributable for EdgeTtsProvider {
783 fn role(&self) -> ::zeroclaw_api::attribution::Role {
784 ::zeroclaw_api::attribution::Role::Provider(::zeroclaw_api::attribution::ProviderKind::Tts(
785 ::zeroclaw_api::attribution::TtsProviderKind::Edge,
786 ))
787 }
788 fn alias(&self) -> &str {
789 &self.alias
790 }
791}
792
793impl ::zeroclaw_api::attribution::Attributable for PiperTtsProvider {
794 fn role(&self) -> ::zeroclaw_api::attribution::Role {
795 ::zeroclaw_api::attribution::Role::Provider(::zeroclaw_api::attribution::ProviderKind::Tts(
796 ::zeroclaw_api::attribution::TtsProviderKind::Piper,
797 ))
798 }
799 fn alias(&self) -> &str {
800 &self.alias
801 }
802}
803
// Unit tests for the TTS manager and providers. The OpenAI request tests use a
// local `wiremock` server.
#[cfg(test)]
mod tests {
    use super::*;

    /// Config with a `default` agent pointing at `edge.default` and a matching
    /// Edge provider entry using the `edge-tts` binary.
    fn config_with_edge_alias() -> Config {
        let mut cfg = Config::default();
        cfg.agents.insert(
            "default".into(),
            zeroclaw_config::schema::AliasedAgentConfig {
                tts_provider: "edge.default".into(),
                ..Default::default()
            },
        );
        cfg.providers.tts.edge.insert(
            "default".to_string(),
            zeroclaw_config::schema::EdgeTtsProviderConfig {
                base: TtsProviderConfig {
                    binary_path: Some("edge-tts".to_string()),
                    ..TtsProviderConfig::default()
                },
            },
        );
        cfg
    }

    /// Config with a `default` agent pointing at `piper.default` and a matching
    /// Piper provider entry with an explicit local URI.
    fn config_with_piper_alias() -> Config {
        let mut cfg = Config::default();
        cfg.agents.insert(
            "default".into(),
            zeroclaw_config::schema::AliasedAgentConfig {
                tts_provider: "piper.default".into(),
                ..Default::default()
            },
        );
        cfg.providers.tts.piper.insert(
            "default".to_string(),
            zeroclaw_config::schema::PiperTtsProviderConfig {
                base: TtsProviderConfig {
                    uri: Some("http://127.0.0.1:5000/v1/audio/speech".to_string()),
                    ..TtsProviderConfig::default()
                },
            },
        );
        cfg
    }

    /// A default config registers no providers.
    #[test]
    fn tts_manager_creation_with_defaults() {
        let config = Config::default();
        let manager = TtsManager::from_config(&config).unwrap();
        assert!(manager.available_providers().is_empty());
    }

    /// Providers are registered under the `"<family>.<alias>"` key.
    #[test]
    fn tts_manager_registers_alias_keyed_provider() {
        let cfg = config_with_edge_alias();
        let manager = TtsManager::from_config(&cfg).unwrap();
        assert_eq!(manager.available_providers(), vec!["edge.default"]);
    }

    /// Empty text is rejected before the provider is invoked.
    #[tokio::test]
    async fn tts_rejects_empty_text() {
        let cfg = config_with_edge_alias();
        let manager = TtsManager::from_config(&cfg).unwrap();
        let err = manager
            .synthesize_with_provider("", "edge.default", "en-US-AriaNeural")
            .await
            .unwrap_err();
        assert!(
            err.to_string().contains("must not be empty"),
            "expected empty-text error, got: {err}"
        );
    }

    /// Text one character over the configured limit is rejected.
    #[tokio::test]
    async fn tts_rejects_text_exceeding_max_length() {
        let mut cfg = config_with_edge_alias();
        cfg.tts.max_text_length = 10;
        let manager = TtsManager::from_config(&cfg).unwrap();
        let long_text = "a".repeat(11);
        let err = manager
            .synthesize_with_provider(&long_text, "edge.default", "en-US-AriaNeural")
            .await
            .unwrap_err();
        assert!(
            err.to_string().contains("too long"),
            "expected too-long error, got: {err}"
        );
    }

    /// A provider key that was never registered yields a "not configured" error.
    #[tokio::test]
    async fn tts_rejects_unknown_provider() {
        let cfg = Config::default();
        let manager = TtsManager::from_config(&cfg).unwrap();
        let err = manager
            .synthesize_with_provider("hello", "nonexistent.alias", "voice")
            .await
            .unwrap_err();
        assert!(
            err.to_string().contains("not configured"),
            "expected not-configured error, got: {err}"
        );
    }

    /// With no `uri` configured, Piper uses the local default endpoint and
    /// reports its fixed formats and an empty voice list.
    #[test]
    fn piper_provider_creation_uses_default_url_when_unset() {
        let model_provider = PiperTtsProvider::new("test", &TtsProviderConfig::default());
        assert_eq!(model_provider.name(), "piper");
        assert_eq!(
            model_provider.api_url,
            "http://127.0.0.1:5000/v1/audio/speech"
        );
        assert_eq!(
            model_provider.supported_formats(),
            vec!["mp3", "wav", "opus"]
        );
        assert!(model_provider.supported_voices().is_empty());
    }

    /// A configured Piper entry is registered as `piper.default`.
    #[test]
    fn tts_manager_with_piper_alias() {
        let cfg = config_with_piper_alias();
        let manager = TtsManager::from_config(&cfg).unwrap();
        assert_eq!(manager.available_providers(), vec!["piper.default"]);
    }

    /// The empty-text guard applies to the Piper provider as well.
    #[tokio::test]
    async fn tts_rejects_empty_text_for_piper() {
        let cfg = config_with_piper_alias();
        let manager = TtsManager::from_config(&cfg).unwrap();
        let err = manager
            .synthesize_with_provider("", "piper.default", "default")
            .await
            .unwrap_err();
        assert!(
            err.to_string().contains("must not be empty"),
            "expected empty-text error, got: {err}"
        );
    }

    /// Pins the default values of `TtsConfig` that this module relies on.
    #[test]
    fn tts_config_defaults() {
        let config = zeroclaw_config::schema::TtsConfig::default();
        assert!(!config.enabled);
        assert_eq!(config.default_voice, "alloy");
        assert_eq!(config.default_format, "mp3");
        assert_eq!(config.max_text_length, DEFAULT_MAX_TEXT_LENGTH);
    }

    /// A configured limit of `0` is replaced by `DEFAULT_MAX_TEXT_LENGTH`.
    #[test]
    fn tts_manager_max_text_length_zero_uses_default() {
        let mut cfg = Config::default();
        cfg.tts.max_text_length = 0;
        let manager = TtsManager::from_config(&cfg).unwrap();
        assert_eq!(manager.max_text_length, DEFAULT_MAX_TEXT_LENGTH);
    }

    /// The OpenAI provider POSTs to the configured `uri`, returns the served
    /// bytes, and forwards the configured `response_format` in the body.
    #[tokio::test]
    async fn synthesize_posts_to_configured_uri_with_response_format() {
        use wiremock::matchers::{method, path};
        use wiremock::{Mock, MockServer, ResponseTemplate};

        let server = MockServer::start().await;
        Mock::given(method("POST"))
            .and(path("/v1/audio/speech"))
            .respond_with(ResponseTemplate::new(200).set_body_bytes(b"FAKE_WAV".to_vec()))
            .mount(&server)
            .await;

        let cfg = TtsProviderConfig {
            api_key: Some("sk-test".to_string()),
            uri: Some(format!("{}/v1/audio/speech", server.uri())),
            response_format: Some("wav".to_string()),
            ..TtsProviderConfig::default()
        };
        let provider = OpenAiTtsProvider::new("test", &cfg).unwrap();

        let audio = provider.synthesize("hello world", "hannah").await.unwrap();
        assert_eq!(
            audio, b"FAKE_WAV",
            "synthesize should return the bytes served by the configured endpoint"
        );

        let reqs = server.received_requests().await.unwrap();
        assert_eq!(
            reqs.len(),
            1,
            "exactly one POST should reach the configured uri"
        );
        let body: serde_json::Value = serde_json::from_slice(&reqs[0].body).unwrap();
        assert_eq!(
            body["response_format"], "wav",
            "configured response_format must reach the outgoing request body"
        );
        assert_eq!(body["input"], "hello world");
        assert_eq!(body["voice"], "hannah");
        assert_eq!(body["model"], "tts-1");
    }

    /// With `response_format` unset, the outgoing request body carries `"opus"`.
    #[tokio::test]
    async fn synthesize_defaults_response_format_to_opus_when_unset() {
        use wiremock::matchers::{method, path};
        use wiremock::{Mock, MockServer, ResponseTemplate};

        let server = MockServer::start().await;
        Mock::given(method("POST"))
            .and(path("/v1/audio/speech"))
            .respond_with(ResponseTemplate::new(200).set_body_bytes(b"AUDIO".to_vec()))
            .mount(&server)
            .await;

        let cfg = TtsProviderConfig {
            api_key: Some("sk-test".to_string()),
            uri: Some(format!("{}/v1/audio/speech", server.uri())),
            ..TtsProviderConfig::default()
        };
        let provider = OpenAiTtsProvider::new("test", &cfg).unwrap();
        provider.synthesize("hi", "alloy").await.unwrap();

        let reqs = server.received_requests().await.unwrap();
        let body: serde_json::Value = serde_json::from_slice(&reqs[0].body).unwrap();
        assert_eq!(
            body["response_format"], "opus",
            "unset response_format must default to opus in the outgoing request"
        );
    }

    /// With `uri` unset, the OpenAI provider targets the public OpenAI speech
    /// endpoint and defaults the format to `"opus"`.
    #[test]
    fn openai_defaults_to_production_endpoint_when_uri_unset() {
        let cfg = TtsProviderConfig {
            api_key: Some("sk-test".to_string()),
            ..TtsProviderConfig::default()
        };
        let provider = OpenAiTtsProvider::new("test", &cfg).unwrap();
        assert_eq!(provider.base_url, "https://api.openai.com/v1/audio/speech");
        assert_eq!(provider.response_format, "opus");
    }
}