1use base64::{Engine as _, engine::general_purpose::STANDARD};
2use reqwest::Client;
3use std::collections::{HashMap, HashSet};
4use std::path::Path;
5use zeroclaw_api::model_provider::ChatMessage;
6use zeroclaw_config::schema::{MultimodalConfig, build_runtime_proxy_client_with_timeouts};
7
/// Opening token of an inline image marker; a complete marker is `[IMAGE:<reference>]`.
const IMAGE_MARKER_PREFIX: &str = "[IMAGE:";
/// MIME types accepted by `validate_mime`; any other type is rejected as unsupported.
const ALLOWED_IMAGE_MIME_TYPES: &[&str] = &["image/png", "image/jpeg", "image/webp", "image/gif"];
17
/// In-memory cache of encoded local images, keyed by the image path.
///
/// Every entry remembers the file length and modification time it was stored
/// with, so a lookup can tell whether the cached data URI is still current.
/// Entries stored with length `0` and mtime `0` are never considered stale.
#[derive(Debug, Default)]
pub struct LocalImageCache {
    /// `path -> (file length, mtime, data URI)`.
    entries: HashMap<String, (u64, i64, String)>,
    /// Paths ordered from least recently used (front) to most recently used (back).
    order: std::collections::VecDeque<String>,
    /// Combined length, in bytes, of every cached data URI.
    bytes: usize,
}

/// Maximum number of images kept in a [`LocalImageCache`].
const LOCAL_IMAGE_CACHE_MAX_ENTRIES: usize = 32;
/// Maximum combined data-URI length kept in a [`LocalImageCache`] (64 MiB).
const LOCAL_IMAGE_CACHE_MAX_BYTES: usize = 64 * 1024 * 1024;

impl LocalImageCache {
    /// Creates an empty cache.
    pub fn new() -> Self {
        Self::default()
    }

    /// Removes `path` from the recency queue and hands back the owned key, if queued.
    fn detach(&mut self, path: &str) -> Option<String> {
        let slot = self.order.iter().position(|queued| queued == path)?;
        self.order.remove(slot)
    }

    /// Looks up the data URI cached for `path`.
    ///
    /// Returns `None` when nothing is cached, or when the entry was stored
    /// with a different `len`/`mtime` (unless it was stored as `0`/`0`, which
    /// always matches). A hit marks the entry as most recently used.
    fn get(&mut self, path: &str, len: u64, mtime: i64) -> Option<&str> {
        let (stored_len, stored_mtime) = self
            .entries
            .get(path)
            .map(|(stored_len, stored_mtime, _)| (*stored_len, *stored_mtime))?;

        let pinned = stored_len == 0 && stored_mtime == 0;
        let unchanged = stored_len == len && stored_mtime == mtime;
        if !(pinned || unchanged) {
            return None;
        }

        // Refresh recency: move the key to the back of the queue.
        if let Some(key) = self.detach(path) {
            self.order.push_back(key);
        }

        self.entries.get(path).map(|(_, _, uri)| uri.as_str())
    }

    /// Stores `data_uri` for `path`, replacing any previous entry, then evicts
    /// least-recently-used entries until both the entry and byte limits hold.
    fn insert(&mut self, path: String, len: u64, mtime: i64, data_uri: String) {
        if let Some((_, _, replaced)) = self.entries.remove(&path) {
            self.bytes = self.bytes.saturating_sub(replaced.len());
            let _ = self.detach(&path);
        }

        self.bytes += data_uri.len();
        self.order.push_back(path.clone());
        self.entries.insert(path, (len, mtime, data_uri));

        loop {
            let over_budget = self.entries.len() > LOCAL_IMAGE_CACHE_MAX_ENTRIES
                || self.bytes > LOCAL_IMAGE_CACHE_MAX_BYTES;
            if !over_budget {
                break;
            }
            let Some(evicted) = self.order.pop_front() else {
                break;
            };
            if let Some((_, _, uri)) = self.entries.remove(&evicted) {
                self.bytes = self.bytes.saturating_sub(uri.len());
            }
        }
    }

    /// Number of cached images.
    pub fn len(&self) -> usize {
        self.entries.len()
    }

    /// Whether the cache holds no images.
    pub fn is_empty(&self) -> bool {
        self.entries.is_empty()
    }
}
80
/// Result of preparing a conversation for a model provider.
#[derive(Debug, Clone)]
pub struct PreparedMessages {
    /// The messages after image-marker normalization and trimming.
    pub messages: Vec<ChatMessage>,
    /// Whether `messages` still contains at least one image marker that
    /// `count_image_markers` recognizes.
    pub contains_images: bool,
}
86
/// Failures that can occur while resolving an image marker into a data URI.
///
/// The `input` field of each variant carries the image reference that was
/// being processed.
#[derive(Debug, thiserror::Error)]
pub enum MultimodalError {
    /// More images were found than the configured maximum allows.
    #[error("multimodal image limit exceeded: max_images={max_images}, found={found}")]
    TooManyImages { max_images: usize, found: usize },

    /// The image is larger than the allowed number of bytes.
    #[error(
        "multimodal image size limit exceeded for '{input}': {size_bytes} bytes > {max_bytes} bytes"
    )]
    ImageTooLarge {
        input: String,
        size_bytes: usize,
        max_bytes: usize,
    },

    /// The image's MIME type is not in the allow-list (or could not be determined).
    #[error("multimodal image MIME type is not allowed for '{input}': {mime}")]
    UnsupportedMime { input: String, mime: String },

    /// The reference is an HTTP(S) URL but remote fetching is turned off.
    #[error("multimodal remote image fetch is disabled for '{input}'")]
    RemoteFetchDisabled { input: String },

    /// The local path does not exist or is not a regular file.
    #[error("multimodal image source not found or unreadable: '{input}'")]
    ImageSourceNotFound { input: String },

    /// The marker content is malformed (for example a bad data URI).
    #[error("invalid multimodal image marker '{input}': {reason}")]
    InvalidMarker { input: String, reason: String },

    /// The HTTP request for a remote image failed or returned a non-success status.
    #[error("failed to download remote image '{input}': {reason}")]
    RemoteFetchFailed { input: String, reason: String },

    /// Reading a local image (or its metadata) failed.
    #[error("failed to read local image '{input}': {reason}")]
    LocalReadFailed { input: String, reason: String },
}
119
120fn is_loadable_image_reference(candidate: &str) -> bool {
126 candidate.starts_with('/')
127 || candidate.starts_with("http://")
128 || candidate.starts_with("https://")
129 || candidate.starts_with("data:")
130 || is_windows_path(candidate)
131 || is_windows_unc_path(candidate)
132}
133
/// Returns `true` when `candidate` starts with a Windows drive prefix:
/// an ASCII letter, a colon, then `\` or `/` (for example `C:\` or `d:/`).
fn is_windows_path(candidate: &str) -> bool {
    let mut leading = candidate.chars();
    match (leading.next(), leading.next(), leading.next()) {
        (Some(drive), Some(':'), Some('\\' | '/')) => drive.is_ascii_alphabetic(),
        _ => false,
    }
}
151
/// Returns `true` when `candidate` is a Windows UNC path of the form
/// `\\server\share...` with a non-empty server and a non-empty remainder.
///
/// Paths whose first character after `\\` is `?` or `.` (such as `\\?\...`
/// and `\\.\...`) are rejected.
fn is_windows_unc_path(candidate: &str) -> bool {
    match candidate.strip_prefix(r"\\") {
        None => false,
        Some(rest) if rest.starts_with(['?', '.']) => false,
        Some(rest) => match rest.split_once(['\\', '/']) {
            Some((server, share)) => !server.is_empty() && !share.is_empty(),
            // No separator at all means there is no share component.
            None => false,
        },
    }
}
174
/// Re-joins a marker body that was wrapped across several lines.
///
/// Line breaks (`\n`, `\r`) are removed together with any whitespace that
/// directly follows them; whitespace elsewhere is kept. The result is trimmed.
/// Input without any line break is simply trimmed.
fn collapse_wrapped_marker(raw: &str) -> String {
    if !raw.contains(['\n', '\r']) {
        return raw.trim().to_string();
    }

    let mut joined = String::with_capacity(raw.len());
    // True while we are still consuming the indentation that follows a break.
    let mut after_break = false;
    for ch in raw.chars() {
        if matches!(ch, '\n' | '\r') {
            after_break = true;
        } else if !(after_break && ch.is_whitespace()) {
            after_break = false;
            joined.push(ch);
        }
    }
    joined.trim().to_string()
}
202
203pub fn parse_image_markers(content: &str) -> (String, Vec<String>) {
204 let mut refs = Vec::new();
205 let mut cleaned = String::with_capacity(content.len());
206 let mut cursor = 0usize;
207
208 while let Some(rel_start) = content[cursor..].find(IMAGE_MARKER_PREFIX) {
209 let start = cursor + rel_start;
210 cleaned.push_str(&content[cursor..start]);
211
212 let marker_start = start + IMAGE_MARKER_PREFIX.len();
213 let Some(rel_end) = content[marker_start..].find(']') else {
214 cleaned.push_str(&content[start..]);
215 cursor = content.len();
216 break;
217 };
218
219 let end = marker_start + rel_end;
220 let candidate = collapse_wrapped_marker(&content[marker_start..end]);
221
222 if candidate.is_empty() || !is_loadable_image_reference(&candidate) {
223 cleaned.push_str(&content[start..=end]);
227 } else {
228 refs.push(candidate);
229 }
230
231 cursor = end + 1;
232 }
233
234 if cursor < content.len() {
235 cleaned.push_str(&content[cursor..]);
236 }
237
238 (cleaned.trim().to_string(), refs)
239}
240
241pub fn count_image_markers(messages: &[ChatMessage]) -> usize {
242 let latest_tool_indices = latest_tool_result_indices(messages);
243 count_image_markers_with_latest_tool_results(messages, &latest_tool_indices)
244}
245
246fn count_image_markers_with_latest_tool_results(
247 messages: &[ChatMessage],
248 latest_tool_result_indices: &HashSet<usize>,
249) -> usize {
250 messages
251 .iter()
252 .enumerate()
253 .filter(|(index, message)| {
254 should_normalize_message_images(*index, message, latest_tool_result_indices)
255 })
256 .map(|(_, message)| parse_image_markers(&message.content).1.len())
257 .sum()
258}
259
260pub fn contains_image_markers(messages: &[ChatMessage]) -> bool {
261 count_image_markers(messages) > 0
262}
263
264pub fn count_user_image_markers(messages: &[ChatMessage]) -> usize {
274 messages
275 .iter()
276 .filter(|message| message.role == "user" && !is_prompt_tool_result_message(message))
277 .map(|message| parse_image_markers(&message.content).1.len())
278 .sum()
279}
280
281pub fn strip_media_markers(text: &str) -> String {
298 static RE: std::sync::LazyLock<regex::Regex> = std::sync::LazyLock::new(|| {
299 regex::Regex::new(r"(?i)\[(?:IMAGE|PHOTO|DOCUMENT|FILE|VIDEO|VOICE|AUDIO):[^\]]*\]")
300 .unwrap()
301 });
302 RE.replace_all(text, "[media attachment]").into_owned()
303}
304
/// Extracts the payload to hand over for an image reference.
///
/// For a `data:` URI this is everything after the first comma; for any other
/// reference it is the reference itself. The payload is trimmed, and `None`
/// is returned when it is empty or when a `data:` URI has no comma.
pub fn extract_ollama_image_payload(image_ref: &str) -> Option<String> {
    let payload = if image_ref.starts_with("data:") {
        let (_, after_comma) = image_ref.split_once(',')?;
        after_comma
    } else {
        image_ref
    };

    let payload = payload.trim();
    (!payload.is_empty()).then(|| payload.to_string())
}
319
320fn is_prompt_tool_result_message(message: &ChatMessage) -> bool {
321 message.role == "user" && message.content.trim_start().starts_with("[Tool results]")
322}
323
324fn is_tool_result_carrier(message: &ChatMessage) -> bool {
325 message.role == "tool" || is_prompt_tool_result_message(message)
326}
327
328fn latest_tool_result_indices(messages: &[ChatMessage]) -> HashSet<usize> {
329 let mut indices = HashSet::new();
330 let Some((last_index, last_message)) = messages.iter().enumerate().next_back() else {
331 return indices;
332 };
333
334 if is_prompt_tool_result_message(last_message) {
335 indices.insert(last_index);
336 return indices;
337 }
338
339 if last_message.role == "tool" {
340 for (index, message) in messages.iter().enumerate().rev() {
341 if message.role != "tool" {
342 break;
343 }
344 indices.insert(index);
345 }
346 }
347
348 indices
349}
350
351fn should_normalize_message_images(
352 index: usize,
353 message: &ChatMessage,
354 latest_tool_result_indices: &HashSet<usize>,
355) -> bool {
356 if is_tool_result_carrier(message) {
357 return latest_tool_result_indices.contains(&index);
358 }
359
360 message.role == "user"
361}
362
363fn stripped_image_marker_text(content: &str) -> String {
364 let (cleaned, refs) = parse_image_markers(content);
365 if refs.is_empty() {
366 return content.to_string();
367 }
368
369 if cleaned.trim().is_empty() {
370 "[image removed from history]".to_string()
371 } else {
372 cleaned
373 }
374}
375
376fn strip_tool_result_image_markers(message: &ChatMessage) -> ChatMessage {
377 if !message.content.contains(IMAGE_MARKER_PREFIX) {
378 return message.clone();
379 }
380
381 if message.role == "tool"
382 && let Ok(serde_json::Value::Object(mut obj)) =
383 serde_json::from_str::<serde_json::Value>(&message.content)
384 && let Some(serde_json::Value::String(inner)) = obj.get("content").cloned()
385 {
386 let stripped = stripped_image_marker_text(&inner);
387 if stripped == inner {
388 return message.clone();
389 }
390
391 obj.insert("content".to_string(), serde_json::Value::String(stripped));
392 return ChatMessage {
393 role: message.role.clone(),
394 content: serde_json::Value::Object(obj).to_string(),
395 };
396 }
397
398 ChatMessage {
399 role: message.role.clone(),
400 content: stripped_image_marker_text(&message.content),
401 }
402}
403
404fn replay_message_without_stale_tool_images(
405 index: usize,
406 message: &ChatMessage,
407 latest_tool_result_indices: &HashSet<usize>,
408) -> ChatMessage {
409 if is_tool_result_carrier(message) && !latest_tool_result_indices.contains(&index) {
410 strip_tool_result_image_markers(message)
411 } else {
412 message.clone()
413 }
414}
415
416async fn normalize_native_tool_result_json(
428 content: &str,
429 config: &MultimodalConfig,
430 max_bytes: usize,
431 remote_client: &Client,
432 ctx: &ImageNormalizeCtx<'_>,
433 cache: Option<&mut LocalImageCache>,
434) -> Option<(String, bool)> {
435 let Ok(serde_json::Value::Object(mut obj)) = serde_json::from_str::<serde_json::Value>(content)
436 else {
437 return None;
438 };
439
440 let Some(serde_json::Value::String(inner)) = obj.get("content").cloned() else {
441 return None;
442 };
443
444 let (cleaned_text, refs) = parse_image_markers(&inner);
445 if refs.is_empty() {
446 return None;
447 }
448
449 let normalized =
450 normalize_image_references(&refs, config, max_bytes, remote_client, ctx, cache).await;
451 let new_inner = compose_multimodal_content(
452 &cleaned_text,
453 &normalized.data_uris,
454 normalized.skipped_count,
455 refs.len(),
456 );
457 obj.insert("content".to_string(), serde_json::Value::String(new_inner));
458
459 Some((
460 serde_json::Value::Object(obj).to_string(),
461 !normalized.data_uris.is_empty(),
462 ))
463}
464
465pub async fn prepare_messages_for_provider(
466 messages: &[ChatMessage],
467 config: &MultimodalConfig,
468) -> anyhow::Result<PreparedMessages> {
469 prepare_messages_inner(messages, config, None).await
470}
471
472pub async fn prepare_messages_for_provider_cached(
476 messages: &[ChatMessage],
477 config: &MultimodalConfig,
478 cache: &mut LocalImageCache,
479) -> anyhow::Result<PreparedMessages> {
480 prepare_messages_inner(messages, config, Some(cache)).await
481}
482
483async fn prepare_messages_inner(
484 messages: &[ChatMessage],
485 config: &MultimodalConfig,
486 mut cache: Option<&mut LocalImageCache>,
487) -> anyhow::Result<PreparedMessages> {
488 let (max_images, max_image_size_mb) = config.effective_limits();
489 let max_bytes = max_image_size_mb.saturating_mul(1024 * 1024);
490
491 let latest_tool_indices = latest_tool_result_indices(messages);
492 let total_images = count_image_markers_with_latest_tool_results(messages, &latest_tool_indices);
493
494 if total_images == 0 {
495 return Ok(PreparedMessages {
496 messages: messages
497 .iter()
498 .enumerate()
499 .map(|(index, message)| {
500 replay_message_without_stale_tool_images(index, message, &latest_tool_indices)
501 })
502 .collect(),
503 contains_images: false,
504 });
505 }
506
507 let trimmed = if total_images > max_images {
512 ::zeroclaw_log::record!(
513 WARN,
514 ::zeroclaw_log::Event::new(module_path!(), ::zeroclaw_log::Action::Note)
515 .with_outcome(::zeroclaw_log::EventOutcome::Unknown)
516 .with_attrs(::serde_json::json!({
517 "total_images": total_images,
518 "max_images": max_images,
519 "trimmed_to": max_images,
520 })),
521 "multimodal: trimming oldest images — conversation exceeds image limit"
522 );
523 trim_old_images(messages, max_images)
524 } else {
525 messages.to_vec()
526 };
527
528 let remote_client = build_runtime_proxy_client_with_timeouts("model_provider.ollama", 30, 10);
529 let latest_tool_indices = latest_tool_result_indices(&trimmed);
530
531 let mut normalized_messages = Vec::with_capacity(messages.len());
532 let mut has_successful_images = false;
533 for (index, message) in messages.iter().enumerate() {
534 if !should_normalize_message_images(index, message, &latest_tool_indices) {
535 normalized_messages.push(replay_message_without_stale_tool_images(
536 index,
537 message,
538 &latest_tool_indices,
539 ));
540 continue;
541 }
542
543 if message.role == "tool"
554 && let Some((prepared, contains_images)) = normalize_native_tool_result_json(
555 &message.content,
556 config,
557 max_bytes,
558 &remote_client,
559 &ImageNormalizeCtx {
560 message_index: index,
561 role: &message.role,
562 },
563 cache.as_deref_mut(),
564 )
565 .await
566 {
567 normalized_messages.push(ChatMessage {
568 role: message.role.clone(),
569 content: prepared,
570 });
571 has_successful_images |= contains_images;
572 continue;
573 }
574
575 let (cleaned_text, refs) = parse_image_markers(&message.content);
576 if refs.is_empty() {
577 normalized_messages.push(message.clone());
578 continue;
579 }
580
581 let normalized = normalize_image_references(
582 &refs,
583 config,
584 max_bytes,
585 &remote_client,
586 &ImageNormalizeCtx {
587 message_index: index,
588 role: &message.role,
589 },
590 cache.as_deref_mut(),
591 )
592 .await;
593 let content = compose_multimodal_content(
594 &cleaned_text,
595 &normalized.data_uris,
596 normalized.skipped_count,
597 refs.len(),
598 );
599 has_successful_images |= !normalized.data_uris.is_empty();
600 normalized_messages.push(ChatMessage {
601 role: message.role.clone(),
602 content,
603 });
604 }
605
606 let age_trimmed = if config.max_image_turns > 0 {
610 let before = count_image_markers(&normalized_messages);
611 let trimmed = trim_images_by_age(&normalized_messages, config.max_image_turns);
612 let after = count_image_markers(&trimmed);
613 if after < before {
614 ::zeroclaw_log::record!(
615 INFO,
616 ::zeroclaw_log::Event::new(module_path!(), ::zeroclaw_log::Action::Note)
617 .with_attrs(::serde_json::json!({
618 "max_image_turns": config.max_image_turns,
619 "images_before": before,
620 "images_after": after,
621 "images_dropped": before - after,
622 })),
623 "multimodal: age-trimmed old images from conversation history"
624 );
625 }
626 trimmed
627 } else {
628 normalized_messages
629 };
630
631 let capped_messages = if has_successful_images && count_image_markers(&age_trimmed) > max_images
634 {
635 ::zeroclaw_log::record!(
636 WARN,
637 ::zeroclaw_log::Event::new(module_path!(), ::zeroclaw_log::Action::Note)
638 .with_outcome(::zeroclaw_log::EventOutcome::Unknown)
639 .with_attrs(::serde_json::json!({
640 "images_after_normalization": count_image_markers(&age_trimmed),
641 "max_images": max_images,
642 })),
643 "multimodal: post-normalization image cap exceeded — trimming oldest images"
644 );
645 trim_old_images(&age_trimmed, max_images)
646 } else {
647 age_trimmed
648 };
649
650 Ok(PreparedMessages {
651 contains_images: count_image_markers(&capped_messages) > 0,
652 messages: capped_messages,
653 })
654}
655fn trim_images_by_age(messages: &[ChatMessage], max_turns: usize) -> Vec<ChatMessage> {
661 let mut user_turn_count = 0usize;
663 let mut cutoff = 0usize; for (i, m) in messages.iter().enumerate().rev() {
665 if m.role == "user" {
666 user_turn_count += 1;
667 if user_turn_count > max_turns {
668 cutoff = i + 1;
670 break;
671 }
672 }
673 }
674
675 if cutoff == 0 {
676 return messages.to_vec();
677 }
678
679 messages
680 .iter()
681 .enumerate()
682 .map(|(i, m)| {
683 if i < cutoff && m.role == "user" {
684 let (cleaned, refs) = parse_image_markers(&m.content);
685 if refs.is_empty() {
686 return m.clone();
687 }
688 let text = if cleaned.trim().is_empty() {
689 "[image removed from history]".to_string()
690 } else {
691 cleaned
692 };
693 ChatMessage {
694 role: m.role.clone(),
695 content: text,
696 }
697 } else {
698 m.clone()
699 }
700 })
701 .collect()
702}
703
704fn trim_old_images(messages: &[ChatMessage], max_images: usize) -> Vec<ChatMessage> {
707 let latest_tool_indices = latest_tool_result_indices(messages);
708 let image_positions: Vec<(usize, usize)> = messages
710 .iter()
711 .enumerate()
712 .filter(|(index, message)| {
713 should_normalize_message_images(*index, message, &latest_tool_indices)
714 })
715 .filter_map(|(i, m)| {
716 let count = parse_image_markers(&m.content).1.len();
717 if count > 0 { Some((i, count)) } else { None }
718 })
719 .collect();
720
721 let total: usize = image_positions.iter().map(|(_, c)| c).sum();
723 let mut to_drop = total.saturating_sub(max_images);
724
725 let mut strip_indices = std::collections::HashSet::new();
727 for &(idx, count) in &image_positions {
728 if to_drop == 0 {
729 break;
730 }
731 strip_indices.insert(idx);
732 to_drop = to_drop.saturating_sub(count);
733 }
734
735 messages
736 .iter()
737 .enumerate()
738 .map(|(i, m)| {
739 if strip_indices.contains(&i) {
740 let (cleaned, _) = parse_image_markers(&m.content);
741 let text = if cleaned.trim().is_empty() {
742 "[image removed from history]".to_string()
743 } else {
744 cleaned
745 };
746 ChatMessage {
747 role: m.role.clone(),
748 content: text,
749 }
750 } else {
751 replay_message_without_stale_tool_images(i, m, &latest_tool_indices)
752 }
753 })
754 .collect()
755}
756
757fn compose_multimodal_message(text: &str, data_uris: &[String]) -> String {
758 let mut content = String::new();
759 let trimmed = text.trim();
760
761 if !trimmed.is_empty() {
762 content.push_str(trimmed);
763 content.push_str("\n\n");
764 }
765
766 for (index, data_uri) in data_uris.iter().enumerate() {
767 if index > 0 {
768 content.push('\n');
769 }
770 content.push_str(IMAGE_MARKER_PREFIX);
771 content.push_str(data_uri);
772 content.push(']');
773 }
774
775 content
776}
777
/// Outcome of resolving a batch of image references.
struct NormalizedImageReferences {
    /// Data URIs of the references that loaded successfully, in input order.
    data_uris: Vec<String>,
    /// Number of references that failed to load and were skipped.
    skipped_count: usize,
}
782
/// Where an image reference came from; attached to log records when a
/// reference is skipped.
struct ImageNormalizeCtx<'a> {
    /// Index of the message that contains the reference.
    message_index: usize,
    /// Role of that message (compared against `"tool"` to pick the log level).
    role: &'a str,
}
790
791async fn normalize_image_references(
792 refs: &[String],
793 config: &MultimodalConfig,
794 max_bytes: usize,
795 remote_client: &Client,
796 ctx: &ImageNormalizeCtx<'_>,
797 mut cache: Option<&mut LocalImageCache>,
798) -> NormalizedImageReferences {
799 let mut data_uris = Vec::with_capacity(refs.len());
800 let mut skipped_count = 0usize;
801
802 for reference in refs {
803 match normalize_image_reference(
804 reference,
805 config,
806 max_bytes,
807 remote_client,
808 cache.as_deref_mut(),
809 )
810 .await
811 {
812 Ok(data_uri) => data_uris.push(data_uri),
813 Err(error) => {
814 skipped_count += 1;
815 let error_reason = multimodal_error_reason(&error);
816 let marker_preview: String = reference.chars().take(120).collect();
819 let error_kind = multimodal_error_kind(&error);
820 let attrs = ::serde_json::json!({
821 "message_index": ctx.message_index,
822 "message_role": ctx.role,
823 "source_kind": image_reference_kind(reference),
824 "error_kind": error_kind,
825 "reason": error_reason.as_deref().unwrap_or(""),
826 "marker_preview": marker_preview,
827 });
828 let is_tool_role = ctx.role == "tool";
840 let is_recoverable_load_failure = matches!(
841 error_kind,
842 "image_source_not_found"
843 | "local_read_failed"
844 | "remote_fetch_failed"
845 | "invalid_marker"
846 );
847 if is_tool_role && is_recoverable_load_failure {
848 ::zeroclaw_log::record!(
849 DEBUG,
850 ::zeroclaw_log::Event::new(module_path!(), ::zeroclaw_log::Action::Note)
851 .with_attrs(attrs),
852 "skipping multimodal marker in tool result (likely not a real attachment)"
853 );
854 } else {
855 ::zeroclaw_log::record!(
856 WARN,
857 ::zeroclaw_log::Event::new(module_path!(), ::zeroclaw_log::Action::Note)
858 .with_outcome(::zeroclaw_log::EventOutcome::Unknown)
859 .with_attrs(attrs),
860 "skipping multimodal image that could not be loaded"
861 );
862 }
863 }
864 }
865 }
866
867 NormalizedImageReferences {
868 data_uris,
869 skipped_count,
870 }
871}
872
873fn compose_multimodal_content(
874 text: &str,
875 data_uris: &[String],
876 skipped_count: usize,
877 total_refs: usize,
878) -> String {
879 if skipped_count == 0 {
880 return compose_multimodal_message(text, data_uris);
881 }
882
883 let text_with_note = append_skipped_image_note(text, skipped_count, total_refs);
884 if data_uris.is_empty() {
885 text_with_note.trim().to_string()
886 } else {
887 compose_multimodal_message(&text_with_note, data_uris)
888 }
889}
890
/// Appends a `Note: ...` sentence about images that could not be loaded.
///
/// Returns `text` unchanged when `skipped_count` is zero. Otherwise the note
/// follows the trimmed text after a blank line, or stands alone when the text
/// is empty. The wording mentions `total_refs` only when some images loaded.
fn append_skipped_image_note(text: &str, skipped_count: usize, total_refs: usize) -> String {
    if skipped_count == 0 {
        return text.to_string();
    }

    let all_failed = skipped_count == total_refs;
    let note = if all_failed {
        format!("{skipped_count} attached image(s) could not be loaded")
    } else {
        format!("{skipped_count} of {total_refs} attached image(s) could not be loaded")
    };

    match text.trim() {
        "" => format!("Note: {note}."),
        body => format!("{body}\n\nNote: {note}."),
    }
}
910
/// Classifies an image reference for logging: `"data"` for `data:` URIs,
/// `"remote"` for HTTP(S) URLs, and `"local"` for everything else.
fn image_reference_kind(reference: &str) -> &'static str {
    if reference.starts_with("data:") {
        return "data";
    }
    let is_remote = ["http://", "https://"]
        .iter()
        .any(|scheme| reference.starts_with(*scheme));
    if is_remote { "remote" } else { "local" }
}
920
921fn multimodal_error_kind(error: &anyhow::Error) -> &'static str {
922 match error.downcast_ref::<MultimodalError>() {
923 Some(MultimodalError::TooManyImages { .. }) => "too_many_images",
924 Some(MultimodalError::ImageTooLarge { .. }) => "image_too_large",
925 Some(MultimodalError::UnsupportedMime { .. }) => "unsupported_mime",
926 Some(MultimodalError::RemoteFetchDisabled { .. }) => "remote_fetch_disabled",
927 Some(MultimodalError::ImageSourceNotFound { .. }) => "image_source_not_found",
928 Some(MultimodalError::InvalidMarker { .. }) => "invalid_marker",
929 Some(MultimodalError::RemoteFetchFailed { .. }) => "remote_fetch_failed",
930 Some(MultimodalError::LocalReadFailed { .. }) => "local_read_failed",
931 None => "unknown",
932 }
933}
934
935fn multimodal_error_reason(error: &anyhow::Error) -> Option<String> {
936 match error.downcast_ref::<MultimodalError>() {
937 Some(MultimodalError::InvalidMarker { input, reason })
938 | Some(MultimodalError::RemoteFetchFailed { input, reason })
939 | Some(MultimodalError::LocalReadFailed { input, reason }) => {
940 Some(reason.replace(input, "<source>"))
941 }
942 _ => None,
943 }
944}
945
946async fn normalize_image_reference(
947 source: &str,
948 config: &MultimodalConfig,
949 max_bytes: usize,
950 remote_client: &Client,
951 cache: Option<&mut LocalImageCache>,
952) -> anyhow::Result<String> {
953 if source.starts_with("data:") {
954 return normalize_data_uri(source, max_bytes);
955 }
956
957 if source.starts_with("http://") || source.starts_with("https://") {
958 if !config.allow_remote_fetch {
959 return Err(MultimodalError::RemoteFetchDisabled {
960 input: source.to_string(),
961 }
962 .into());
963 }
964
965 return normalize_remote_image(source, max_bytes, remote_client).await;
966 }
967
968 match cache {
969 Some(c) => normalize_local_image_cached(source, max_bytes, c).await,
970 None => normalize_local_image(source, max_bytes).await,
971 }
972}
973
974fn normalize_data_uri(source: &str, max_bytes: usize) -> anyhow::Result<String> {
975 let Some(comma_idx) = source.find(',') else {
976 return Err(MultimodalError::InvalidMarker {
977 input: source.to_string(),
978 reason: "expected data URI payload".to_string(),
979 }
980 .into());
981 };
982
983 let header = &source[..comma_idx];
984 let payload = source[comma_idx + 1..].trim();
985
986 if !header.contains(";base64") {
987 return Err(MultimodalError::InvalidMarker {
988 input: source.to_string(),
989 reason: "only base64 data URIs are supported".to_string(),
990 }
991 .into());
992 }
993
994 let mime = header
995 .trim_start_matches("data:")
996 .split(';')
997 .next()
998 .unwrap_or_default()
999 .trim()
1000 .to_ascii_lowercase();
1001
1002 validate_mime(source, &mime)?;
1003
1004 let decoded = STANDARD
1005 .decode(payload)
1006 .map_err(|error| MultimodalError::InvalidMarker {
1007 input: source.to_string(),
1008 reason: format!("invalid base64 payload: {error}"),
1009 })?;
1010
1011 validate_size(source, decoded.len(), max_bytes)?;
1012
1013 Ok(format!("data:{mime};base64,{}", STANDARD.encode(decoded)))
1014}
1015
1016async fn normalize_remote_image(
1017 source: &str,
1018 max_bytes: usize,
1019 remote_client: &Client,
1020) -> anyhow::Result<String> {
1021 let response = remote_client.get(source).send().await.map_err(|error| {
1022 MultimodalError::RemoteFetchFailed {
1023 input: source.to_string(),
1024 reason: error.to_string(),
1025 }
1026 })?;
1027
1028 let status = response.status();
1029 if !status.is_success() {
1030 return Err(MultimodalError::RemoteFetchFailed {
1031 input: source.to_string(),
1032 reason: format!("HTTP {status}"),
1033 }
1034 .into());
1035 }
1036
1037 if let Some(content_length) = response.content_length() {
1038 let content_length = usize::try_from(content_length).unwrap_or(usize::MAX);
1039 validate_size(source, content_length, max_bytes)?;
1040 }
1041
1042 let content_type = response
1043 .headers()
1044 .get(reqwest::header::CONTENT_TYPE)
1045 .and_then(|value| value.to_str().ok())
1046 .map(ToString::to_string);
1047
1048 let bytes = response
1049 .bytes()
1050 .await
1051 .map_err(|error| MultimodalError::RemoteFetchFailed {
1052 input: source.to_string(),
1053 reason: error.to_string(),
1054 })?;
1055
1056 validate_size(source, bytes.len(), max_bytes)?;
1057
1058 let mime = detect_mime(None, bytes.as_ref(), content_type.as_deref()).ok_or_else(|| {
1059 MultimodalError::UnsupportedMime {
1060 input: source.to_string(),
1061 mime: "unknown".to_string(),
1062 }
1063 })?;
1064
1065 validate_mime(source, &mime)?;
1066
1067 Ok(format!("data:{mime};base64,{}", STANDARD.encode(bytes)))
1068}
1069
1070async fn normalize_local_image(source: &str, max_bytes: usize) -> anyhow::Result<String> {
1071 let path = Path::new(source);
1072 if !path.exists() || !path.is_file() {
1073 return Err(MultimodalError::ImageSourceNotFound {
1074 input: source.to_string(),
1075 }
1076 .into());
1077 }
1078
1079 let metadata =
1080 tokio::fs::metadata(path)
1081 .await
1082 .map_err(|error| MultimodalError::LocalReadFailed {
1083 input: source.to_string(),
1084 reason: error.to_string(),
1085 })?;
1086
1087 validate_size(
1088 source,
1089 usize::try_from(metadata.len()).unwrap_or(usize::MAX),
1090 max_bytes,
1091 )?;
1092
1093 let bytes = tokio::fs::read(path)
1094 .await
1095 .map_err(|error| MultimodalError::LocalReadFailed {
1096 input: source.to_string(),
1097 reason: error.to_string(),
1098 })?;
1099
1100 validate_size(source, bytes.len(), max_bytes)?;
1101
1102 let mime =
1103 detect_mime(Some(path), &bytes, None).ok_or_else(|| MultimodalError::UnsupportedMime {
1104 input: source.to_string(),
1105 mime: "unknown".to_string(),
1106 })?;
1107
1108 validate_mime(source, &mime)?;
1109
1110 Ok(format!("data:{mime};base64,{}", STANDARD.encode(bytes)))
1111}
1112
1113async fn normalize_local_image_cached(
1117 source: &str,
1118 max_bytes: usize,
1119 cache: &mut LocalImageCache,
1120) -> anyhow::Result<String> {
1121 let path = Path::new(source);
1122 if !path.exists() || !path.is_file() {
1123 return Err(MultimodalError::ImageSourceNotFound {
1124 input: source.to_string(),
1125 }
1126 .into());
1127 }
1128
1129 let metadata =
1130 tokio::fs::metadata(path)
1131 .await
1132 .map_err(|error| MultimodalError::LocalReadFailed {
1133 input: source.to_string(),
1134 reason: error.to_string(),
1135 })?;
1136
1137 let file_len = metadata.len();
1138 let is_immutable = source.contains("/uploads/");
1139 let mtime: i64 = if is_immutable {
1140 0
1141 } else {
1142 metadata
1143 .modified()
1144 .ok()
1145 .and_then(|t| {
1146 t.duration_since(std::time::UNIX_EPOCH)
1147 .ok()
1148 .map(|d| d.as_secs() as i64)
1149 })
1150 .unwrap_or(0)
1151 };
1152 let cache_len = if is_immutable { 0 } else { file_len };
1153
1154 if let Some(cached) = cache.get(source, cache_len, mtime) {
1155 return Ok(cached.to_string());
1156 }
1157
1158 validate_size(
1159 source,
1160 usize::try_from(file_len).unwrap_or(usize::MAX),
1161 max_bytes,
1162 )?;
1163
1164 let bytes = tokio::fs::read(path)
1165 .await
1166 .map_err(|error| MultimodalError::LocalReadFailed {
1167 input: source.to_string(),
1168 reason: error.to_string(),
1169 })?;
1170
1171 validate_size(source, bytes.len(), max_bytes)?;
1172
1173 let mime =
1174 detect_mime(Some(path), &bytes, None).ok_or_else(|| MultimodalError::UnsupportedMime {
1175 input: source.to_string(),
1176 mime: "unknown".to_string(),
1177 })?;
1178
1179 validate_mime(source, &mime)?;
1180
1181 let data_uri = format!("data:{mime};base64,{}", STANDARD.encode(&bytes));
1182 cache.insert(source.to_string(), cache_len, mtime, data_uri.clone());
1183 Ok(data_uri)
1184}
1185
1186fn validate_size(source: &str, size_bytes: usize, max_bytes: usize) -> anyhow::Result<()> {
1187 if size_bytes > max_bytes {
1188 return Err(MultimodalError::ImageTooLarge {
1189 input: source.to_string(),
1190 size_bytes,
1191 max_bytes,
1192 }
1193 .into());
1194 }
1195
1196 Ok(())
1197}
1198
1199fn validate_mime(source: &str, mime: &str) -> anyhow::Result<()> {
1200 if ALLOWED_IMAGE_MIME_TYPES.contains(&mime) {
1201 return Ok(());
1202 }
1203
1204 Err(MultimodalError::UnsupportedMime {
1205 input: source.to_string(),
1206 mime: mime.to_string(),
1207 }
1208 .into())
1209}
1210
1211fn detect_mime(
1212 path: Option<&Path>,
1213 bytes: &[u8],
1214 header_content_type: Option<&str>,
1215) -> Option<String> {
1216 if let Some(header_mime) = header_content_type.and_then(normalize_content_type) {
1217 return Some(header_mime);
1218 }
1219
1220 if let Some(path) = path
1221 && let Some(ext) = path.extension().and_then(|value| value.to_str())
1222 && let Some(mime) = mime_from_extension(ext)
1223 {
1224 return Some(mime.to_string());
1225 }
1226
1227 mime_from_magic(bytes).map(ToString::to_string)
1228}
1229
/// Reduces a `Content-Type` header value to its bare, lower-cased MIME type
/// (the part before the first `;`). Returns `None` when that part is empty.
fn normalize_content_type(content_type: &str) -> Option<String> {
    let essence = content_type.split(';').next()?.trim();
    (!essence.is_empty()).then(|| essence.to_ascii_lowercase())
}
1234
/// Maps a file extension (without the dot, ASCII case-insensitive) to its
/// image MIME type, or `None` for extensions that are not recognized.
fn mime_from_extension(ext: &str) -> Option<&'static str> {
    const KNOWN_EXTENSIONS: [(&str, &str); 6] = [
        ("png", "image/png"),
        ("jpg", "image/jpeg"),
        ("jpeg", "image/jpeg"),
        ("webp", "image/webp"),
        ("gif", "image/gif"),
        ("bmp", "image/bmp"),
    ];

    KNOWN_EXTENSIONS
        .iter()
        .copied()
        .find(|&(known, _)| ext.eq_ignore_ascii_case(known))
        .map(|(_, mime)| mime)
}
1245
/// Identifies an image format from the leading bytes of its content.
///
/// Recognizes the PNG signature, the JPEG `FF D8 FF` prefix, `GIF87a` /
/// `GIF89a`, a `RIFF` container whose bytes 8..12 are `WEBP`, and the `BM`
/// prefix for BMP. Returns `None` when the input is too short for, or does not
/// match, any of these.
fn mime_from_magic(bytes: &[u8]) -> Option<&'static str> {
    match bytes {
        [0x89, b'P', b'N', b'G', b'\r', b'\n', 0x1a, b'\n', ..] => Some("image/png"),
        [0xff, 0xd8, 0xff, ..] => Some("image/jpeg"),
        [b'G', b'I', b'F', b'8', b'7' | b'9', b'a', ..] => Some("image/gif"),
        // Bytes 4..8 hold the RIFF chunk size and are not inspected.
        [b'R', b'I', b'F', b'F', _, _, _, _, b'W', b'E', b'B', b'P', ..] => Some("image/webp"),
        [b'B', b'M', ..] => Some("image/bmp"),
        _ => None,
    }
}
1269
1270#[cfg(test)]
1271mod tests {
1272 use super::*;
1273
1274 #[test]
1275 fn strip_media_markers_replaces_image_local_path() {
1276 let input = "Look at [IMAGE:/zeroclaw-data/workspace/telegram_files/photo_1.jpg]";
1277 assert_eq!(strip_media_markers(input), "Look at [media attachment]");
1278 }
1279
1280 #[test]
1281 fn strip_media_markers_replaces_image_data_uri() {
1282 let input = "Inline [IMAGE:data:image/png;base64,abcd]";
1283 assert_eq!(strip_media_markers(input), "Inline [media attachment]");
1284 }
1285
1286 #[test]
1287 fn strip_media_markers_replaces_all_supported_kinds() {
1288 let input = "[IMAGE:/a.jpg] [PHOTO:/b.jpg] [DOCUMENT:/c.pdf] [FILE:/d.zip] [VIDEO:/e.mp4] [VOICE:/f.ogg] [AUDIO:/g.wav]";
1292 let expected = "[media attachment] [media attachment] [media attachment] [media attachment] [media attachment] [media attachment] [media attachment]";
1293 assert_eq!(strip_media_markers(input), expected);
1294 }
1295
1296 #[test]
1297 fn strip_media_markers_is_case_insensitive() {
1298 let input = "[image:/a.jpg] [Photo:/b.jpg] [video:/c.mp4]";
1303 let expected = "[media attachment] [media attachment] [media attachment]";
1304 assert_eq!(strip_media_markers(input), expected);
1305 }
1306
1307 #[test]
1308 fn strip_media_markers_leaves_plain_text_untouched() {
1309 let input = "No markers here, just text with [brackets] and (parens).";
1310 assert_eq!(strip_media_markers(input), input);
1311 }
1312
1313 #[test]
1314 fn strip_media_markers_preserves_unrelated_brackets() {
1315 let input = "Use [TODO:foo] and [NOTE:bar] but replace [IMAGE:/x.jpg]";
1317 assert_eq!(
1318 strip_media_markers(input),
1319 "Use [TODO:foo] and [NOTE:bar] but replace [media attachment]"
1320 );
1321 }
1322
1323 #[test]
1324 fn parse_image_markers_extracts_multiple_markers() {
1325 let input = "Check this [IMAGE:/tmp/a.png] and this [IMAGE:https://example.com/b.jpg]";
1326 let (cleaned, refs) = parse_image_markers(input);
1327
1328 assert_eq!(cleaned, "Check this and this");
1329 assert_eq!(refs.len(), 2);
1330 assert_eq!(refs[0], "/tmp/a.png");
1331 assert_eq!(refs[1], "https://example.com/b.jpg");
1332 }
1333
1334 #[test]
1335 fn is_windows_unc_path_accepts_shares_and_rejects_others() {
1336 assert!(is_windows_unc_path(r"\\server\share\pic.png"));
1337 assert!(is_windows_unc_path(r"\\server\share\sub\pic.png"));
1338 assert!(!is_windows_unc_path(r"\\?\C:\Users\me\a.png"));
1340 assert!(!is_windows_unc_path(r"\\?\UNC\server\share\a.png"));
1341 assert!(!is_windows_unc_path(r"\\.\PhysicalDrive0"));
1342 assert!(!is_windows_unc_path(r"\\server"));
1344 assert!(!is_windows_unc_path(r"\\"));
1345 assert!(!is_windows_unc_path("/home/me/a.png"));
1347 assert!(!is_windows_unc_path(r"C:\Users\me\a.png"));
1348 }
1349
1350 #[test]
1351 fn parse_image_markers_extracts_unc_path() {
1352 let input = r"File: [IMAGE:\\server\share\pic.png]";
1357 let (_, refs) = parse_image_markers(input);
1358 assert_eq!(refs.len(), 1, "UNC marker should be extracted as a ref");
1359 assert_eq!(refs[0], r"\\server\share\pic.png");
1360 }
1361
1362 #[test]
1363 fn validate_mime_rejects_bmp_but_accepts_provider_supported_types() {
1364 for mime in ["image/png", "image/jpeg", "image/webp", "image/gif"] {
1365 assert!(
1366 validate_mime("src", mime).is_ok(),
1367 "{mime} should be allowed"
1368 );
1369 }
1370 let err = validate_mime("src", "image/bmp").unwrap_err();
1373 assert_eq!(multimodal_error_kind(&err), "unsupported_mime");
1374 }
1375
1376 #[test]
1377 fn parse_image_markers_collapses_line_wrapped_path() {
1378 let input = "from the logs whether the agent emits\n [IMAGE:/home/zeroclaw_user/.zeroclaw/workspace/signal_i\n nbound/attachment.jpg] (which the\n channel resolves)";
1381 let (_, refs) = parse_image_markers(input);
1382 assert_eq!(refs.len(), 1);
1383 assert_eq!(
1384 refs[0],
1385 "/home/zeroclaw_user/.zeroclaw/workspace/signal_inbound/attachment.jpg"
1386 );
1387 }
1388
1389 #[test]
1390 fn parse_image_markers_leaves_placeholder_markers_as_literal_text() {
1391 let input = "example: `[IMAGE:...]` or `[IMAGE:<path>]` or `[IMAGE:example.png]`";
1396 let (cleaned, refs) = parse_image_markers(input);
1397 assert!(
1398 refs.is_empty(),
1399 "no placeholder should be treated as a loadable ref, got: {refs:?}"
1400 );
1401 assert!(cleaned.contains("[IMAGE:...]"));
1402 assert!(cleaned.contains("[IMAGE:<path>]"));
1403 assert!(cleaned.contains("[IMAGE:example.png]"));
1404 }
1405
1406 #[test]
1407 fn parse_image_markers_preserves_spaces_in_path() {
1408 let input = "look at [IMAGE:/tmp/my photos/beetle.png] please";
1411 let (_, refs) = parse_image_markers(input);
1412 assert_eq!(refs.len(), 1);
1413 assert_eq!(refs[0], "/tmp/my photos/beetle.png");
1414 }
1415
1416 #[test]
1417 fn parse_image_markers_keeps_invalid_empty_marker() {
1418 let input = "hello [IMAGE:] world";
1419 let (cleaned, refs) = parse_image_markers(input);
1420
1421 assert_eq!(cleaned, "hello [IMAGE:] world");
1422 assert!(refs.is_empty());
1423 }
1424
1425 #[tokio::test]
1426 async fn prepare_messages_normalizes_local_image_to_data_uri() {
1427 let temp = tempfile::tempdir().unwrap();
1428 let image_path = temp.path().join("sample.png");
1429
1430 std::fs::write(
1432 &image_path,
1433 [0x89, b'P', b'N', b'G', b'\r', b'\n', 0x1a, b'\n'],
1434 )
1435 .unwrap();
1436
1437 let messages = vec![ChatMessage::user(format!(
1438 "Please inspect this screenshot [IMAGE:{}]",
1439 image_path.display()
1440 ))];
1441
1442 let prepared = prepare_messages_for_provider(&messages, &MultimodalConfig::default())
1443 .await
1444 .unwrap();
1445
1446 assert!(prepared.contains_images);
1447 assert_eq!(prepared.messages.len(), 1);
1448
1449 let (cleaned, refs) = parse_image_markers(&prepared.messages[0].content);
1450 assert_eq!(cleaned, "Please inspect this screenshot");
1451 assert_eq!(refs.len(), 1);
1452 assert!(refs[0].starts_with("data:image/png;base64,"));
1453 }
1454
1455 #[tokio::test]
1456 async fn prepare_messages_normalizes_tool_message_local_image_to_data_uri() {
1462 let temp = tempfile::tempdir().unwrap();
1463 let image_path = temp.path().join("tool-sample.png");
1464
1465 std::fs::write(
1466 &image_path,
1467 [0x89, b'P', b'N', b'G', b'\r', b'\n', 0x1a, b'\n'],
1468 )
1469 .unwrap();
1470
1471 let messages = vec![ChatMessage::tool(format!(
1472 "<tool_result name=\"image_gen\">\nGenerated image [IMAGE:{}]\n</tool_result>",
1473 image_path.display()
1474 ))];
1475
1476 let prepared = prepare_messages_for_provider(&messages, &MultimodalConfig::default())
1477 .await
1478 .unwrap();
1479
1480 assert!(prepared.contains_images);
1481 assert_eq!(prepared.messages.len(), 1);
1482 assert_eq!(prepared.messages[0].role, "tool");
1483
1484 let (cleaned, refs) = parse_image_markers(&prepared.messages[0].content);
1485 assert!(cleaned.contains("<tool_result name=\"image_gen\">"));
1486 assert!(cleaned.contains("Generated image"));
1487 assert_eq!(refs.len(), 1);
1488 assert!(refs[0].starts_with("data:image/png;base64,"));
1489 }
1490
1491 #[tokio::test]
1503 async fn prepare_messages_preserves_native_tool_result_json_shape() {
1504 let temp = tempfile::tempdir().unwrap();
1505 let image_path = temp.path().join("native-tool-result.png");
1506 std::fs::write(
1507 &image_path,
1508 [0x89, b'P', b'N', b'G', b'\r', b'\n', 0x1a, b'\n'],
1509 )
1510 .unwrap();
1511
1512 let native_tool_content = serde_json::json!({
1513 "tool_call_id": "tc1",
1514 "content": format!("see attached [IMAGE:{}]", image_path.display().to_string()),
1515 })
1516 .to_string();
1517
1518 let messages = vec![ChatMessage::tool(native_tool_content)];
1519
1520 let prepared = prepare_messages_for_provider(&messages, &MultimodalConfig::default())
1521 .await
1522 .expect("preparation should succeed for native tool-result JSON");
1523
1524 assert!(prepared.contains_images);
1525 assert_eq!(prepared.messages.len(), 1);
1526 assert_eq!(prepared.messages[0].role, "tool");
1527
1528 let value: serde_json::Value = serde_json::from_str(&prepared.messages[0].content)
1529 .expect("prepared tool message must remain valid JSON");
1530
1531 assert_eq!(
1532 value.get("tool_call_id").and_then(|v| v.as_str()),
1533 Some("tc1"),
1534 "tool_call_id must survive multimodal preprocessing unchanged"
1535 );
1536
1537 let inner = value
1538 .get("content")
1539 .and_then(|v| v.as_str())
1540 .expect("content must remain a JSON string");
1541 assert!(
1542 inner.contains("see attached"),
1543 "surrounding text in tool content should survive normalization"
1544 );
1545 assert!(
1546 inner.contains("data:image/png;base64,"),
1547 "local image path inside tool content should be rewritten to a data URI"
1548 );
1549 assert!(
1550 !inner.contains("native-tool-result.png"),
1551 "raw local path must not leak after normalization"
1552 );
1553 }
1554
1555 #[tokio::test]
1556 async fn prepare_messages_preserves_native_tool_json_when_image_is_skipped() {
1557 let native_tool_content = serde_json::json!({
1558 "tool_call_id": "tc1",
1559 "content": "generated screenshot [IMAGE:https://example.com/missing.png]",
1560 })
1561 .to_string();
1562
1563 let prepared = prepare_messages_for_provider(
1564 &[ChatMessage::tool(native_tool_content)],
1565 &MultimodalConfig::default(),
1566 )
1567 .await
1568 .expect("skipped native tool image should not fail message preparation");
1569
1570 assert!(!prepared.contains_images);
1571 assert_eq!(prepared.messages.len(), 1);
1572
1573 let value: serde_json::Value = serde_json::from_str(&prepared.messages[0].content)
1574 .expect("native tool result must remain valid JSON");
1575 assert_eq!(
1576 value.get("tool_call_id").and_then(|v| v.as_str()),
1577 Some("tc1")
1578 );
1579
1580 let inner = value
1581 .get("content")
1582 .and_then(|v| v.as_str())
1583 .expect("content should remain a JSON string");
1584 assert!(inner.contains("generated screenshot"));
1585 assert!(inner.contains("1 attached image(s) could not be loaded"));
1586 assert!(!inner.contains("[IMAGE:"));
1587 assert!(!inner.contains("https://example.com/missing.png"));
1588 }
1589
1590 #[tokio::test]
1591 async fn prepare_messages_preserves_native_tool_json_with_mixed_images() {
1592 let temp = tempfile::tempdir().unwrap();
1593 let image_path = temp.path().join("mixed-native-tool-result.png");
1594 std::fs::write(
1595 &image_path,
1596 [0x89, b'P', b'N', b'G', b'\r', b'\n', 0x1a, b'\n'],
1597 )
1598 .unwrap();
1599
1600 let native_tool_content = serde_json::json!({
1601 "tool_call_id": "tc1",
1602 "content": format!(
1603 "generated [IMAGE:{}] and [IMAGE:https://example.com/missing.png]",
1604 image_path.display()
1605 ),
1606 })
1607 .to_string();
1608
1609 let prepared = prepare_messages_for_provider(
1610 &[ChatMessage::tool(native_tool_content)],
1611 &MultimodalConfig::default(),
1612 )
1613 .await
1614 .expect("valid native tool image should survive while bad ref is skipped");
1615
1616 assert!(prepared.contains_images);
1617 assert_eq!(prepared.messages.len(), 1);
1618
1619 let value: serde_json::Value = serde_json::from_str(&prepared.messages[0].content)
1620 .expect("native tool result must remain valid JSON");
1621 assert_eq!(
1622 value.get("tool_call_id").and_then(|v| v.as_str()),
1623 Some("tc1")
1624 );
1625
1626 let inner = value
1627 .get("content")
1628 .and_then(|v| v.as_str())
1629 .expect("content should remain a JSON string");
1630 assert!(inner.contains("generated"));
1631 assert!(inner.contains("data:image/png;base64,"));
1632 assert!(inner.contains("1 of 2 attached image(s) could not be loaded"));
1633 assert!(!inner.contains("mixed-native-tool-result.png"));
1634 assert!(!inner.contains("https://example.com/missing.png"));
1635 }
1636
1637 #[tokio::test]
1638 async fn prepare_messages_strips_stale_native_tool_result_images() {
1639 let temp = tempfile::tempdir().unwrap();
1640 let image_path = temp.path().join("stale-native-tool-result.png");
1641 std::fs::write(
1642 &image_path,
1643 [0x89, b'P', b'N', b'G', b'\r', b'\n', 0x1a, b'\n'],
1644 )
1645 .unwrap();
1646
1647 let native_tool_content = serde_json::json!({
1648 "tool_call_id": "tc1",
1649 "content": format!("generated screenshot [IMAGE:{}]", image_path.display().to_string()),
1650 })
1651 .to_string();
1652
1653 let messages = vec![
1654 ChatMessage::tool(native_tool_content),
1655 ChatMessage {
1656 role: "assistant".to_string(),
1657 content: "I generated the screenshot.".to_string(),
1658 },
1659 ChatMessage::user("What happened next?".to_string()),
1660 ];
1661
1662 let prepared = prepare_messages_for_provider(&messages, &MultimodalConfig::default())
1663 .await
1664 .expect("preparation should strip stale tool images without loading them");
1665
1666 assert!(
1667 !prepared.contains_images,
1668 "stale tool-result images should not keep the request in vision mode"
1669 );
1670
1671 let value: serde_json::Value = serde_json::from_str(&prepared.messages[0].content)
1672 .expect("stale native tool result should remain valid JSON");
1673 assert_eq!(
1674 value.get("tool_call_id").and_then(|v| v.as_str()),
1675 Some("tc1")
1676 );
1677
1678 let inner = value
1679 .get("content")
1680 .and_then(|v| v.as_str())
1681 .expect("content should remain a JSON string");
1682 assert!(inner.contains("generated screenshot"));
1683 assert!(!inner.contains("[IMAGE:"));
1684 assert!(!inner.contains("data:image"));
1685 assert!(!inner.contains("stale-native-tool-result.png"));
1686 }
1687
1688 #[tokio::test]
1689 async fn prepare_messages_strips_stale_prompt_tool_result_images() {
1690 let temp = tempfile::tempdir().unwrap();
1691 let image_path = temp.path().join("stale-prompt-tool-result.png");
1692 std::fs::write(
1693 &image_path,
1694 [0x89, b'P', b'N', b'G', b'\r', b'\n', 0x1a, b'\n'],
1695 )
1696 .unwrap();
1697
1698 let messages = vec![
1699 ChatMessage::user(format!(
1700 "[Tool results]\n<tool_result name=\"image_gen\">Generated [IMAGE:{}]</tool_result>",
1701 image_path.display()
1702 )),
1703 ChatMessage {
1704 role: "assistant".to_string(),
1705 content: "I generated the screenshot.".to_string(),
1706 },
1707 ChatMessage::user("Continue.".to_string()),
1708 ];
1709
1710 let prepared = prepare_messages_for_provider(&messages, &MultimodalConfig::default())
1711 .await
1712 .expect("preparation should strip stale prompt-mode tool images");
1713
1714 assert!(!prepared.contains_images);
1715 assert!(prepared.messages[0].content.contains("[Tool results]"));
1716 assert!(prepared.messages[0].content.contains("Generated"));
1717 assert!(!prepared.messages[0].content.contains("[IMAGE:"));
1718 assert!(!prepared.messages[0].content.contains("data:image"));
1719 assert!(
1720 !prepared.messages[0]
1721 .content
1722 .contains("stale-prompt-tool-result.png")
1723 );
1724 }
1725
1726 #[tokio::test]
1727 async fn prepare_messages_strips_stale_tool_image_while_normalizing_current_user_image() {
1728 let temp = tempfile::tempdir().unwrap();
1729 let stale_path = temp.path().join("stale-tool-result.png");
1730 let fresh_path = temp.path().join("fresh-user-image.png");
1731 let png = [0x89, b'P', b'N', b'G', b'\r', b'\n', 0x1a, b'\n'];
1732 std::fs::write(&stale_path, png).unwrap();
1733 std::fs::write(&fresh_path, png).unwrap();
1734
1735 let native_tool_content = serde_json::json!({
1736 "tool_call_id": "tc1",
1737 "content": format!("generated screenshot [IMAGE:{}]", stale_path.display().to_string()),
1738 })
1739 .to_string();
1740
1741 let messages = vec![
1742 ChatMessage::tool(native_tool_content),
1743 ChatMessage {
1744 role: "assistant".to_string(),
1745 content: "I generated the screenshot.".to_string(),
1746 },
1747 ChatMessage::user(format!(
1748 "Now inspect this [IMAGE:{}]",
1749 fresh_path.display().to_string()
1750 )),
1751 ];
1752
1753 let prepared = prepare_messages_for_provider(&messages, &MultimodalConfig::default())
1754 .await
1755 .expect("preparation should strip stale tool images and normalize current user image");
1756
1757 assert!(prepared.contains_images);
1758
1759 let value: serde_json::Value = serde_json::from_str(&prepared.messages[0].content)
1760 .expect("stale native tool result should remain valid JSON");
1761 let inner = value
1762 .get("content")
1763 .and_then(|v| v.as_str())
1764 .expect("content should remain a JSON string");
1765 assert!(inner.contains("generated screenshot"));
1766 assert!(!inner.contains("[IMAGE:"));
1767 assert!(!inner.contains("data:image"));
1768 assert!(!inner.contains("stale-tool-result.png"));
1769
1770 let (cleaned, refs) = parse_image_markers(&prepared.messages[2].content);
1771 assert_eq!(cleaned, "Now inspect this");
1772 assert_eq!(refs.len(), 1);
1773 assert!(refs[0].starts_with("data:image/png;base64,"));
1774 assert!(
1775 !prepared.messages[2]
1776 .content
1777 .contains("fresh-user-image.png")
1778 );
1779 }
1780
1781 #[test]
1782 fn count_image_markers_ignores_stale_tool_results() {
1783 let messages = vec![
1784 ChatMessage::tool("[IMAGE:/tmp/stale-tool.png]\nGenerated".to_string()),
1785 ChatMessage {
1786 role: "assistant".to_string(),
1787 content: "Done.".to_string(),
1788 },
1789 ChatMessage::user("Next question".to_string()),
1790 ];
1791
1792 assert_eq!(count_image_markers(&messages), 0);
1793
1794 let messages = vec![
1795 ChatMessage::user("Create an image".to_string()),
1796 ChatMessage::tool("[IMAGE:/tmp/latest-tool.png]\nGenerated".to_string()),
1797 ];
1798
1799 assert_eq!(count_image_markers(&messages), 1);
1800 }
1801
1802 #[tokio::test]
1803 async fn prepare_messages_trims_excess_images_from_older_messages() {
1804 let messages = vec![
1807 ChatMessage::user("[IMAGE:/tmp/old.png]\nOld caption".to_string()),
1808 ChatMessage::user("[IMAGE:/tmp/mid.png]\nMid caption".to_string()),
1809 ChatMessage::user("[IMAGE:/tmp/new.png]\nNew caption".to_string()),
1810 ];
1811
1812 let trimmed = trim_old_images(&messages, 2);
1817 assert_eq!(trimmed.len(), 3);
1818
1819 let (_, refs0) = parse_image_markers(&trimmed[0].content);
1821 assert!(refs0.is_empty(), "oldest image should be stripped");
1822 assert!(trimmed[0].content.contains("Old caption"));
1823
1824 let (_, refs1) = parse_image_markers(&trimmed[1].content);
1826 assert_eq!(refs1.len(), 1);
1827 let (_, refs2) = parse_image_markers(&trimmed[2].content);
1828 assert_eq!(refs2.len(), 1);
1829 }
1830
1831 #[test]
1832 fn trim_old_images_replaces_image_only_message() {
1833 let messages = vec![
1835 ChatMessage::user("[IMAGE:/tmp/old.png]".to_string()),
1836 ChatMessage::user("[IMAGE:/tmp/new.png]\nKeep this".to_string()),
1837 ];
1838
1839 let trimmed = trim_old_images(&messages, 1);
1840 assert_eq!(trimmed[0].content, "[image removed from history]");
1841 assert!(trimmed[1].content.contains("[IMAGE:/tmp/new.png]"));
1842 }
1843
1844 #[test]
1845 fn trim_old_images_multi_image_message_stripped_as_unit() {
1846 let messages = vec![
1851 ChatMessage::user(
1852 "[IMAGE:/tmp/a.png]\n[IMAGE:/tmp/b.png]\n[IMAGE:/tmp/c.png]\nThree pics"
1853 .to_string(),
1854 ),
1855 ChatMessage::user("Just text, no images".to_string()),
1856 ];
1857
1858 let trimmed = trim_old_images(&messages, 1);
1859 assert_eq!(trimmed.len(), 2);
1860 let (_, refs0) = parse_image_markers(&trimmed[0].content);
1862 assert!(refs0.is_empty());
1863 assert!(trimmed[0].content.contains("Three pics"));
1864 assert_eq!(trimmed[1].content, "Just text, no images");
1866 }
1867
1868 #[test]
1869 fn trim_old_images_skips_assistant_messages() {
1870 let messages = vec![
1872 ChatMessage {
1873 role: "assistant".to_string(),
1874 content: "[IMAGE:/tmp/assistant.png]\nAssistant generated".to_string(),
1875 },
1876 ChatMessage::user("[IMAGE:/tmp/user1.png]\nFirst".to_string()),
1877 ChatMessage::user("[IMAGE:/tmp/user2.png]\nSecond".to_string()),
1878 ];
1879
1880 let trimmed = trim_old_images(&messages, 1);
1881 assert!(trimmed[0].content.contains("[IMAGE:/tmp/assistant.png]"));
1883 let (_, refs1) = parse_image_markers(&trimmed[1].content);
1885 assert!(refs1.is_empty());
1886 assert!(trimmed[1].content.contains("First"));
1887 let (_, refs2) = parse_image_markers(&trimmed[2].content);
1889 assert_eq!(refs2.len(), 1);
1890 }
1891
1892 #[test]
1893 fn trim_old_images_counts_latest_tool_messages() {
1894 let messages = vec![
1895 ChatMessage::user("[IMAGE:/tmp/user-old.png]\nOldest".to_string()),
1896 ChatMessage::tool("[IMAGE:/tmp/tool-new.png]\nGenerated".to_string()),
1897 ];
1898
1899 let trimmed = trim_old_images(&messages, 1);
1900 let (_, refs0) = parse_image_markers(&trimmed[0].content);
1901 assert!(refs0.is_empty(), "oldest user image should be stripped");
1902 assert!(trimmed[0].content.contains("Oldest"));
1903
1904 let (_, refs1) = parse_image_markers(&trimmed[1].content);
1905 assert_eq!(refs1.len(), 1);
1906 }
1907
1908 #[test]
1909 fn trim_old_images_no_trimming_when_under_limit() {
1910 let messages = vec![
1911 ChatMessage::user("[IMAGE:/tmp/a.png]\nCaption A".to_string()),
1912 ChatMessage::user("[IMAGE:/tmp/b.png]\nCaption B".to_string()),
1913 ];
1914
1915 let trimmed = trim_old_images(&messages, 5);
1916 assert_eq!(trimmed[0].content, messages[0].content);
1918 assert_eq!(trimmed[1].content, messages[1].content);
1919 }
1920
1921 #[test]
1922 fn trim_old_images_no_trimming_when_exactly_at_limit() {
1923 let messages = vec![
1924 ChatMessage::user("[IMAGE:/tmp/a.png]\nA".to_string()),
1925 ChatMessage::user("[IMAGE:/tmp/b.png]\nB".to_string()),
1926 ];
1927
1928 let trimmed = trim_old_images(&messages, 2);
1929 assert_eq!(trimmed[0].content, messages[0].content);
1930 assert_eq!(trimmed[1].content, messages[1].content);
1931 }
1932
1933 #[test]
1934 fn trim_old_images_empty_messages() {
1935 let trimmed = trim_old_images(&[], 4);
1936 assert!(trimmed.is_empty());
1937 }
1938
1939 #[test]
1940 fn trim_old_images_interleaved_roles() {
1941 let messages = vec![
1944 ChatMessage::user("[IMAGE:/tmp/1.png]\nLook at this".to_string()),
1945 ChatMessage {
1946 role: "assistant".to_string(),
1947 content: "I see a photo.".to_string(),
1948 },
1949 ChatMessage::user("[IMAGE:/tmp/2.png]\nWhat about this?".to_string()),
1950 ChatMessage {
1951 role: "assistant".to_string(),
1952 content: "That's a chart.".to_string(),
1953 },
1954 ChatMessage::user("[IMAGE:/tmp/3.png]\nAnd this one".to_string()),
1955 ];
1956
1957 let trimmed = trim_old_images(&messages, 2);
1958 assert_eq!(trimmed.len(), 5);
1959 let (_, refs0) = parse_image_markers(&trimmed[0].content);
1961 assert!(refs0.is_empty());
1962 assert!(trimmed[0].content.contains("Look at this"));
1963 assert_eq!(trimmed[1].content, "I see a photo.");
1965 assert_eq!(trimmed[3].content, "That's a chart.");
1966 let (_, refs2) = parse_image_markers(&trimmed[2].content);
1968 assert_eq!(refs2.len(), 1);
1969 let (_, refs4) = parse_image_markers(&trimmed[4].content);
1970 assert_eq!(refs4.len(), 1);
1971 }
1972
1973 #[test]
1974 fn trim_old_images_strips_multiple_oldest_messages() {
1975 let messages: Vec<ChatMessage> = (1..=5)
1977 .map(|i| ChatMessage::user(format!("[IMAGE:/tmp/{i}.png]\nCaption {i}")))
1978 .collect();
1979
1980 let trimmed = trim_old_images(&messages, 1);
1981 assert_eq!(trimmed.len(), 5);
1982 for (i, msg) in trimmed.iter().enumerate().take(4) {
1983 let (_, refs) = parse_image_markers(&msg.content);
1984 assert!(refs.is_empty(), "message {i} should have images stripped");
1985 assert!(msg.content.contains(&format!("Caption {}", i + 1)));
1986 }
1987 let (_, refs_last) = parse_image_markers(&trimmed[4].content);
1989 assert_eq!(refs_last.len(), 1);
1990 }
1991
1992 #[tokio::test]
1993 async fn prepare_messages_trims_then_normalizes_surviving_images() {
1994 let temp = tempfile::tempdir().unwrap();
1997 let mut paths = Vec::new();
1998 for name in ["old.png", "mid.png", "new.png"] {
1999 let p = temp.path().join(name);
2000 let png_data = [
2002 0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A, 0x00, 0x00, 0x00, 0x0D, 0x49, 0x48, 0x44, 0x52, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x08, 0x02, 0x00, 0x00, 0x00, 0x90,
2005 0x77, 0x53, 0xDE, 0x00, 0x00, 0x00, 0x0C, 0x49, 0x44, 0x41, 0x54, 0x08, 0xD7, 0x63, 0xF8, 0xCF, 0xC0, 0x00, 0x00, 0x00, 0x02, 0x00, 0x01, 0xE2, 0x21,
2008 0xBC, 0x33, 0x00, 0x00, 0x00, 0x00, 0x49, 0x45, 0x4E, 0x44, 0xAE, 0x42, 0x60, 0x82,
2011 ];
2012 std::fs::write(&p, png_data).unwrap();
2013 paths.push(p);
2014 }
2015
2016 let messages = vec![
2017 ChatMessage::user(format!("[IMAGE:{}]\nOld", paths[0].display().to_string())),
2018 ChatMessage::user(format!("[IMAGE:{}]\nMid", paths[1].display().to_string())),
2019 ChatMessage::user(format!("[IMAGE:{}]\nNew", paths[2].display().to_string())),
2020 ];
2021
2022 let config = MultimodalConfig {
2023 max_images: 2,
2024 max_image_size_mb: 5,
2025 allow_remote_fetch: false,
2026 ..Default::default()
2027 };
2028
2029 let result = prepare_messages_for_provider(&messages, &config)
2030 .await
2031 .expect("should succeed after trimming");
2032
2033 assert!(result.contains_images);
2034 assert_eq!(result.messages.len(), 3);
2035 assert!(!result.messages[0].content.contains("data:image"));
2037 assert!(result.messages[0].content.contains("Old"));
2038 assert!(result.messages[1].content.contains("data:image"));
2040 assert!(result.messages[2].content.contains("data:image"));
2041 }
2042
2043 #[tokio::test]
2044 async fn prepare_messages_skips_remote_url_when_disabled() {
2045 let messages = vec![ChatMessage::user(
2046 "Look [IMAGE:https://example.com/img.png]".to_string(),
2047 )];
2048
2049 let result = prepare_messages_for_provider(&messages, &MultimodalConfig::default())
2050 .await
2051 .expect("disabled remote image should be skipped");
2052
2053 assert!(!result.contains_images);
2054 assert_eq!(result.messages.len(), 1);
2055 assert!(result.messages[0].content.contains("Look"));
2056 assert!(
2057 result.messages[0]
2058 .content
2059 .contains("1 attached image(s) could not be loaded")
2060 );
2061 assert!(
2062 !result.messages[0]
2063 .content
2064 .contains("https://example.com/img.png")
2065 );
2066 }
2067
2068 #[tokio::test]
2069 async fn prepare_messages_skips_oversized_local_image() {
2070 let temp = tempfile::tempdir().unwrap();
2071 let image_path = temp.path().join("big.png");
2072
2073 let bytes = vec![0u8; 1024 * 1024 + 1];
2074 std::fs::write(&image_path, bytes).unwrap();
2075
2076 let messages = vec![ChatMessage::user(format!(
2077 "[IMAGE:{}]",
2078 image_path.display()
2079 ))];
2080 let config = MultimodalConfig {
2081 max_images: 4,
2082 max_image_size_mb: 1,
2083 allow_remote_fetch: false,
2084 ..Default::default()
2085 };
2086
2087 let result = prepare_messages_for_provider(&messages, &config)
2088 .await
2089 .expect("oversized local image should be skipped");
2090
2091 assert!(!result.contains_images);
2092 assert_eq!(result.messages.len(), 1);
2093 assert!(
2094 result.messages[0]
2095 .content
2096 .contains("1 attached image(s) could not be loaded")
2097 );
2098 assert!(
2099 !result.messages[0]
2100 .content
2101 .contains(image_path.to_string_lossy().as_ref())
2102 );
2103 }
2104
2105 #[tokio::test]
2106 async fn prepare_messages_keeps_successful_images_when_some_are_skipped() {
2107 let temp = tempfile::tempdir().unwrap();
2108 let image_path = temp.path().join("ok.png");
2109 std::fs::write(
2110 &image_path,
2111 [0x89, b'P', b'N', b'G', b'\r', b'\n', 0x1a, b'\n'],
2112 )
2113 .unwrap();
2114
2115 let messages = vec![ChatMessage::user(format!(
2116 "Look [IMAGE:{}] and [IMAGE:https://example.com/missing.png]",
2117 image_path.display()
2118 ))];
2119
2120 let result = prepare_messages_for_provider(&messages, &MultimodalConfig::default())
2121 .await
2122 .expect("valid local image should survive while remote image is skipped");
2123
2124 assert!(result.contains_images);
2125 assert!(
2126 result.messages[0]
2127 .content
2128 .contains("data:image/png;base64,")
2129 );
2130 assert!(
2131 result.messages[0]
2132 .content
2133 .contains("1 of 2 attached image(s) could not be loaded")
2134 );
2135 assert!(
2136 !result.messages[0]
2137 .content
2138 .contains("https://example.com/missing.png")
2139 );
2140 }
2141
2142 #[tokio::test]
2143 async fn skipped_images_do_not_consume_image_budget() {
2144 let temp = tempfile::tempdir().unwrap();
2145 let image_path = temp.path().join("older-valid.png");
2146 std::fs::write(
2147 &image_path,
2148 [0x89, b'P', b'N', b'G', b'\r', b'\n', 0x1a, b'\n'],
2149 )
2150 .unwrap();
2151
2152 let messages = vec![
2153 ChatMessage::user(format!(
2154 "Older valid image [IMAGE:{}]",
2155 image_path.display()
2156 )),
2157 ChatMessage::user(
2158 "Newer broken image [IMAGE:https://example.com/missing.png]".to_string(),
2159 ),
2160 ];
2161 let config = MultimodalConfig {
2162 max_images: 1,
2163 max_image_size_mb: 5,
2164 allow_remote_fetch: false,
2165 ..Default::default()
2166 };
2167
2168 let result = prepare_messages_for_provider(&messages, &config)
2169 .await
2170 .expect("broken image should not evict an older valid image");
2171
2172 assert!(result.contains_images);
2173 assert!(
2174 result.messages[0]
2175 .content
2176 .contains("data:image/png;base64,")
2177 );
2178 assert!(result.messages[1].content.contains("Newer broken image"));
2179 assert!(
2180 result.messages[1]
2181 .content
2182 .contains("1 attached image(s) could not be loaded")
2183 );
2184 assert!(
2185 !result.messages[1]
2186 .content
2187 .contains("https://example.com/missing.png")
2188 );
2189 }
2190
2191 #[test]
2192 fn extract_ollama_image_payload_supports_data_uris() {
2193 let payload = extract_ollama_image_payload("data:image/png;base64,abcd==")
2194 .expect("payload should be extracted");
2195 assert_eq!(payload, "abcd==");
2196 }
2197
2198 #[test]
2201 fn parse_image_markers_strips_markers_leaving_caption() {
2202 let input = "[IMAGE:/tmp/photo.jpg]\n\nDescribe this screenshot";
2203 let (cleaned, refs) = parse_image_markers(input);
2204 assert_eq!(cleaned, "Describe this screenshot");
2205 assert_eq!(refs.len(), 1);
2206 assert_eq!(refs[0], "/tmp/photo.jpg");
2207 }
2208
2209 #[test]
2212 fn parse_image_markers_image_only_message_becomes_empty() {
2213 let input = "[IMAGE:/tmp/photo.jpg]";
2214 let (cleaned, refs) = parse_image_markers(input);
2215 assert!(
2216 cleaned.is_empty(),
2217 "expected empty string, got: {cleaned:?}"
2218 );
2219 assert_eq!(refs.len(), 1);
2220 }
2221}