1use base64::{Engine as _, engine::general_purpose::STANDARD};
2use reqwest::Client;
3use std::collections::HashSet;
4use std::path::Path;
5use zeroclaw_api::model_provider::ChatMessage;
6use zeroclaw_config::schema::{MultimodalConfig, build_runtime_proxy_client_with_timeouts};
7
/// Opening token of an inline image marker; a complete marker is written `[IMAGE:<reference>]`.
const IMAGE_MARKER_PREFIX: &str = "[IMAGE:";
/// MIME types accepted by `validate_mime`; every other type is reported as unsupported.
const ALLOWED_IMAGE_MIME_TYPES: &[&str] = &[
    "image/png",
    "image/jpeg",
    "image/webp",
    "image/gif",
    "image/bmp",
];
16
/// Output of `prepare_messages_for_provider`: the rewritten conversation plus an image flag.
#[derive(Debug, Clone)]
pub struct PreparedMessages {
    /// Messages after image markers have been normalized, trimmed, or stripped.
    pub messages: Vec<ChatMessage>,
    /// `true` when at least one countable image marker remains in `messages`.
    pub contains_images: bool,
}
22
/// Failures that can occur while resolving a single image reference.
///
/// In every variant `input` holds the offending image reference as it appeared in the marker.
#[derive(Debug, thiserror::Error)]
pub enum MultimodalError {
    /// More images were found than allowed.
    /// NOTE(review): not constructed in the visible part of this file; only matched when
    /// classifying errors for logging.
    #[error("multimodal image limit exceeded: max_images={max_images}, found={found}")]
    TooManyImages { max_images: usize, found: usize },

    /// The image (declared, on-disk, downloaded, or decoded size) exceeds `max_bytes`.
    #[error(
        "multimodal image size limit exceeded for '{input}': {size_bytes} bytes > {max_bytes} bytes"
    )]
    ImageTooLarge {
        input: String,
        size_bytes: usize,
        max_bytes: usize,
    },

    /// The MIME type is not in the allow-list; `mime` is `"unknown"` when none could be detected.
    #[error("multimodal image MIME type is not allowed for '{input}': {mime}")]
    UnsupportedMime { input: String, mime: String },

    /// An `http://` / `https://` reference was found while remote fetching is turned off.
    #[error("multimodal remote image fetch is disabled for '{input}'")]
    RemoteFetchDisabled { input: String },

    /// The local path does not exist or is not a regular file.
    #[error("multimodal image source not found or unreadable: '{input}'")]
    ImageSourceNotFound { input: String },

    /// The marker content is malformed (for example a data URI without a base64 payload).
    #[error("invalid multimodal image marker '{input}': {reason}")]
    InvalidMarker { input: String, reason: String },

    /// The HTTP request failed, returned a non-success status, or the body could not be read.
    #[error("failed to download remote image '{input}': {reason}")]
    RemoteFetchFailed { input: String, reason: String },

    /// Reading metadata or contents of a local file failed.
    #[error("failed to read local image '{input}': {reason}")]
    LocalReadFailed { input: String, reason: String },
}
55
56fn is_loadable_image_reference(candidate: &str) -> bool {
62 candidate.starts_with('/')
63 || candidate.starts_with("http://")
64 || candidate.starts_with("https://")
65 || candidate.starts_with("data:")
66 || is_windows_path(candidate)
67}
68
/// Returns `true` when `candidate` starts with a Windows drive prefix.
///
/// The first three characters must be an ASCII letter, a colon, and either `\` or `/`
/// (for example `C:\` or `d:/`). Shorter inputs never match.
fn is_windows_path(candidate: &str) -> bool {
    let mut head = candidate.chars();
    match (head.next(), head.next(), head.next()) {
        (Some(drive), Some(':'), Some('\\' | '/')) => drive.is_ascii_alphabetic(),
        _ => false,
    }
}
86
/// Re-joins a marker reference that was hard-wrapped across several lines.
///
/// Every `\n` / `\r` is removed together with the whitespace that immediately follows it
/// (the continuation indent). Whitespace elsewhere, such as spaces inside a path, is kept.
/// The result is trimmed at both ends.
fn collapse_wrapped_marker(raw: &str) -> String {
    // Fast path: nothing was wrapped, so only the outer trim applies.
    if !raw.contains(['\n', '\r']) {
        return raw.trim().to_string();
    }

    let mut pieces = raw.split(['\n', '\r']);
    let mut joined = String::with_capacity(raw.len());

    // Text before the first line break is kept verbatim.
    if let Some(first) = pieces.next() {
        joined.push_str(first);
    }
    // Each continuation loses its leading indentation before being appended.
    for continuation in pieces {
        joined.push_str(continuation.trim_start());
    }

    joined.trim().to_string()
}
114
115pub fn parse_image_markers(content: &str) -> (String, Vec<String>) {
116 let mut refs = Vec::new();
117 let mut cleaned = String::with_capacity(content.len());
118 let mut cursor = 0usize;
119
120 while let Some(rel_start) = content[cursor..].find(IMAGE_MARKER_PREFIX) {
121 let start = cursor + rel_start;
122 cleaned.push_str(&content[cursor..start]);
123
124 let marker_start = start + IMAGE_MARKER_PREFIX.len();
125 let Some(rel_end) = content[marker_start..].find(']') else {
126 cleaned.push_str(&content[start..]);
127 cursor = content.len();
128 break;
129 };
130
131 let end = marker_start + rel_end;
132 let candidate = collapse_wrapped_marker(&content[marker_start..end]);
133
134 if candidate.is_empty() || !is_loadable_image_reference(&candidate) {
135 cleaned.push_str(&content[start..=end]);
139 } else {
140 refs.push(candidate);
141 }
142
143 cursor = end + 1;
144 }
145
146 if cursor < content.len() {
147 cleaned.push_str(&content[cursor..]);
148 }
149
150 (cleaned.trim().to_string(), refs)
151}
152
153pub fn count_image_markers(messages: &[ChatMessage]) -> usize {
154 let latest_tool_indices = latest_tool_result_indices(messages);
155 count_image_markers_with_latest_tool_results(messages, &latest_tool_indices)
156}
157
158fn count_image_markers_with_latest_tool_results(
159 messages: &[ChatMessage],
160 latest_tool_result_indices: &HashSet<usize>,
161) -> usize {
162 messages
163 .iter()
164 .enumerate()
165 .filter(|(index, message)| {
166 should_normalize_message_images(*index, message, latest_tool_result_indices)
167 })
168 .map(|(_, message)| parse_image_markers(&message.content).1.len())
169 .sum()
170}
171
172pub fn contains_image_markers(messages: &[ChatMessage]) -> bool {
173 count_image_markers(messages) > 0
174}
175
176pub fn strip_media_markers(text: &str) -> String {
193 static RE: std::sync::LazyLock<regex::Regex> = std::sync::LazyLock::new(|| {
194 regex::Regex::new(r"(?i)\[(?:IMAGE|PHOTO|DOCUMENT|FILE|VIDEO|VOICE|AUDIO):[^\]]*\]")
195 .unwrap()
196 });
197 RE.replace_all(text, "[media attachment]").into_owned()
198}
199
/// Extracts the payload string to send for an image reference.
///
/// For a `data:` URI this is everything after the first comma; for any other reference it is
/// the reference itself. The payload is trimmed, and `None` is returned when it is empty or
/// when a `data:` URI has no comma at all.
pub fn extract_ollama_image_payload(image_ref: &str) -> Option<String> {
    let payload = if image_ref.starts_with("data:") {
        let (_, after_comma) = image_ref.split_once(',')?;
        after_comma.trim()
    } else {
        image_ref.trim()
    };

    (!payload.is_empty()).then(|| payload.to_string())
}
214
215fn is_prompt_tool_result_message(message: &ChatMessage) -> bool {
216 message.role == "user" && message.content.trim_start().starts_with("[Tool results]")
217}
218
219fn is_tool_result_carrier(message: &ChatMessage) -> bool {
220 message.role == "tool" || is_prompt_tool_result_message(message)
221}
222
223fn latest_tool_result_indices(messages: &[ChatMessage]) -> HashSet<usize> {
224 let mut indices = HashSet::new();
225 let Some((last_index, last_message)) = messages.iter().enumerate().next_back() else {
226 return indices;
227 };
228
229 if is_prompt_tool_result_message(last_message) {
230 indices.insert(last_index);
231 return indices;
232 }
233
234 if last_message.role == "tool" {
235 for (index, message) in messages.iter().enumerate().rev() {
236 if message.role != "tool" {
237 break;
238 }
239 indices.insert(index);
240 }
241 }
242
243 indices
244}
245
246fn should_normalize_message_images(
247 index: usize,
248 message: &ChatMessage,
249 latest_tool_result_indices: &HashSet<usize>,
250) -> bool {
251 if is_tool_result_carrier(message) {
252 return latest_tool_result_indices.contains(&index);
253 }
254
255 message.role == "user"
256}
257
258fn stripped_image_marker_text(content: &str) -> String {
259 let (cleaned, refs) = parse_image_markers(content);
260 if refs.is_empty() {
261 return content.to_string();
262 }
263
264 if cleaned.trim().is_empty() {
265 "[image removed from history]".to_string()
266 } else {
267 cleaned
268 }
269}
270
271fn strip_tool_result_image_markers(message: &ChatMessage) -> ChatMessage {
272 if !message.content.contains(IMAGE_MARKER_PREFIX) {
273 return message.clone();
274 }
275
276 if message.role == "tool"
277 && let Ok(serde_json::Value::Object(mut obj)) =
278 serde_json::from_str::<serde_json::Value>(&message.content)
279 && let Some(serde_json::Value::String(inner)) = obj.get("content").cloned()
280 {
281 let stripped = stripped_image_marker_text(&inner);
282 if stripped == inner {
283 return message.clone();
284 }
285
286 obj.insert("content".to_string(), serde_json::Value::String(stripped));
287 return ChatMessage {
288 role: message.role.clone(),
289 content: serde_json::Value::Object(obj).to_string(),
290 };
291 }
292
293 ChatMessage {
294 role: message.role.clone(),
295 content: stripped_image_marker_text(&message.content),
296 }
297}
298
299fn replay_message_without_stale_tool_images(
300 index: usize,
301 message: &ChatMessage,
302 latest_tool_result_indices: &HashSet<usize>,
303) -> ChatMessage {
304 if is_tool_result_carrier(message) && !latest_tool_result_indices.contains(&index) {
305 strip_tool_result_image_markers(message)
306 } else {
307 message.clone()
308 }
309}
310
311async fn normalize_native_tool_result_json(
323 content: &str,
324 config: &MultimodalConfig,
325 max_bytes: usize,
326 remote_client: &Client,
327) -> Option<(String, bool)> {
328 let Ok(serde_json::Value::Object(mut obj)) = serde_json::from_str::<serde_json::Value>(content)
329 else {
330 return None;
331 };
332
333 let Some(serde_json::Value::String(inner)) = obj.get("content").cloned() else {
334 return None;
335 };
336
337 let (cleaned_text, refs) = parse_image_markers(&inner);
338 if refs.is_empty() {
339 return None;
340 }
341
342 let normalized = normalize_image_references(&refs, config, max_bytes, remote_client).await;
343 let new_inner = compose_multimodal_content(
344 &cleaned_text,
345 &normalized.data_uris,
346 normalized.skipped_count,
347 refs.len(),
348 );
349 obj.insert("content".to_string(), serde_json::Value::String(new_inner));
350
351 Some((
352 serde_json::Value::Object(obj).to_string(),
353 !normalized.data_uris.is_empty(),
354 ))
355}
356
357pub async fn prepare_messages_for_provider(
358 messages: &[ChatMessage],
359 config: &MultimodalConfig,
360) -> anyhow::Result<PreparedMessages> {
361 let (max_images, max_image_size_mb) = config.effective_limits();
362 let max_bytes = max_image_size_mb.saturating_mul(1024 * 1024);
363
364 let latest_tool_indices = latest_tool_result_indices(messages);
365 let total_images = count_image_markers_with_latest_tool_results(messages, &latest_tool_indices);
366
367 if total_images == 0 {
368 return Ok(PreparedMessages {
369 messages: messages
370 .iter()
371 .enumerate()
372 .map(|(index, message)| {
373 replay_message_without_stale_tool_images(index, message, &latest_tool_indices)
374 })
375 .collect(),
376 contains_images: false,
377 });
378 }
379
380 let trimmed = if total_images > max_images {
385 trim_old_images(messages, max_images)
386 } else {
387 messages.to_vec()
388 };
389
390 let remote_client = build_runtime_proxy_client_with_timeouts("model_provider.ollama", 30, 10);
391 let latest_tool_indices = latest_tool_result_indices(&trimmed);
392
393 let mut normalized_messages = Vec::with_capacity(messages.len());
394 let mut has_successful_images = false;
395 for (index, message) in messages.iter().enumerate() {
396 if !should_normalize_message_images(index, message, &latest_tool_indices) {
397 normalized_messages.push(replay_message_without_stale_tool_images(
398 index,
399 message,
400 &latest_tool_indices,
401 ));
402 continue;
403 }
404
405 if message.role == "tool"
416 && let Some((prepared, contains_images)) = normalize_native_tool_result_json(
417 &message.content,
418 config,
419 max_bytes,
420 &remote_client,
421 )
422 .await
423 {
424 normalized_messages.push(ChatMessage {
425 role: message.role.clone(),
426 content: prepared,
427 });
428 has_successful_images |= contains_images;
429 continue;
430 }
431
432 let (cleaned_text, refs) = parse_image_markers(&message.content);
433 if refs.is_empty() {
434 normalized_messages.push(message.clone());
435 continue;
436 }
437
438 let normalized = normalize_image_references(&refs, config, max_bytes, &remote_client).await;
439 let content = compose_multimodal_content(
440 &cleaned_text,
441 &normalized.data_uris,
442 normalized.skipped_count,
443 refs.len(),
444 );
445 has_successful_images |= !normalized.data_uris.is_empty();
446 normalized_messages.push(ChatMessage {
447 role: message.role.clone(),
448 content,
449 });
450 }
451
452 let capped_messages =
455 if has_successful_images && count_image_markers(&normalized_messages) > max_images {
456 trim_old_images(&normalized_messages, max_images)
457 } else {
458 normalized_messages
459 };
460
461 Ok(PreparedMessages {
462 contains_images: count_image_markers(&capped_messages) > 0,
463 messages: capped_messages,
464 })
465}
466
467fn trim_old_images(messages: &[ChatMessage], max_images: usize) -> Vec<ChatMessage> {
470 let latest_tool_indices = latest_tool_result_indices(messages);
471 let image_positions: Vec<(usize, usize)> = messages
473 .iter()
474 .enumerate()
475 .filter(|(index, message)| {
476 should_normalize_message_images(*index, message, &latest_tool_indices)
477 })
478 .filter_map(|(i, m)| {
479 let count = parse_image_markers(&m.content).1.len();
480 if count > 0 { Some((i, count)) } else { None }
481 })
482 .collect();
483
484 let total: usize = image_positions.iter().map(|(_, c)| c).sum();
486 let mut to_drop = total.saturating_sub(max_images);
487
488 let mut strip_indices = std::collections::HashSet::new();
490 for &(idx, count) in &image_positions {
491 if to_drop == 0 {
492 break;
493 }
494 strip_indices.insert(idx);
495 to_drop = to_drop.saturating_sub(count);
496 }
497
498 messages
499 .iter()
500 .enumerate()
501 .map(|(i, m)| {
502 if strip_indices.contains(&i) {
503 let (cleaned, _) = parse_image_markers(&m.content);
504 let text = if cleaned.trim().is_empty() {
505 "[image removed from history]".to_string()
506 } else {
507 cleaned
508 };
509 ChatMessage {
510 role: m.role.clone(),
511 content: text,
512 }
513 } else {
514 replay_message_without_stale_tool_images(i, m, &latest_tool_indices)
515 }
516 })
517 .collect()
518}
519
520fn compose_multimodal_message(text: &str, data_uris: &[String]) -> String {
521 let mut content = String::new();
522 let trimmed = text.trim();
523
524 if !trimmed.is_empty() {
525 content.push_str(trimmed);
526 content.push_str("\n\n");
527 }
528
529 for (index, data_uri) in data_uris.iter().enumerate() {
530 if index > 0 {
531 content.push('\n');
532 }
533 content.push_str(IMAGE_MARKER_PREFIX);
534 content.push_str(data_uri);
535 content.push(']');
536 }
537
538 content
539}
540
/// Outcome of resolving a batch of image references.
struct NormalizedImageReferences {
    /// Base64 data URIs of the images that loaded successfully, in input order.
    data_uris: Vec<String>,
    /// Number of references that failed to load and were skipped.
    skipped_count: usize,
}
545
/// Resolves each reference in `refs` to a data URI, skipping the ones that fail.
///
/// References are processed one after another. A failure never aborts the batch: it is
/// counted in `skipped_count` and reported through a WARN log record.
async fn normalize_image_references(
    refs: &[String],
    config: &MultimodalConfig,
    max_bytes: usize,
    remote_client: &Client,
) -> NormalizedImageReferences {
    let mut data_uris = Vec::with_capacity(refs.len());
    let mut skipped_count = 0usize;

    for reference in refs {
        match normalize_image_reference(reference, config, max_bytes, remote_client).await {
            Ok(data_uri) => data_uris.push(data_uri),
            Err(error) => {
                skipped_count += 1;
                // The reason has the raw reference replaced by `<source>`, and only the
                // reference kind (data/remote/local) is logged, not the reference itself.
                let error_reason = multimodal_error_reason(&error);
                ::zeroclaw_log::record!(
                    WARN,
                    ::zeroclaw_log::Event::new(module_path!(), ::zeroclaw_log::Action::Note)
                        .with_outcome(::zeroclaw_log::EventOutcome::Unknown)
                        .with_attrs(::serde_json::json!({
                            "source_kind": image_reference_kind(reference),
                            "error_kind": multimodal_error_kind(&error),
                            "reason": error_reason.as_deref().unwrap_or(""),
                        })),
                    "skipping multimodal image that could not be loaded"
                );
            }
        }
    }

    NormalizedImageReferences {
        data_uris,
        skipped_count,
    }
}
581
582fn compose_multimodal_content(
583 text: &str,
584 data_uris: &[String],
585 skipped_count: usize,
586 total_refs: usize,
587) -> String {
588 if skipped_count == 0 {
589 return compose_multimodal_message(text, data_uris);
590 }
591
592 let text_with_note = append_skipped_image_note(text, skipped_count, total_refs);
593 if data_uris.is_empty() {
594 text_with_note.trim().to_string()
595 } else {
596 compose_multimodal_message(&text_with_note, data_uris)
597 }
598}
599
/// Appends a `Note: ...` sentence about images that could not be loaded.
///
/// With `skipped_count == 0` the text is returned unchanged. Otherwise the trimmed text is
/// followed by a blank line and the note; the note says `N of M` unless every image failed.
fn append_skipped_image_note(text: &str, skipped_count: usize, total_refs: usize) -> String {
    if skipped_count == 0 {
        return text.to_string();
    }

    let body = text.trim();
    let note = if skipped_count != total_refs {
        format!("{skipped_count} of {total_refs} attached image(s) could not be loaded")
    } else {
        format!("{skipped_count} attached image(s) could not be loaded")
    };

    if body.is_empty() {
        return format!("Note: {note}.");
    }
    format!("{body}\n\nNote: {note}.")
}
619
/// Classifies a reference by its prefix: `"data"` for `data:` URIs, `"remote"` for
/// `http://` / `https://` URLs, and `"local"` for everything else.
fn image_reference_kind(reference: &str) -> &'static str {
    if reference.starts_with("data:") {
        return "data";
    }

    let is_remote = ["http://", "https://"]
        .iter()
        .any(|scheme| reference.starts_with(*scheme));
    if is_remote { "remote" } else { "local" }
}
629
630fn multimodal_error_kind(error: &anyhow::Error) -> &'static str {
631 match error.downcast_ref::<MultimodalError>() {
632 Some(MultimodalError::TooManyImages { .. }) => "too_many_images",
633 Some(MultimodalError::ImageTooLarge { .. }) => "image_too_large",
634 Some(MultimodalError::UnsupportedMime { .. }) => "unsupported_mime",
635 Some(MultimodalError::RemoteFetchDisabled { .. }) => "remote_fetch_disabled",
636 Some(MultimodalError::ImageSourceNotFound { .. }) => "image_source_not_found",
637 Some(MultimodalError::InvalidMarker { .. }) => "invalid_marker",
638 Some(MultimodalError::RemoteFetchFailed { .. }) => "remote_fetch_failed",
639 Some(MultimodalError::LocalReadFailed { .. }) => "local_read_failed",
640 None => "unknown",
641 }
642}
643
644fn multimodal_error_reason(error: &anyhow::Error) -> Option<String> {
645 match error.downcast_ref::<MultimodalError>() {
646 Some(MultimodalError::InvalidMarker { input, reason })
647 | Some(MultimodalError::RemoteFetchFailed { input, reason })
648 | Some(MultimodalError::LocalReadFailed { input, reason }) => {
649 Some(reason.replace(input, "<source>"))
650 }
651 _ => None,
652 }
653}
654
655async fn normalize_image_reference(
656 source: &str,
657 config: &MultimodalConfig,
658 max_bytes: usize,
659 remote_client: &Client,
660) -> anyhow::Result<String> {
661 if source.starts_with("data:") {
662 return normalize_data_uri(source, max_bytes);
663 }
664
665 if source.starts_with("http://") || source.starts_with("https://") {
666 if !config.allow_remote_fetch {
667 return Err(MultimodalError::RemoteFetchDisabled {
668 input: source.to_string(),
669 }
670 .into());
671 }
672
673 return normalize_remote_image(source, max_bytes, remote_client).await;
674 }
675
676 normalize_local_image(source, max_bytes).await
677}
678
679fn normalize_data_uri(source: &str, max_bytes: usize) -> anyhow::Result<String> {
680 let Some(comma_idx) = source.find(',') else {
681 return Err(MultimodalError::InvalidMarker {
682 input: source.to_string(),
683 reason: "expected data URI payload".to_string(),
684 }
685 .into());
686 };
687
688 let header = &source[..comma_idx];
689 let payload = source[comma_idx + 1..].trim();
690
691 if !header.contains(";base64") {
692 return Err(MultimodalError::InvalidMarker {
693 input: source.to_string(),
694 reason: "only base64 data URIs are supported".to_string(),
695 }
696 .into());
697 }
698
699 let mime = header
700 .trim_start_matches("data:")
701 .split(';')
702 .next()
703 .unwrap_or_default()
704 .trim()
705 .to_ascii_lowercase();
706
707 validate_mime(source, &mime)?;
708
709 let decoded = STANDARD
710 .decode(payload)
711 .map_err(|error| MultimodalError::InvalidMarker {
712 input: source.to_string(),
713 reason: format!("invalid base64 payload: {error}"),
714 })?;
715
716 validate_size(source, decoded.len(), max_bytes)?;
717
718 Ok(format!("data:{mime};base64,{}", STANDARD.encode(decoded)))
719}
720
721async fn normalize_remote_image(
722 source: &str,
723 max_bytes: usize,
724 remote_client: &Client,
725) -> anyhow::Result<String> {
726 let response = remote_client.get(source).send().await.map_err(|error| {
727 MultimodalError::RemoteFetchFailed {
728 input: source.to_string(),
729 reason: error.to_string(),
730 }
731 })?;
732
733 let status = response.status();
734 if !status.is_success() {
735 return Err(MultimodalError::RemoteFetchFailed {
736 input: source.to_string(),
737 reason: format!("HTTP {status}"),
738 }
739 .into());
740 }
741
742 if let Some(content_length) = response.content_length() {
743 let content_length = usize::try_from(content_length).unwrap_or(usize::MAX);
744 validate_size(source, content_length, max_bytes)?;
745 }
746
747 let content_type = response
748 .headers()
749 .get(reqwest::header::CONTENT_TYPE)
750 .and_then(|value| value.to_str().ok())
751 .map(ToString::to_string);
752
753 let bytes = response
754 .bytes()
755 .await
756 .map_err(|error| MultimodalError::RemoteFetchFailed {
757 input: source.to_string(),
758 reason: error.to_string(),
759 })?;
760
761 validate_size(source, bytes.len(), max_bytes)?;
762
763 let mime = detect_mime(None, bytes.as_ref(), content_type.as_deref()).ok_or_else(|| {
764 MultimodalError::UnsupportedMime {
765 input: source.to_string(),
766 mime: "unknown".to_string(),
767 }
768 })?;
769
770 validate_mime(source, &mime)?;
771
772 Ok(format!("data:{mime};base64,{}", STANDARD.encode(bytes)))
773}
774
775async fn normalize_local_image(source: &str, max_bytes: usize) -> anyhow::Result<String> {
776 let path = Path::new(source);
777 if !path.exists() || !path.is_file() {
778 return Err(MultimodalError::ImageSourceNotFound {
779 input: source.to_string(),
780 }
781 .into());
782 }
783
784 let metadata =
785 tokio::fs::metadata(path)
786 .await
787 .map_err(|error| MultimodalError::LocalReadFailed {
788 input: source.to_string(),
789 reason: error.to_string(),
790 })?;
791
792 validate_size(
793 source,
794 usize::try_from(metadata.len()).unwrap_or(usize::MAX),
795 max_bytes,
796 )?;
797
798 let bytes = tokio::fs::read(path)
799 .await
800 .map_err(|error| MultimodalError::LocalReadFailed {
801 input: source.to_string(),
802 reason: error.to_string(),
803 })?;
804
805 validate_size(source, bytes.len(), max_bytes)?;
806
807 let mime =
808 detect_mime(Some(path), &bytes, None).ok_or_else(|| MultimodalError::UnsupportedMime {
809 input: source.to_string(),
810 mime: "unknown".to_string(),
811 })?;
812
813 validate_mime(source, &mime)?;
814
815 Ok(format!("data:{mime};base64,{}", STANDARD.encode(bytes)))
816}
817
818fn validate_size(source: &str, size_bytes: usize, max_bytes: usize) -> anyhow::Result<()> {
819 if size_bytes > max_bytes {
820 return Err(MultimodalError::ImageTooLarge {
821 input: source.to_string(),
822 size_bytes,
823 max_bytes,
824 }
825 .into());
826 }
827
828 Ok(())
829}
830
831fn validate_mime(source: &str, mime: &str) -> anyhow::Result<()> {
832 if ALLOWED_IMAGE_MIME_TYPES.contains(&mime) {
833 return Ok(());
834 }
835
836 Err(MultimodalError::UnsupportedMime {
837 input: source.to_string(),
838 mime: mime.to_string(),
839 }
840 .into())
841}
842
843fn detect_mime(
844 path: Option<&Path>,
845 bytes: &[u8],
846 header_content_type: Option<&str>,
847) -> Option<String> {
848 if let Some(header_mime) = header_content_type.and_then(normalize_content_type) {
849 return Some(header_mime);
850 }
851
852 if let Some(path) = path
853 && let Some(ext) = path.extension().and_then(|value| value.to_str())
854 && let Some(mime) = mime_from_extension(ext)
855 {
856 return Some(mime.to_string());
857 }
858
859 mime_from_magic(bytes).map(ToString::to_string)
860}
861
/// Extracts the bare media type from a `Content-Type` header value.
///
/// Parameters after the first `;` are dropped, and the rest is trimmed and lower-cased.
/// Returns `None` when nothing remains.
fn normalize_content_type(content_type: &str) -> Option<String> {
    let essence = content_type.split(';').next()?.trim();
    (!essence.is_empty()).then(|| essence.to_ascii_lowercase())
}
866
/// Maps a file extension (without the dot, ASCII case-insensitive) to its image MIME type.
///
/// Returns `None` for extensions outside the supported set.
fn mime_from_extension(ext: &str) -> Option<&'static str> {
    const EXTENSION_MIME: &[(&str, &str)] = &[
        ("png", "image/png"),
        ("jpg", "image/jpeg"),
        ("jpeg", "image/jpeg"),
        ("webp", "image/webp"),
        ("gif", "image/gif"),
        ("bmp", "image/bmp"),
    ];

    EXTENSION_MIME
        .iter()
        .find(|(known, _)| ext.eq_ignore_ascii_case(known))
        .map(|&(_, mime)| mime)
}
877
/// Sniffs an image MIME type from the leading bytes of a file.
///
/// Recognizes the PNG signature, the JPEG `FF D8 FF` prefix, `GIF87a` / `GIF89a`, a `RIFF`
/// container whose bytes 8..12 are `WEBP`, and the `BM` bitmap prefix. Returns `None` for
/// anything else, including inputs too short to hold a signature.
fn mime_from_magic(bytes: &[u8]) -> Option<&'static str> {
    const PNG_SIGNATURE: &[u8] = &[0x89, b'P', b'N', b'G', b'\r', b'\n', 0x1a, b'\n'];
    const JPEG_PREFIX: &[u8] = &[0xff, 0xd8, 0xff];

    let is_gif = bytes.starts_with(b"GIF87a") || bytes.starts_with(b"GIF89a");
    // `get` yields `None` for inputs shorter than 12 bytes, so no explicit length check.
    let is_webp = bytes.starts_with(b"RIFF") && bytes.get(8..12) == Some(&b"WEBP"[..]);

    if bytes.starts_with(PNG_SIGNATURE) {
        Some("image/png")
    } else if bytes.starts_with(JPEG_PREFIX) {
        Some("image/jpeg")
    } else if is_gif {
        Some("image/gif")
    } else if is_webp {
        Some("image/webp")
    } else if bytes.starts_with(b"BM") {
        Some("image/bmp")
    } else {
        None
    }
}
901
902#[cfg(test)]
903mod tests {
904 use super::*;
905
    // An IMAGE marker holding a local file path is replaced by the generic placeholder.
    #[test]
    fn strip_media_markers_replaces_image_local_path() {
        let input = "Look at [IMAGE:/zeroclaw-data/workspace/telegram_files/photo_1.jpg]";
        assert_eq!(strip_media_markers(input), "Look at [media attachment]");
    }
911
    // An IMAGE marker holding an inline data URI is replaced just like a path marker.
    #[test]
    fn strip_media_markers_replaces_image_data_uri() {
        let input = "Inline [IMAGE:data:image/png;base64,abcd]";
        assert_eq!(strip_media_markers(input), "Inline [media attachment]");
    }
917
    // Each of the seven marker kinds (IMAGE, PHOTO, DOCUMENT, FILE, VIDEO, VOICE, AUDIO) is
    // replaced independently; the separating spaces are preserved.
    #[test]
    fn strip_media_markers_replaces_all_supported_kinds() {
        let input = "[IMAGE:/a.jpg] [PHOTO:/b.jpg] [DOCUMENT:/c.pdf] [FILE:/d.zip] [VIDEO:/e.mp4] [VOICE:/f.ogg] [AUDIO:/g.wav]";
        let expected = "[media attachment] [media attachment] [media attachment] [media attachment] [media attachment] [media attachment] [media attachment]";
        assert_eq!(strip_media_markers(input), expected);
    }
927
    // Marker kinds are matched regardless of letter case.
    #[test]
    fn strip_media_markers_is_case_insensitive() {
        let input = "[image:/a.jpg] [Photo:/b.jpg] [video:/c.mp4]";
        let expected = "[media attachment] [media attachment] [media attachment]";
        assert_eq!(strip_media_markers(input), expected);
    }
938
    // Text without media markers, including ordinary brackets, comes back unchanged.
    #[test]
    fn strip_media_markers_leaves_plain_text_untouched() {
        let input = "No markers here, just text with [brackets] and (parens).";
        assert_eq!(strip_media_markers(input), input);
    }
944
    // Bracketed tags with other keywords (TODO, NOTE) are not media markers and must survive.
    #[test]
    fn strip_media_markers_preserves_unrelated_brackets() {
        let input = "Use [TODO:foo] and [NOTE:bar] but replace [IMAGE:/x.jpg]";
        assert_eq!(
            strip_media_markers(input),
            "Use [TODO:foo] and [NOTE:bar] but replace [media attachment]"
        );
    }
954
    // Two markers (a local path and an https URL) are both extracted, in order, and removed
    // from the cleaned text.
    #[test]
    fn parse_image_markers_extracts_multiple_markers() {
        let input = "Check this [IMAGE:/tmp/a.png] and this [IMAGE:https://example.com/b.jpg]";
        let (cleaned, refs) = parse_image_markers(input);

        assert_eq!(cleaned, "Check this and this");
        assert_eq!(refs.len(), 2);
        assert_eq!(refs[0], "/tmp/a.png");
        assert_eq!(refs[1], "https://example.com/b.jpg");
    }
965
    // A path that was hard-wrapped mid-word (newline plus continuation indent) is re-joined
    // into the original single-line path.
    #[test]
    fn parse_image_markers_collapses_line_wrapped_path() {
        let input = "from the logs whether the agent emits\n [IMAGE:/home/zeroclaw_user/.zeroclaw/workspace/signal_i\n nbound/attachment.jpg] (which the\n channel resolves)";
        let (_, refs) = parse_image_markers(input);
        assert_eq!(refs.len(), 1);
        assert_eq!(
            refs[0],
            "/home/zeroclaw_user/.zeroclaw/workspace/signal_inbound/attachment.jpg"
        );
    }
978
    // Documentation-style placeholders (`...`, `<path>`, a bare file name) do not look like
    // loadable references, so they yield no refs and remain in the text verbatim.
    #[test]
    fn parse_image_markers_leaves_placeholder_markers_as_literal_text() {
        let input = "example: `[IMAGE:...]` or `[IMAGE:<path>]` or `[IMAGE:example.png]`";
        let (cleaned, refs) = parse_image_markers(input);
        assert!(
            refs.is_empty(),
            "no placeholder should be treated as a loadable ref, got: {refs:?}"
        );
        assert!(cleaned.contains("[IMAGE:...]"));
        assert!(cleaned.contains("[IMAGE:<path>]"));
        assert!(cleaned.contains("[IMAGE:example.png]"));
    }
995
    // Spaces inside a single-line path are part of the path and must not be collapsed.
    #[test]
    fn parse_image_markers_preserves_spaces_in_path() {
        let input = "look at [IMAGE:/tmp/my photos/beetle.png] please";
        let (_, refs) = parse_image_markers(input);
        assert_eq!(refs.len(), 1);
        assert_eq!(refs[0], "/tmp/my photos/beetle.png");
    }
1005
    // A marker with an empty reference is not extracted and stays in the text unchanged.
    #[test]
    fn parse_image_markers_keeps_invalid_empty_marker() {
        let input = "hello [IMAGE:] world";
        let (cleaned, refs) = parse_image_markers(input);

        assert_eq!(cleaned, "hello [IMAGE:] world");
        assert!(refs.is_empty());
    }
1014
    // End-to-end: a user message pointing at a local PNG is rewritten so that its marker
    // carries a base64 data URI while the surrounding text is preserved.
    #[tokio::test]
    async fn prepare_messages_normalizes_local_image_to_data_uri() {
        let temp = tempfile::tempdir().unwrap();
        let image_path = temp.path().join("sample.png");

        // The file holds only the 8-byte PNG signature; the extension already selects the MIME.
        std::fs::write(
            &image_path,
            [0x89, b'P', b'N', b'G', b'\r', b'\n', 0x1a, b'\n'],
        )
        .unwrap();

        let messages = vec![ChatMessage::user(format!(
            "Please inspect this screenshot [IMAGE:{}]",
            image_path.display()
        ))];

        let prepared = prepare_messages_for_provider(&messages, &MultimodalConfig::default())
            .await
            .unwrap();

        assert!(prepared.contains_images);
        assert_eq!(prepared.messages.len(), 1);

        let (cleaned, refs) = parse_image_markers(&prepared.messages[0].content);
        assert_eq!(cleaned, "Please inspect this screenshot");
        assert_eq!(refs.len(), 1);
        assert!(refs[0].starts_with("data:image/png;base64,"));
    }
1044
    // A trailing `tool` message with plain-text (non-JSON) content also has its local image
    // marker rewritten to a data URI; role and surrounding text are kept.
    #[tokio::test]
    async fn prepare_messages_normalizes_tool_message_local_image_to_data_uri() {
        let temp = tempfile::tempdir().unwrap();
        let image_path = temp.path().join("tool-sample.png");

        std::fs::write(
            &image_path,
            [0x89, b'P', b'N', b'G', b'\r', b'\n', 0x1a, b'\n'],
        )
        .unwrap();

        let messages = vec![ChatMessage::tool(format!(
            "<tool_result name=\"image_gen\">\nGenerated image [IMAGE:{}]\n</tool_result>",
            image_path.display()
        ))];

        let prepared = prepare_messages_for_provider(&messages, &MultimodalConfig::default())
            .await
            .unwrap();

        assert!(prepared.contains_images);
        assert_eq!(prepared.messages.len(), 1);
        assert_eq!(prepared.messages[0].role, "tool");

        let (cleaned, refs) = parse_image_markers(&prepared.messages[0].content);
        assert!(cleaned.contains("<tool_result name=\"image_gen\">"));
        assert!(cleaned.contains("Generated image"));
        assert_eq!(refs.len(), 1);
        assert!(refs[0].starts_with("data:image/png;base64,"));
    }
1080
    // A `tool` message whose content is a JSON envelope (`tool_call_id` + `content`) must
    // remain valid JSON after preprocessing: only the inner `content` string is rewritten.
    #[tokio::test]
    async fn prepare_messages_preserves_native_tool_result_json_shape() {
        let temp = tempfile::tempdir().unwrap();
        let image_path = temp.path().join("native-tool-result.png");
        std::fs::write(
            &image_path,
            [0x89, b'P', b'N', b'G', b'\r', b'\n', 0x1a, b'\n'],
        )
        .unwrap();

        let native_tool_content = serde_json::json!({
            "tool_call_id": "tc1",
            "content": format!("see attached [IMAGE:{}]", image_path.display().to_string()),
        })
        .to_string();

        let messages = vec![ChatMessage::tool(native_tool_content)];

        let prepared = prepare_messages_for_provider(&messages, &MultimodalConfig::default())
            .await
            .expect("preparation should succeed for native tool-result JSON");

        assert!(prepared.contains_images);
        assert_eq!(prepared.messages.len(), 1);
        assert_eq!(prepared.messages[0].role, "tool");

        let value: serde_json::Value = serde_json::from_str(&prepared.messages[0].content)
            .expect("prepared tool message must remain valid JSON");

        assert_eq!(
            value.get("tool_call_id").and_then(|v| v.as_str()),
            Some("tc1"),
            "tool_call_id must survive multimodal preprocessing unchanged"
        );

        let inner = value
            .get("content")
            .and_then(|v| v.as_str())
            .expect("content must remain a JSON string");
        assert!(
            inner.contains("see attached"),
            "surrounding text in tool content should survive normalization"
        );
        assert!(
            inner.contains("data:image/png;base64,"),
            "local image path inside tool content should be rewritten to a data URI"
        );
        assert!(
            !inner.contains("native-tool-result.png"),
            "raw local path must not leak after normalization"
        );
    }
1144
    // When the only image in a JSON tool envelope cannot be loaded, preparation still
    // succeeds: the envelope stays valid JSON, the marker and URL are removed, and a note
    // about the skipped image is appended to the inner text.
    // NOTE(review): the remote URL is presumably skipped because the default config does not
    // allow remote fetching — confirm against `MultimodalConfig::default()`.
    #[tokio::test]
    async fn prepare_messages_preserves_native_tool_json_when_image_is_skipped() {
        let native_tool_content = serde_json::json!({
            "tool_call_id": "tc1",
            "content": "generated screenshot [IMAGE:https://example.com/missing.png]",
        })
        .to_string();

        let prepared = prepare_messages_for_provider(
            &[ChatMessage::tool(native_tool_content)],
            &MultimodalConfig::default(),
        )
        .await
        .expect("skipped native tool image should not fail message preparation");

        assert!(!prepared.contains_images);
        assert_eq!(prepared.messages.len(), 1);

        let value: serde_json::Value = serde_json::from_str(&prepared.messages[0].content)
            .expect("native tool result must remain valid JSON");
        assert_eq!(
            value.get("tool_call_id").and_then(|v| v.as_str()),
            Some("tc1")
        );

        let inner = value
            .get("content")
            .and_then(|v| v.as_str())
            .expect("content should remain a JSON string");
        assert!(inner.contains("generated screenshot"));
        assert!(inner.contains("1 attached image(s) could not be loaded"));
        assert!(!inner.contains("[IMAGE:"));
        assert!(!inner.contains("https://example.com/missing.png"));
    }
1179
1180 #[tokio::test]
1181 async fn prepare_messages_preserves_native_tool_json_with_mixed_images() {
1182 let temp = tempfile::tempdir().unwrap();
1183 let image_path = temp.path().join("mixed-native-tool-result.png");
1184 std::fs::write(
1185 &image_path,
1186 [0x89, b'P', b'N', b'G', b'\r', b'\n', 0x1a, b'\n'],
1187 )
1188 .unwrap();
1189
1190 let native_tool_content = serde_json::json!({
1191 "tool_call_id": "tc1",
1192 "content": format!(
1193 "generated [IMAGE:{}] and [IMAGE:https://example.com/missing.png]",
1194 image_path.display()
1195 ),
1196 })
1197 .to_string();
1198
1199 let prepared = prepare_messages_for_provider(
1200 &[ChatMessage::tool(native_tool_content)],
1201 &MultimodalConfig::default(),
1202 )
1203 .await
1204 .expect("valid native tool image should survive while bad ref is skipped");
1205
1206 assert!(prepared.contains_images);
1207 assert_eq!(prepared.messages.len(), 1);
1208
1209 let value: serde_json::Value = serde_json::from_str(&prepared.messages[0].content)
1210 .expect("native tool result must remain valid JSON");
1211 assert_eq!(
1212 value.get("tool_call_id").and_then(|v| v.as_str()),
1213 Some("tc1")
1214 );
1215
1216 let inner = value
1217 .get("content")
1218 .and_then(|v| v.as_str())
1219 .expect("content should remain a JSON string");
1220 assert!(inner.contains("generated"));
1221 assert!(inner.contains("data:image/png;base64,"));
1222 assert!(inner.contains("1 of 2 attached image(s) could not be loaded"));
1223 assert!(!inner.contains("mixed-native-tool-result.png"));
1224 assert!(!inner.contains("https://example.com/missing.png"));
1225 }
1226
1227 #[tokio::test]
1228 async fn prepare_messages_strips_stale_native_tool_result_images() {
1229 let temp = tempfile::tempdir().unwrap();
1230 let image_path = temp.path().join("stale-native-tool-result.png");
1231 std::fs::write(
1232 &image_path,
1233 [0x89, b'P', b'N', b'G', b'\r', b'\n', 0x1a, b'\n'],
1234 )
1235 .unwrap();
1236
1237 let native_tool_content = serde_json::json!({
1238 "tool_call_id": "tc1",
1239 "content": format!("generated screenshot [IMAGE:{}]", image_path.display().to_string()),
1240 })
1241 .to_string();
1242
1243 let messages = vec![
1244 ChatMessage::tool(native_tool_content),
1245 ChatMessage {
1246 role: "assistant".to_string(),
1247 content: "I generated the screenshot.".to_string(),
1248 },
1249 ChatMessage::user("What happened next?".to_string()),
1250 ];
1251
1252 let prepared = prepare_messages_for_provider(&messages, &MultimodalConfig::default())
1253 .await
1254 .expect("preparation should strip stale tool images without loading them");
1255
1256 assert!(
1257 !prepared.contains_images,
1258 "stale tool-result images should not keep the request in vision mode"
1259 );
1260
1261 let value: serde_json::Value = serde_json::from_str(&prepared.messages[0].content)
1262 .expect("stale native tool result should remain valid JSON");
1263 assert_eq!(
1264 value.get("tool_call_id").and_then(|v| v.as_str()),
1265 Some("tc1")
1266 );
1267
1268 let inner = value
1269 .get("content")
1270 .and_then(|v| v.as_str())
1271 .expect("content should remain a JSON string");
1272 assert!(inner.contains("generated screenshot"));
1273 assert!(!inner.contains("[IMAGE:"));
1274 assert!(!inner.contains("data:image"));
1275 assert!(!inner.contains("stale-native-tool-result.png"));
1276 }
1277
1278 #[tokio::test]
1279 async fn prepare_messages_strips_stale_prompt_tool_result_images() {
1280 let temp = tempfile::tempdir().unwrap();
1281 let image_path = temp.path().join("stale-prompt-tool-result.png");
1282 std::fs::write(
1283 &image_path,
1284 [0x89, b'P', b'N', b'G', b'\r', b'\n', 0x1a, b'\n'],
1285 )
1286 .unwrap();
1287
1288 let messages = vec![
1289 ChatMessage::user(format!(
1290 "[Tool results]\n<tool_result name=\"image_gen\">Generated [IMAGE:{}]</tool_result>",
1291 image_path.display()
1292 )),
1293 ChatMessage {
1294 role: "assistant".to_string(),
1295 content: "I generated the screenshot.".to_string(),
1296 },
1297 ChatMessage::user("Continue.".to_string()),
1298 ];
1299
1300 let prepared = prepare_messages_for_provider(&messages, &MultimodalConfig::default())
1301 .await
1302 .expect("preparation should strip stale prompt-mode tool images");
1303
1304 assert!(!prepared.contains_images);
1305 assert!(prepared.messages[0].content.contains("[Tool results]"));
1306 assert!(prepared.messages[0].content.contains("Generated"));
1307 assert!(!prepared.messages[0].content.contains("[IMAGE:"));
1308 assert!(!prepared.messages[0].content.contains("data:image"));
1309 assert!(
1310 !prepared.messages[0]
1311 .content
1312 .contains("stale-prompt-tool-result.png")
1313 );
1314 }
1315
1316 #[tokio::test]
1317 async fn prepare_messages_strips_stale_tool_image_while_normalizing_current_user_image() {
1318 let temp = tempfile::tempdir().unwrap();
1319 let stale_path = temp.path().join("stale-tool-result.png");
1320 let fresh_path = temp.path().join("fresh-user-image.png");
1321 let png = [0x89, b'P', b'N', b'G', b'\r', b'\n', 0x1a, b'\n'];
1322 std::fs::write(&stale_path, png).unwrap();
1323 std::fs::write(&fresh_path, png).unwrap();
1324
1325 let native_tool_content = serde_json::json!({
1326 "tool_call_id": "tc1",
1327 "content": format!("generated screenshot [IMAGE:{}]", stale_path.display().to_string()),
1328 })
1329 .to_string();
1330
1331 let messages = vec![
1332 ChatMessage::tool(native_tool_content),
1333 ChatMessage {
1334 role: "assistant".to_string(),
1335 content: "I generated the screenshot.".to_string(),
1336 },
1337 ChatMessage::user(format!(
1338 "Now inspect this [IMAGE:{}]",
1339 fresh_path.display().to_string()
1340 )),
1341 ];
1342
1343 let prepared = prepare_messages_for_provider(&messages, &MultimodalConfig::default())
1344 .await
1345 .expect("preparation should strip stale tool images and normalize current user image");
1346
1347 assert!(prepared.contains_images);
1348
1349 let value: serde_json::Value = serde_json::from_str(&prepared.messages[0].content)
1350 .expect("stale native tool result should remain valid JSON");
1351 let inner = value
1352 .get("content")
1353 .and_then(|v| v.as_str())
1354 .expect("content should remain a JSON string");
1355 assert!(inner.contains("generated screenshot"));
1356 assert!(!inner.contains("[IMAGE:"));
1357 assert!(!inner.contains("data:image"));
1358 assert!(!inner.contains("stale-tool-result.png"));
1359
1360 let (cleaned, refs) = parse_image_markers(&prepared.messages[2].content);
1361 assert_eq!(cleaned, "Now inspect this");
1362 assert_eq!(refs.len(), 1);
1363 assert!(refs[0].starts_with("data:image/png;base64,"));
1364 assert!(
1365 !prepared.messages[2]
1366 .content
1367 .contains("fresh-user-image.png")
1368 );
1369 }
1370
1371 #[test]
1372 fn count_image_markers_ignores_stale_tool_results() {
1373 let messages = vec![
1374 ChatMessage::tool("[IMAGE:/tmp/stale-tool.png]\nGenerated".to_string()),
1375 ChatMessage {
1376 role: "assistant".to_string(),
1377 content: "Done.".to_string(),
1378 },
1379 ChatMessage::user("Next question".to_string()),
1380 ];
1381
1382 assert_eq!(count_image_markers(&messages), 0);
1383
1384 let messages = vec![
1385 ChatMessage::user("Create an image".to_string()),
1386 ChatMessage::tool("[IMAGE:/tmp/latest-tool.png]\nGenerated".to_string()),
1387 ];
1388
1389 assert_eq!(count_image_markers(&messages), 1);
1390 }
1391
1392 #[tokio::test]
1393 async fn prepare_messages_trims_excess_images_from_older_messages() {
1394 let messages = vec![
1397 ChatMessage::user("[IMAGE:/tmp/old.png]\nOld caption".to_string()),
1398 ChatMessage::user("[IMAGE:/tmp/mid.png]\nMid caption".to_string()),
1399 ChatMessage::user("[IMAGE:/tmp/new.png]\nNew caption".to_string()),
1400 ];
1401
1402 let trimmed = trim_old_images(&messages, 2);
1407 assert_eq!(trimmed.len(), 3);
1408
1409 let (_, refs0) = parse_image_markers(&trimmed[0].content);
1411 assert!(refs0.is_empty(), "oldest image should be stripped");
1412 assert!(trimmed[0].content.contains("Old caption"));
1413
1414 let (_, refs1) = parse_image_markers(&trimmed[1].content);
1416 assert_eq!(refs1.len(), 1);
1417 let (_, refs2) = parse_image_markers(&trimmed[2].content);
1418 assert_eq!(refs2.len(), 1);
1419 }
1420
1421 #[test]
1422 fn trim_old_images_replaces_image_only_message() {
1423 let messages = vec![
1425 ChatMessage::user("[IMAGE:/tmp/old.png]".to_string()),
1426 ChatMessage::user("[IMAGE:/tmp/new.png]\nKeep this".to_string()),
1427 ];
1428
1429 let trimmed = trim_old_images(&messages, 1);
1430 assert_eq!(trimmed[0].content, "[image removed from history]");
1431 assert!(trimmed[1].content.contains("[IMAGE:/tmp/new.png]"));
1432 }
1433
1434 #[test]
1435 fn trim_old_images_multi_image_message_stripped_as_unit() {
1436 let messages = vec![
1441 ChatMessage::user(
1442 "[IMAGE:/tmp/a.png]\n[IMAGE:/tmp/b.png]\n[IMAGE:/tmp/c.png]\nThree pics"
1443 .to_string(),
1444 ),
1445 ChatMessage::user("Just text, no images".to_string()),
1446 ];
1447
1448 let trimmed = trim_old_images(&messages, 1);
1449 assert_eq!(trimmed.len(), 2);
1450 let (_, refs0) = parse_image_markers(&trimmed[0].content);
1452 assert!(refs0.is_empty());
1453 assert!(trimmed[0].content.contains("Three pics"));
1454 assert_eq!(trimmed[1].content, "Just text, no images");
1456 }
1457
1458 #[test]
1459 fn trim_old_images_skips_assistant_messages() {
1460 let messages = vec![
1462 ChatMessage {
1463 role: "assistant".to_string(),
1464 content: "[IMAGE:/tmp/assistant.png]\nAssistant generated".to_string(),
1465 },
1466 ChatMessage::user("[IMAGE:/tmp/user1.png]\nFirst".to_string()),
1467 ChatMessage::user("[IMAGE:/tmp/user2.png]\nSecond".to_string()),
1468 ];
1469
1470 let trimmed = trim_old_images(&messages, 1);
1471 assert!(trimmed[0].content.contains("[IMAGE:/tmp/assistant.png]"));
1473 let (_, refs1) = parse_image_markers(&trimmed[1].content);
1475 assert!(refs1.is_empty());
1476 assert!(trimmed[1].content.contains("First"));
1477 let (_, refs2) = parse_image_markers(&trimmed[2].content);
1479 assert_eq!(refs2.len(), 1);
1480 }
1481
1482 #[test]
1483 fn trim_old_images_counts_latest_tool_messages() {
1484 let messages = vec![
1485 ChatMessage::user("[IMAGE:/tmp/user-old.png]\nOldest".to_string()),
1486 ChatMessage::tool("[IMAGE:/tmp/tool-new.png]\nGenerated".to_string()),
1487 ];
1488
1489 let trimmed = trim_old_images(&messages, 1);
1490 let (_, refs0) = parse_image_markers(&trimmed[0].content);
1491 assert!(refs0.is_empty(), "oldest user image should be stripped");
1492 assert!(trimmed[0].content.contains("Oldest"));
1493
1494 let (_, refs1) = parse_image_markers(&trimmed[1].content);
1495 assert_eq!(refs1.len(), 1);
1496 }
1497
1498 #[test]
1499 fn trim_old_images_no_trimming_when_under_limit() {
1500 let messages = vec![
1501 ChatMessage::user("[IMAGE:/tmp/a.png]\nCaption A".to_string()),
1502 ChatMessage::user("[IMAGE:/tmp/b.png]\nCaption B".to_string()),
1503 ];
1504
1505 let trimmed = trim_old_images(&messages, 5);
1506 assert_eq!(trimmed[0].content, messages[0].content);
1508 assert_eq!(trimmed[1].content, messages[1].content);
1509 }
1510
1511 #[test]
1512 fn trim_old_images_no_trimming_when_exactly_at_limit() {
1513 let messages = vec![
1514 ChatMessage::user("[IMAGE:/tmp/a.png]\nA".to_string()),
1515 ChatMessage::user("[IMAGE:/tmp/b.png]\nB".to_string()),
1516 ];
1517
1518 let trimmed = trim_old_images(&messages, 2);
1519 assert_eq!(trimmed[0].content, messages[0].content);
1520 assert_eq!(trimmed[1].content, messages[1].content);
1521 }
1522
1523 #[test]
1524 fn trim_old_images_empty_messages() {
1525 let trimmed = trim_old_images(&[], 4);
1526 assert!(trimmed.is_empty());
1527 }
1528
1529 #[test]
1530 fn trim_old_images_interleaved_roles() {
1531 let messages = vec![
1534 ChatMessage::user("[IMAGE:/tmp/1.png]\nLook at this".to_string()),
1535 ChatMessage {
1536 role: "assistant".to_string(),
1537 content: "I see a photo.".to_string(),
1538 },
1539 ChatMessage::user("[IMAGE:/tmp/2.png]\nWhat about this?".to_string()),
1540 ChatMessage {
1541 role: "assistant".to_string(),
1542 content: "That's a chart.".to_string(),
1543 },
1544 ChatMessage::user("[IMAGE:/tmp/3.png]\nAnd this one".to_string()),
1545 ];
1546
1547 let trimmed = trim_old_images(&messages, 2);
1548 assert_eq!(trimmed.len(), 5);
1549 let (_, refs0) = parse_image_markers(&trimmed[0].content);
1551 assert!(refs0.is_empty());
1552 assert!(trimmed[0].content.contains("Look at this"));
1553 assert_eq!(trimmed[1].content, "I see a photo.");
1555 assert_eq!(trimmed[3].content, "That's a chart.");
1556 let (_, refs2) = parse_image_markers(&trimmed[2].content);
1558 assert_eq!(refs2.len(), 1);
1559 let (_, refs4) = parse_image_markers(&trimmed[4].content);
1560 assert_eq!(refs4.len(), 1);
1561 }
1562
1563 #[test]
1564 fn trim_old_images_strips_multiple_oldest_messages() {
1565 let messages: Vec<ChatMessage> = (1..=5)
1567 .map(|i| ChatMessage::user(format!("[IMAGE:/tmp/{i}.png]\nCaption {i}")))
1568 .collect();
1569
1570 let trimmed = trim_old_images(&messages, 1);
1571 assert_eq!(trimmed.len(), 5);
1572 for (i, msg) in trimmed.iter().enumerate().take(4) {
1573 let (_, refs) = parse_image_markers(&msg.content);
1574 assert!(refs.is_empty(), "message {i} should have images stripped");
1575 assert!(msg.content.contains(&format!("Caption {}", i + 1)));
1576 }
1577 let (_, refs_last) = parse_image_markers(&trimmed[4].content);
1579 assert_eq!(refs_last.len(), 1);
1580 }
1581
1582 #[tokio::test]
1583 async fn prepare_messages_trims_then_normalizes_surviving_images() {
1584 let temp = tempfile::tempdir().unwrap();
1587 let mut paths = Vec::new();
1588 for name in ["old.png", "mid.png", "new.png"] {
1589 let p = temp.path().join(name);
1590 let png_data = [
1592 0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A, 0x00, 0x00, 0x00, 0x0D, 0x49, 0x48, 0x44, 0x52, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x08, 0x02, 0x00, 0x00, 0x00, 0x90,
1595 0x77, 0x53, 0xDE, 0x00, 0x00, 0x00, 0x0C, 0x49, 0x44, 0x41, 0x54, 0x08, 0xD7, 0x63, 0xF8, 0xCF, 0xC0, 0x00, 0x00, 0x00, 0x02, 0x00, 0x01, 0xE2, 0x21,
1598 0xBC, 0x33, 0x00, 0x00, 0x00, 0x00, 0x49, 0x45, 0x4E, 0x44, 0xAE, 0x42, 0x60, 0x82,
1601 ];
1602 std::fs::write(&p, png_data).unwrap();
1603 paths.push(p);
1604 }
1605
1606 let messages = vec![
1607 ChatMessage::user(format!("[IMAGE:{}]\nOld", paths[0].display().to_string())),
1608 ChatMessage::user(format!("[IMAGE:{}]\nMid", paths[1].display().to_string())),
1609 ChatMessage::user(format!("[IMAGE:{}]\nNew", paths[2].display().to_string())),
1610 ];
1611
1612 let config = MultimodalConfig {
1613 max_images: 2,
1614 max_image_size_mb: 5,
1615 allow_remote_fetch: false,
1616 ..Default::default()
1617 };
1618
1619 let result = prepare_messages_for_provider(&messages, &config)
1620 .await
1621 .expect("should succeed after trimming");
1622
1623 assert!(result.contains_images);
1624 assert_eq!(result.messages.len(), 3);
1625 assert!(!result.messages[0].content.contains("data:image"));
1627 assert!(result.messages[0].content.contains("Old"));
1628 assert!(result.messages[1].content.contains("data:image"));
1630 assert!(result.messages[2].content.contains("data:image"));
1631 }
1632
1633 #[tokio::test]
1634 async fn prepare_messages_skips_remote_url_when_disabled() {
1635 let messages = vec![ChatMessage::user(
1636 "Look [IMAGE:https://example.com/img.png]".to_string(),
1637 )];
1638
1639 let result = prepare_messages_for_provider(&messages, &MultimodalConfig::default())
1640 .await
1641 .expect("disabled remote image should be skipped");
1642
1643 assert!(!result.contains_images);
1644 assert_eq!(result.messages.len(), 1);
1645 assert!(result.messages[0].content.contains("Look"));
1646 assert!(
1647 result.messages[0]
1648 .content
1649 .contains("1 attached image(s) could not be loaded")
1650 );
1651 assert!(
1652 !result.messages[0]
1653 .content
1654 .contains("https://example.com/img.png")
1655 );
1656 }
1657
1658 #[tokio::test]
1659 async fn prepare_messages_skips_oversized_local_image() {
1660 let temp = tempfile::tempdir().unwrap();
1661 let image_path = temp.path().join("big.png");
1662
1663 let bytes = vec![0u8; 1024 * 1024 + 1];
1664 std::fs::write(&image_path, bytes).unwrap();
1665
1666 let messages = vec![ChatMessage::user(format!(
1667 "[IMAGE:{}]",
1668 image_path.display()
1669 ))];
1670 let config = MultimodalConfig {
1671 max_images: 4,
1672 max_image_size_mb: 1,
1673 allow_remote_fetch: false,
1674 ..Default::default()
1675 };
1676
1677 let result = prepare_messages_for_provider(&messages, &config)
1678 .await
1679 .expect("oversized local image should be skipped");
1680
1681 assert!(!result.contains_images);
1682 assert_eq!(result.messages.len(), 1);
1683 assert!(
1684 result.messages[0]
1685 .content
1686 .contains("1 attached image(s) could not be loaded")
1687 );
1688 assert!(
1689 !result.messages[0]
1690 .content
1691 .contains(image_path.to_string_lossy().as_ref())
1692 );
1693 }
1694
1695 #[tokio::test]
1696 async fn prepare_messages_keeps_successful_images_when_some_are_skipped() {
1697 let temp = tempfile::tempdir().unwrap();
1698 let image_path = temp.path().join("ok.png");
1699 std::fs::write(
1700 &image_path,
1701 [0x89, b'P', b'N', b'G', b'\r', b'\n', 0x1a, b'\n'],
1702 )
1703 .unwrap();
1704
1705 let messages = vec![ChatMessage::user(format!(
1706 "Look [IMAGE:{}] and [IMAGE:https://example.com/missing.png]",
1707 image_path.display()
1708 ))];
1709
1710 let result = prepare_messages_for_provider(&messages, &MultimodalConfig::default())
1711 .await
1712 .expect("valid local image should survive while remote image is skipped");
1713
1714 assert!(result.contains_images);
1715 assert!(
1716 result.messages[0]
1717 .content
1718 .contains("data:image/png;base64,")
1719 );
1720 assert!(
1721 result.messages[0]
1722 .content
1723 .contains("1 of 2 attached image(s) could not be loaded")
1724 );
1725 assert!(
1726 !result.messages[0]
1727 .content
1728 .contains("https://example.com/missing.png")
1729 );
1730 }
1731
1732 #[tokio::test]
1733 async fn skipped_images_do_not_consume_image_budget() {
1734 let temp = tempfile::tempdir().unwrap();
1735 let image_path = temp.path().join("older-valid.png");
1736 std::fs::write(
1737 &image_path,
1738 [0x89, b'P', b'N', b'G', b'\r', b'\n', 0x1a, b'\n'],
1739 )
1740 .unwrap();
1741
1742 let messages = vec![
1743 ChatMessage::user(format!(
1744 "Older valid image [IMAGE:{}]",
1745 image_path.display()
1746 )),
1747 ChatMessage::user(
1748 "Newer broken image [IMAGE:https://example.com/missing.png]".to_string(),
1749 ),
1750 ];
1751 let config = MultimodalConfig {
1752 max_images: 1,
1753 max_image_size_mb: 5,
1754 allow_remote_fetch: false,
1755 ..Default::default()
1756 };
1757
1758 let result = prepare_messages_for_provider(&messages, &config)
1759 .await
1760 .expect("broken image should not evict an older valid image");
1761
1762 assert!(result.contains_images);
1763 assert!(
1764 result.messages[0]
1765 .content
1766 .contains("data:image/png;base64,")
1767 );
1768 assert!(result.messages[1].content.contains("Newer broken image"));
1769 assert!(
1770 result.messages[1]
1771 .content
1772 .contains("1 attached image(s) could not be loaded")
1773 );
1774 assert!(
1775 !result.messages[1]
1776 .content
1777 .contains("https://example.com/missing.png")
1778 );
1779 }
1780
1781 #[test]
1782 fn extract_ollama_image_payload_supports_data_uris() {
1783 let payload = extract_ollama_image_payload("data:image/png;base64,abcd==")
1784 .expect("payload should be extracted");
1785 assert_eq!(payload, "abcd==");
1786 }
1787
1788 #[test]
1791 fn parse_image_markers_strips_markers_leaving_caption() {
1792 let input = "[IMAGE:/tmp/photo.jpg]\n\nDescribe this screenshot";
1793 let (cleaned, refs) = parse_image_markers(input);
1794 assert_eq!(cleaned, "Describe this screenshot");
1795 assert_eq!(refs.len(), 1);
1796 assert_eq!(refs[0], "/tmp/photo.jpg");
1797 }
1798
1799 #[test]
1802 fn parse_image_markers_image_only_message_becomes_empty() {
1803 let input = "[IMAGE:/tmp/photo.jpg]";
1804 let (cleaned, refs) = parse_image_markers(input);
1805 assert!(
1806 cleaned.is_empty(),
1807 "expected empty string, got: {cleaned:?}"
1808 );
1809 assert_eq!(refs.len(), 1);
1810 }
1811}