Skip to main content

zeroclaw_providers/
multimodal.rs

1use base64::{Engine as _, engine::general_purpose::STANDARD};
2use reqwest::Client;
3use std::collections::HashSet;
4use std::path::Path;
5use zeroclaw_api::model_provider::ChatMessage;
6use zeroclaw_config::schema::{MultimodalConfig, build_runtime_proxy_client_with_timeouts};
7
/// Opening token of an inline image marker. `parse_image_markers` treats the
/// text between this prefix and the next `]` as the marker payload.
const IMAGE_MARKER_PREFIX: &str = "[IMAGE:";
/// MIME types accepted by `validate_mime`; any other type is reported as
/// `MultimodalError::UnsupportedMime`.
const ALLOWED_IMAGE_MIME_TYPES: &[&str] = &[
    "image/png",
    "image/jpeg",
    "image/webp",
    "image/gif",
    "image/bmp",
];
16
/// Output of [`prepare_messages_for_provider`]: the rewritten message list
/// together with a flag describing whether image markers remain in it.
#[derive(Debug, Clone)]
pub struct PreparedMessages {
    /// Messages after image-marker normalization and stale-marker stripping.
    pub messages: Vec<ChatMessage>,
    /// True when `count_image_markers` still finds at least one marker in
    /// `messages`.
    pub contains_images: bool,
}
22
/// Failures that can occur while resolving an image marker into a data URI.
///
/// `input` fields carry the marker payload (path, URL, or data URI) that
/// triggered the error.
#[derive(Debug, thiserror::Error)]
pub enum MultimodalError {
    /// More images were found than `max_images` allows.
    /// NOTE(review): not constructed in the visible part of this file.
    #[error("multimodal image limit exceeded: max_images={max_images}, found={found}")]
    TooManyImages { max_images: usize, found: usize },

    /// The image is larger than the byte cap (raised by `validate_size`).
    #[error(
        "multimodal image size limit exceeded for '{input}': {size_bytes} bytes > {max_bytes} bytes"
    )]
    ImageTooLarge {
        input: String,
        size_bytes: usize,
        max_bytes: usize,
    },

    /// The MIME type is missing from `ALLOWED_IMAGE_MIME_TYPES`, or could not
    /// be detected at all (then `mime` is `"unknown"`).
    #[error("multimodal image MIME type is not allowed for '{input}': {mime}")]
    UnsupportedMime { input: String, mime: String },

    /// An `http(s)://` reference was seen while `allow_remote_fetch` is off.
    #[error("multimodal remote image fetch is disabled for '{input}'")]
    RemoteFetchDisabled { input: String },

    /// A local path does not exist or is not a regular file.
    #[error("multimodal image source not found or unreadable: '{input}'")]
    ImageSourceNotFound { input: String },

    /// A `data:` URI is malformed (no payload, not base64, undecodable).
    #[error("invalid multimodal image marker '{input}': {reason}")]
    InvalidMarker { input: String, reason: String },

    /// The HTTP request failed, returned a non-success status, or the body
    /// could not be read.
    #[error("failed to download remote image '{input}': {reason}")]
    RemoteFetchFailed { input: String, reason: String },

    /// Reading metadata or contents of a local file failed.
    #[error("failed to read local image '{input}': {reason}")]
    LocalReadFailed { input: String, reason: String },
}
55
56/// Returns true for payloads that are plausibly loadable image references:
57/// absolute filesystem paths, `http(s)://` URLs, or base64 `data:` URIs.
58/// Placeholder-style payloads like `...`, `<path>`, or `example.png` fail
59/// this check and are left as literal text by [`parse_image_markers`], so
60/// illustrative markdown in a conversation does not trigger loader errors.
61fn is_loadable_image_reference(candidate: &str) -> bool {
62    candidate.starts_with('/')
63        || candidate.starts_with("http://")
64        || candidate.starts_with("https://")
65        || candidate.starts_with("data:")
66        || is_windows_path(candidate)
67}
68
/// Recognizes Windows drive-letter absolute paths such as `C:\…` or `D:/…`:
/// an ASCII letter, a colon, then either kind of slash.
fn is_windows_path(candidate: &str) -> bool {
    // All three leading characters must be ASCII, so inspecting raw bytes is
    // equivalent to inspecting the first three `char`s.
    matches!(
        candidate.as_bytes(),
        [drive, b':', b'\\' | b'/', ..] if drive.is_ascii_alphabetic()
    )
}
86
/// Repairs a marker payload that was hard-wrapped (for example when a long
/// path was copied out of a terminal and continued on an indented row).
///
/// Every `\n` / `\r` is removed together with the whitespace that directly
/// follows it, and the result is trimmed at both ends. Spaces that are not
/// preceded by a line break are preserved, so paths containing spaces
/// survive unchanged.
fn collapse_wrapped_marker(raw: &str) -> String {
    let mut segments = raw.split(|ch: char| ch == '\n' || ch == '\r');

    // The first segment precedes any line break, so its leading whitespace is
    // only subject to the final trim.
    let mut joined = String::with_capacity(raw.len());
    if let Some(first) = segments.next() {
        joined.push_str(first);
    }

    // Each later segment starts right after a line break: drop its indentation.
    for continuation in segments {
        joined.push_str(continuation.trim_start());
    }

    joined.trim().to_string()
}
114
115pub fn parse_image_markers(content: &str) -> (String, Vec<String>) {
116    let mut refs = Vec::new();
117    let mut cleaned = String::with_capacity(content.len());
118    let mut cursor = 0usize;
119
120    while let Some(rel_start) = content[cursor..].find(IMAGE_MARKER_PREFIX) {
121        let start = cursor + rel_start;
122        cleaned.push_str(&content[cursor..start]);
123
124        let marker_start = start + IMAGE_MARKER_PREFIX.len();
125        let Some(rel_end) = content[marker_start..].find(']') else {
126            cleaned.push_str(&content[start..]);
127            cursor = content.len();
128            break;
129        };
130
131        let end = marker_start + rel_end;
132        let candidate = collapse_wrapped_marker(&content[marker_start..end]);
133
134        if candidate.is_empty() || !is_loadable_image_reference(&candidate) {
135            // Preserve the original marker text (placeholders like
136            // `[IMAGE:...]` or `[IMAGE:<path>]` should survive as prose
137            // rather than triggering a loader error).
138            cleaned.push_str(&content[start..=end]);
139        } else {
140            refs.push(candidate);
141        }
142
143        cursor = end + 1;
144    }
145
146    if cursor < content.len() {
147        cleaned.push_str(&content[cursor..]);
148    }
149
150    (cleaned.trim().to_string(), refs)
151}
152
153pub fn count_image_markers(messages: &[ChatMessage]) -> usize {
154    let latest_tool_indices = latest_tool_result_indices(messages);
155    count_image_markers_with_latest_tool_results(messages, &latest_tool_indices)
156}
157
158fn count_image_markers_with_latest_tool_results(
159    messages: &[ChatMessage],
160    latest_tool_result_indices: &HashSet<usize>,
161) -> usize {
162    messages
163        .iter()
164        .enumerate()
165        .filter(|(index, message)| {
166            should_normalize_message_images(*index, message, latest_tool_result_indices)
167        })
168        .map(|(_, message)| parse_image_markers(&message.content).1.len())
169        .sum()
170}
171
172pub fn contains_image_markers(messages: &[ChatMessage]) -> bool {
173    count_image_markers(messages) > 0
174}
175
176/// Replace media markers (`[IMAGE:...]`, `[PHOTO:...]`, `[DOCUMENT:...]`,
177/// `[FILE:...]`, `[VIDEO:...]`, `[VOICE:...]`, `[AUDIO:...]`) with
178/// `[media attachment]`. Match is case-insensitive to align with the channel
179/// attachment parsers, which all uppercase the kind before comparing
180/// (`crates/zeroclaw-channels/src/util.rs::ATTACHMENT_KINDS`,
181/// `telegram.rs`, `discord.rs`, `qq.rs`, `whatsapp_web.rs`).
182///
183/// Use before passing user-facing text to auxiliary `chat_with_system` calls
184/// (intent classification, summarization, delegation) so that local file
185/// paths from inbound channels do not leak to the upstream provider — the
186/// upstream API would otherwise receive a filesystem path as `image_url.url`
187/// and reject the request.
188///
189/// Auxiliary calls do not need to *see* the media content; they only route
190/// or summarize, so the placeholder is sufficient. The main agent loop
191/// continues to call `prepare_messages_for_provider` for full normalization.
192pub fn strip_media_markers(text: &str) -> String {
193    static RE: std::sync::LazyLock<regex::Regex> = std::sync::LazyLock::new(|| {
194        regex::Regex::new(r"(?i)\[(?:IMAGE|PHOTO|DOCUMENT|FILE|VIDEO|VOICE|AUDIO):[^\]]*\]")
195            .unwrap()
196    });
197    RE.replace_all(text, "[media attachment]").into_owned()
198}
199
/// Extracts the image payload from a reference.
///
/// For a `data:` URI this is everything after the first comma (the encoded
/// body); for any other reference it is the reference itself. The payload is
/// trimmed, and `None` is returned when it is empty or when a `data:` URI
/// has no comma.
pub fn extract_ollama_image_payload(image_ref: &str) -> Option<String> {
    let payload = if image_ref.starts_with("data:") {
        let (_header, encoded) = image_ref.split_once(',')?;
        encoded
    } else {
        image_ref
    };

    let payload = payload.trim();
    (!payload.is_empty()).then(|| payload.to_string())
}
214
215fn is_prompt_tool_result_message(message: &ChatMessage) -> bool {
216    message.role == "user" && message.content.trim_start().starts_with("[Tool results]")
217}
218
219fn is_tool_result_carrier(message: &ChatMessage) -> bool {
220    message.role == "tool" || is_prompt_tool_result_message(message)
221}
222
223fn latest_tool_result_indices(messages: &[ChatMessage]) -> HashSet<usize> {
224    let mut indices = HashSet::new();
225    let Some((last_index, last_message)) = messages.iter().enumerate().next_back() else {
226        return indices;
227    };
228
229    if is_prompt_tool_result_message(last_message) {
230        indices.insert(last_index);
231        return indices;
232    }
233
234    if last_message.role == "tool" {
235        for (index, message) in messages.iter().enumerate().rev() {
236            if message.role != "tool" {
237                break;
238            }
239            indices.insert(index);
240        }
241    }
242
243    indices
244}
245
246fn should_normalize_message_images(
247    index: usize,
248    message: &ChatMessage,
249    latest_tool_result_indices: &HashSet<usize>,
250) -> bool {
251    if is_tool_result_carrier(message) {
252        return latest_tool_result_indices.contains(&index);
253    }
254
255    message.role == "user"
256}
257
258fn stripped_image_marker_text(content: &str) -> String {
259    let (cleaned, refs) = parse_image_markers(content);
260    if refs.is_empty() {
261        return content.to_string();
262    }
263
264    if cleaned.trim().is_empty() {
265        "[image removed from history]".to_string()
266    } else {
267        cleaned
268    }
269}
270
271fn strip_tool_result_image_markers(message: &ChatMessage) -> ChatMessage {
272    if !message.content.contains(IMAGE_MARKER_PREFIX) {
273        return message.clone();
274    }
275
276    if message.role == "tool"
277        && let Ok(serde_json::Value::Object(mut obj)) =
278            serde_json::from_str::<serde_json::Value>(&message.content)
279        && let Some(serde_json::Value::String(inner)) = obj.get("content").cloned()
280    {
281        let stripped = stripped_image_marker_text(&inner);
282        if stripped == inner {
283            return message.clone();
284        }
285
286        obj.insert("content".to_string(), serde_json::Value::String(stripped));
287        return ChatMessage {
288            role: message.role.clone(),
289            content: serde_json::Value::Object(obj).to_string(),
290        };
291    }
292
293    ChatMessage {
294        role: message.role.clone(),
295        content: stripped_image_marker_text(&message.content),
296    }
297}
298
299fn replay_message_without_stale_tool_images(
300    index: usize,
301    message: &ChatMessage,
302    latest_tool_result_indices: &HashSet<usize>,
303) -> ChatMessage {
304    if is_tool_result_carrier(message) && !latest_tool_result_indices.contains(&index) {
305        strip_tool_result_image_markers(message)
306    } else {
307        message.clone()
308    }
309}
310
311/// Attempt to normalize image markers inside a native tool-result JSON
312/// payload produced by `NativeToolDispatcher::to_provider_messages`. On
313/// success, returns the reserialized JSON string with the inner `content`
314/// field rewritten to inline `[IMAGE:data:…]` markers (data URIs). Returns
315/// `Ok(None)` when the payload is not a JSON object with a string `content`
316/// field, when the inner content has no normalizable markers, or when no
317/// rewriting is needed — letting the caller fall through to the existing
318/// plain-text path. The returned JSON preserves `tool_call_id` and any
319/// other top-level fields so downstream native adapters
320/// (e.g. `OpenAiCompatibleProvider::convert_messages_for_native`) can keep
321/// recovering the tool-call linkage via `serde_json::from_str`.
322async fn normalize_native_tool_result_json(
323    content: &str,
324    config: &MultimodalConfig,
325    max_bytes: usize,
326    remote_client: &Client,
327) -> Option<(String, bool)> {
328    let Ok(serde_json::Value::Object(mut obj)) = serde_json::from_str::<serde_json::Value>(content)
329    else {
330        return None;
331    };
332
333    let Some(serde_json::Value::String(inner)) = obj.get("content").cloned() else {
334        return None;
335    };
336
337    let (cleaned_text, refs) = parse_image_markers(&inner);
338    if refs.is_empty() {
339        return None;
340    }
341
342    let normalized = normalize_image_references(&refs, config, max_bytes, remote_client).await;
343    let new_inner = compose_multimodal_content(
344        &cleaned_text,
345        &normalized.data_uris,
346        normalized.skipped_count,
347        refs.len(),
348    );
349    obj.insert("content".to_string(), serde_json::Value::String(new_inner));
350
351    Some((
352        serde_json::Value::Object(obj).to_string(),
353        !normalized.data_uris.is_empty(),
354    ))
355}
356
357pub async fn prepare_messages_for_provider(
358    messages: &[ChatMessage],
359    config: &MultimodalConfig,
360) -> anyhow::Result<PreparedMessages> {
361    let (max_images, max_image_size_mb) = config.effective_limits();
362    let max_bytes = max_image_size_mb.saturating_mul(1024 * 1024);
363
364    let latest_tool_indices = latest_tool_result_indices(messages);
365    let total_images = count_image_markers_with_latest_tool_results(messages, &latest_tool_indices);
366
367    if total_images == 0 {
368        return Ok(PreparedMessages {
369            messages: messages
370                .iter()
371                .enumerate()
372                .map(|(index, message)| {
373                    replay_message_without_stale_tool_images(index, message, &latest_tool_indices)
374                })
375                .collect(),
376            contains_images: false,
377        });
378    }
379
380    // When image count exceeds the limit, strip markers from oldest messages
381    // first so that the most recent (most relevant) images survive. This
382    // prevents conversations from becoming permanently stuck once the
383    // cumulative image count crosses the threshold.
384    let trimmed = if total_images > max_images {
385        trim_old_images(messages, max_images)
386    } else {
387        messages.to_vec()
388    };
389
390    let remote_client = build_runtime_proxy_client_with_timeouts("model_provider.ollama", 30, 10);
391    let latest_tool_indices = latest_tool_result_indices(&trimmed);
392
393    let mut normalized_messages = Vec::with_capacity(messages.len());
394    let mut has_successful_images = false;
395    for (index, message) in messages.iter().enumerate() {
396        if !should_normalize_message_images(index, message, &latest_tool_indices) {
397            normalized_messages.push(replay_message_without_stale_tool_images(
398                index,
399                message,
400                &latest_tool_indices,
401            ));
402            continue;
403        }
404
405        // Native tool dispatchers wrap tool results as a JSON object
406        // (`{"tool_call_id":"…","content":"…"}`) so that provider adapters
407        // can recover `tool_call_id` via `serde_json::from_str` on
408        // `message.content`. Treating that JSON blob as plain text would
409        // strip markers out of the `content` field and append the data URI
410        // outside the JSON object, breaking the native tool-result contract
411        // and dropping `tool_call_id`. When we recognise that shape,
412        // normalize only the inner `content` string and reserialize the
413        // JSON so adapters keep seeing the structure they expect. Falls
414        // through to the plain-text path for non-JSON tool messages.
415        if message.role == "tool"
416            && let Some((prepared, contains_images)) = normalize_native_tool_result_json(
417                &message.content,
418                config,
419                max_bytes,
420                &remote_client,
421            )
422            .await
423        {
424            normalized_messages.push(ChatMessage {
425                role: message.role.clone(),
426                content: prepared,
427            });
428            has_successful_images |= contains_images;
429            continue;
430        }
431
432        let (cleaned_text, refs) = parse_image_markers(&message.content);
433        if refs.is_empty() {
434            normalized_messages.push(message.clone());
435            continue;
436        }
437
438        let normalized = normalize_image_references(&refs, config, max_bytes, &remote_client).await;
439        let content = compose_multimodal_content(
440            &cleaned_text,
441            &normalized.data_uris,
442            normalized.skipped_count,
443            refs.len(),
444        );
445        has_successful_images |= !normalized.data_uris.is_empty();
446        normalized_messages.push(ChatMessage {
447            role: message.role.clone(),
448            content,
449        });
450    }
451
452    // Apply the per-request image cap after normalization so failed image refs
453    // do not consume budget and evict older images that could still be sent.
454    let capped_messages =
455        if has_successful_images && count_image_markers(&normalized_messages) > max_images {
456            trim_old_images(&normalized_messages, max_images)
457        } else {
458            normalized_messages
459        };
460
461    Ok(PreparedMessages {
462        contains_images: count_image_markers(&capped_messages) > 0,
463        messages: capped_messages,
464    })
465}
466
467/// Strip image markers from older messages (oldest first) until total image
468/// count is within `max_images`. Keeps the text content of each message.
469fn trim_old_images(messages: &[ChatMessage], max_images: usize) -> Vec<ChatMessage> {
470    let latest_tool_indices = latest_tool_result_indices(messages);
471    // Find which messages (by index) contain images, oldest first.
472    let image_positions: Vec<(usize, usize)> = messages
473        .iter()
474        .enumerate()
475        .filter(|(index, message)| {
476            should_normalize_message_images(*index, message, &latest_tool_indices)
477        })
478        .filter_map(|(i, m)| {
479            let count = parse_image_markers(&m.content).1.len();
480            if count > 0 { Some((i, count)) } else { None }
481        })
482        .collect();
483
484    // Determine how many images to drop (from the oldest messages).
485    let total: usize = image_positions.iter().map(|(_, c)| c).sum();
486    let mut to_drop = total.saturating_sub(max_images);
487
488    // Collect indices of messages whose images should be stripped.
489    let mut strip_indices = std::collections::HashSet::new();
490    for &(idx, count) in &image_positions {
491        if to_drop == 0 {
492            break;
493        }
494        strip_indices.insert(idx);
495        to_drop = to_drop.saturating_sub(count);
496    }
497
498    messages
499        .iter()
500        .enumerate()
501        .map(|(i, m)| {
502            if strip_indices.contains(&i) {
503                let (cleaned, _) = parse_image_markers(&m.content);
504                let text = if cleaned.trim().is_empty() {
505                    "[image removed from history]".to_string()
506                } else {
507                    cleaned
508                };
509                ChatMessage {
510                    role: m.role.clone(),
511                    content: text,
512                }
513            } else {
514                replay_message_without_stale_tool_images(i, m, &latest_tool_indices)
515            }
516        })
517        .collect()
518}
519
520fn compose_multimodal_message(text: &str, data_uris: &[String]) -> String {
521    let mut content = String::new();
522    let trimmed = text.trim();
523
524    if !trimmed.is_empty() {
525        content.push_str(trimmed);
526        content.push_str("\n\n");
527    }
528
529    for (index, data_uri) in data_uris.iter().enumerate() {
530        if index > 0 {
531            content.push('\n');
532        }
533        content.push_str(IMAGE_MARKER_PREFIX);
534        content.push_str(data_uri);
535        content.push(']');
536    }
537
538    content
539}
540
/// Outcome of resolving a batch of image references
/// (see `normalize_image_references`).
struct NormalizedImageReferences {
    /// Data URIs of the references that loaded successfully, in input order.
    data_uris: Vec<String>,
    /// Number of references that failed to load and were skipped.
    skipped_count: usize,
}
545
546async fn normalize_image_references(
547    refs: &[String],
548    config: &MultimodalConfig,
549    max_bytes: usize,
550    remote_client: &Client,
551) -> NormalizedImageReferences {
552    let mut data_uris = Vec::with_capacity(refs.len());
553    let mut skipped_count = 0usize;
554
555    for reference in refs {
556        match normalize_image_reference(reference, config, max_bytes, remote_client).await {
557            Ok(data_uri) => data_uris.push(data_uri),
558            Err(error) => {
559                skipped_count += 1;
560                let error_reason = multimodal_error_reason(&error);
561                ::zeroclaw_log::record!(
562                    WARN,
563                    ::zeroclaw_log::Event::new(module_path!(), ::zeroclaw_log::Action::Note)
564                        .with_outcome(::zeroclaw_log::EventOutcome::Unknown)
565                        .with_attrs(::serde_json::json!({
566                            "source_kind": image_reference_kind(reference),
567                            "error_kind": multimodal_error_kind(&error),
568                            "reason": error_reason.as_deref().unwrap_or(""),
569                        })),
570                    "skipping multimodal image that could not be loaded"
571                );
572            }
573        }
574    }
575
576    NormalizedImageReferences {
577        data_uris,
578        skipped_count,
579    }
580}
581
582fn compose_multimodal_content(
583    text: &str,
584    data_uris: &[String],
585    skipped_count: usize,
586    total_refs: usize,
587) -> String {
588    if skipped_count == 0 {
589        return compose_multimodal_message(text, data_uris);
590    }
591
592    let text_with_note = append_skipped_image_note(text, skipped_count, total_refs);
593    if data_uris.is_empty() {
594        text_with_note.trim().to_string()
595    } else {
596        compose_multimodal_message(&text_with_note, data_uris)
597    }
598}
599
/// Appends a "could not be loaded" note to `text`.
///
/// With `skipped_count == 0` the text is returned untouched (not even
/// trimmed). Otherwise the trimmed text is followed by a blank line and the
/// note; the "N of M" wording is used only when some references did load.
fn append_skipped_image_note(text: &str, skipped_count: usize, total_refs: usize) -> String {
    if skipped_count == 0 {
        return text.to_string();
    }

    // This note is model-facing provider context, not direct localized UI text.
    let counted = if skipped_count != total_refs {
        format!("{skipped_count} of {total_refs}")
    } else {
        skipped_count.to_string()
    };
    let note = format!("Note: {counted} attached image(s) could not be loaded.");

    match text.trim() {
        "" => note,
        body => format!("{body}\n\n{note}"),
    }
}
619
/// Classifies a reference for logging: `"data"` for `data:` URIs,
/// `"remote"` for `http(s)://` URLs, `"local"` for everything else.
fn image_reference_kind(reference: &str) -> &'static str {
    if reference.starts_with("data:") {
        return "data";
    }

    let is_remote = ["http://", "https://"]
        .iter()
        .any(|scheme| reference.starts_with(*scheme));
    if is_remote { "remote" } else { "local" }
}
629
630fn multimodal_error_kind(error: &anyhow::Error) -> &'static str {
631    match error.downcast_ref::<MultimodalError>() {
632        Some(MultimodalError::TooManyImages { .. }) => "too_many_images",
633        Some(MultimodalError::ImageTooLarge { .. }) => "image_too_large",
634        Some(MultimodalError::UnsupportedMime { .. }) => "unsupported_mime",
635        Some(MultimodalError::RemoteFetchDisabled { .. }) => "remote_fetch_disabled",
636        Some(MultimodalError::ImageSourceNotFound { .. }) => "image_source_not_found",
637        Some(MultimodalError::InvalidMarker { .. }) => "invalid_marker",
638        Some(MultimodalError::RemoteFetchFailed { .. }) => "remote_fetch_failed",
639        Some(MultimodalError::LocalReadFailed { .. }) => "local_read_failed",
640        None => "unknown",
641    }
642}
643
644fn multimodal_error_reason(error: &anyhow::Error) -> Option<String> {
645    match error.downcast_ref::<MultimodalError>() {
646        Some(MultimodalError::InvalidMarker { input, reason })
647        | Some(MultimodalError::RemoteFetchFailed { input, reason })
648        | Some(MultimodalError::LocalReadFailed { input, reason }) => {
649            Some(reason.replace(input, "<source>"))
650        }
651        _ => None,
652    }
653}
654
655async fn normalize_image_reference(
656    source: &str,
657    config: &MultimodalConfig,
658    max_bytes: usize,
659    remote_client: &Client,
660) -> anyhow::Result<String> {
661    if source.starts_with("data:") {
662        return normalize_data_uri(source, max_bytes);
663    }
664
665    if source.starts_with("http://") || source.starts_with("https://") {
666        if !config.allow_remote_fetch {
667            return Err(MultimodalError::RemoteFetchDisabled {
668                input: source.to_string(),
669            }
670            .into());
671        }
672
673        return normalize_remote_image(source, max_bytes, remote_client).await;
674    }
675
676    normalize_local_image(source, max_bytes).await
677}
678
679fn normalize_data_uri(source: &str, max_bytes: usize) -> anyhow::Result<String> {
680    let Some(comma_idx) = source.find(',') else {
681        return Err(MultimodalError::InvalidMarker {
682            input: source.to_string(),
683            reason: "expected data URI payload".to_string(),
684        }
685        .into());
686    };
687
688    let header = &source[..comma_idx];
689    let payload = source[comma_idx + 1..].trim();
690
691    if !header.contains(";base64") {
692        return Err(MultimodalError::InvalidMarker {
693            input: source.to_string(),
694            reason: "only base64 data URIs are supported".to_string(),
695        }
696        .into());
697    }
698
699    let mime = header
700        .trim_start_matches("data:")
701        .split(';')
702        .next()
703        .unwrap_or_default()
704        .trim()
705        .to_ascii_lowercase();
706
707    validate_mime(source, &mime)?;
708
709    let decoded = STANDARD
710        .decode(payload)
711        .map_err(|error| MultimodalError::InvalidMarker {
712            input: source.to_string(),
713            reason: format!("invalid base64 payload: {error}"),
714        })?;
715
716    validate_size(source, decoded.len(), max_bytes)?;
717
718    Ok(format!("data:{mime};base64,{}", STANDARD.encode(decoded)))
719}
720
721async fn normalize_remote_image(
722    source: &str,
723    max_bytes: usize,
724    remote_client: &Client,
725) -> anyhow::Result<String> {
726    let response = remote_client.get(source).send().await.map_err(|error| {
727        MultimodalError::RemoteFetchFailed {
728            input: source.to_string(),
729            reason: error.to_string(),
730        }
731    })?;
732
733    let status = response.status();
734    if !status.is_success() {
735        return Err(MultimodalError::RemoteFetchFailed {
736            input: source.to_string(),
737            reason: format!("HTTP {status}"),
738        }
739        .into());
740    }
741
742    if let Some(content_length) = response.content_length() {
743        let content_length = usize::try_from(content_length).unwrap_or(usize::MAX);
744        validate_size(source, content_length, max_bytes)?;
745    }
746
747    let content_type = response
748        .headers()
749        .get(reqwest::header::CONTENT_TYPE)
750        .and_then(|value| value.to_str().ok())
751        .map(ToString::to_string);
752
753    let bytes = response
754        .bytes()
755        .await
756        .map_err(|error| MultimodalError::RemoteFetchFailed {
757            input: source.to_string(),
758            reason: error.to_string(),
759        })?;
760
761    validate_size(source, bytes.len(), max_bytes)?;
762
763    let mime = detect_mime(None, bytes.as_ref(), content_type.as_deref()).ok_or_else(|| {
764        MultimodalError::UnsupportedMime {
765            input: source.to_string(),
766            mime: "unknown".to_string(),
767        }
768    })?;
769
770    validate_mime(source, &mime)?;
771
772    Ok(format!("data:{mime};base64,{}", STANDARD.encode(bytes)))
773}
774
775async fn normalize_local_image(source: &str, max_bytes: usize) -> anyhow::Result<String> {
776    let path = Path::new(source);
777    if !path.exists() || !path.is_file() {
778        return Err(MultimodalError::ImageSourceNotFound {
779            input: source.to_string(),
780        }
781        .into());
782    }
783
784    let metadata =
785        tokio::fs::metadata(path)
786            .await
787            .map_err(|error| MultimodalError::LocalReadFailed {
788                input: source.to_string(),
789                reason: error.to_string(),
790            })?;
791
792    validate_size(
793        source,
794        usize::try_from(metadata.len()).unwrap_or(usize::MAX),
795        max_bytes,
796    )?;
797
798    let bytes = tokio::fs::read(path)
799        .await
800        .map_err(|error| MultimodalError::LocalReadFailed {
801            input: source.to_string(),
802            reason: error.to_string(),
803        })?;
804
805    validate_size(source, bytes.len(), max_bytes)?;
806
807    let mime =
808        detect_mime(Some(path), &bytes, None).ok_or_else(|| MultimodalError::UnsupportedMime {
809            input: source.to_string(),
810            mime: "unknown".to_string(),
811        })?;
812
813    validate_mime(source, &mime)?;
814
815    Ok(format!("data:{mime};base64,{}", STANDARD.encode(bytes)))
816}
817
818fn validate_size(source: &str, size_bytes: usize, max_bytes: usize) -> anyhow::Result<()> {
819    if size_bytes > max_bytes {
820        return Err(MultimodalError::ImageTooLarge {
821            input: source.to_string(),
822            size_bytes,
823            max_bytes,
824        }
825        .into());
826    }
827
828    Ok(())
829}
830
831fn validate_mime(source: &str, mime: &str) -> anyhow::Result<()> {
832    if ALLOWED_IMAGE_MIME_TYPES.contains(&mime) {
833        return Ok(());
834    }
835
836    Err(MultimodalError::UnsupportedMime {
837        input: source.to_string(),
838        mime: mime.to_string(),
839    }
840    .into())
841}
842
843fn detect_mime(
844    path: Option<&Path>,
845    bytes: &[u8],
846    header_content_type: Option<&str>,
847) -> Option<String> {
848    if let Some(header_mime) = header_content_type.and_then(normalize_content_type) {
849        return Some(header_mime);
850    }
851
852    if let Some(path) = path
853        && let Some(ext) = path.extension().and_then(|value| value.to_str())
854        && let Some(mime) = mime_from_extension(ext)
855    {
856        return Some(mime.to_string());
857    }
858
859    mime_from_magic(bytes).map(ToString::to_string)
860}
861
/// Extracts the bare media type from a `Content-Type` value: everything
/// before the first `;`, trimmed and lowercased. Returns `None` when that
/// part is empty.
fn normalize_content_type(content_type: &str) -> Option<String> {
    let media_type = match content_type.split_once(';') {
        Some((head, _parameters)) => head,
        None => content_type,
    };

    let mime = media_type.trim().to_ascii_lowercase();
    (!mime.is_empty()).then_some(mime)
}
866
/// Maps a file extension (without the dot, ASCII case-insensitive) to its
/// image MIME type, or `None` for unrecognized extensions.
fn mime_from_extension(ext: &str) -> Option<&'static str> {
    const EXTENSION_TABLE: [(&str, &str); 6] = [
        ("png", "image/png"),
        ("jpg", "image/jpeg"),
        ("jpeg", "image/jpeg"),
        ("webp", "image/webp"),
        ("gif", "image/gif"),
        ("bmp", "image/bmp"),
    ];

    EXTENSION_TABLE
        .iter()
        .find(|(known, _)| ext.eq_ignore_ascii_case(known))
        .map(|&(_, mime)| mime)
}
877
/// Sniffs the image type from the leading bytes of a file.
///
/// Recognizes the PNG signature, the JPEG `FF D8 FF` start, `GIF87a` /
/// `GIF89a`, a RIFF container tagged `WEBP`, and the `BM` bitmap prefix.
/// Returns `None` for anything else, including input that is too short.
fn mime_from_magic(bytes: &[u8]) -> Option<&'static str> {
    let mime = match bytes {
        [0x89, b'P', b'N', b'G', b'\r', b'\n', 0x1a, b'\n', ..] => "image/png",
        [0xff, 0xd8, 0xff, ..] => "image/jpeg",
        [b'G', b'I', b'F', b'8', b'7' | b'9', b'a', ..] => "image/gif",
        // RIFF header, four bytes of chunk size, then the WEBP form type.
        [b'R', b'I', b'F', b'F', _, _, _, _, b'W', b'E', b'B', b'P', ..] => "image/webp",
        [b'B', b'M', ..] => "image/bmp",
        _ => return None,
    };

    Some(mime)
}
901
902#[cfg(test)]
903mod tests {
904    use super::*;
905
906    #[test]
907    fn strip_media_markers_replaces_image_local_path() {
908        let input = "Look at [IMAGE:/zeroclaw-data/workspace/telegram_files/photo_1.jpg]";
909        assert_eq!(strip_media_markers(input), "Look at [media attachment]");
910    }
911
912    #[test]
913    fn strip_media_markers_replaces_image_data_uri() {
914        let input = "Inline [IMAGE:data:image/png;base64,abcd]";
915        assert_eq!(strip_media_markers(input), "Inline [media attachment]");
916    }
917
918    #[test]
919    fn strip_media_markers_replaces_all_supported_kinds() {
920        // Mirrors `ATTACHMENT_KINDS` in
921        // `crates/zeroclaw-channels/src/util.rs`, which is the source of
922        // truth for which marker spellings inbound channels can produce.
923        let input = "[IMAGE:/a.jpg] [PHOTO:/b.jpg] [DOCUMENT:/c.pdf] [FILE:/d.zip] [VIDEO:/e.mp4] [VOICE:/f.ogg] [AUDIO:/g.wav]";
924        let expected = "[media attachment] [media attachment] [media attachment] [media attachment] [media attachment] [media attachment] [media attachment]";
925        assert_eq!(strip_media_markers(input), expected);
926    }
927
928    #[test]
929    fn strip_media_markers_is_case_insensitive() {
930        // Channel parsers uppercase the kind before comparing, so by the time
931        // a marker reaches conversation history it is normally upper-case —
932        // but accept lower/mixed case too so we don't depend on that
933        // invariant downstream.
934        let input = "[image:/a.jpg] [Photo:/b.jpg] [video:/c.mp4]";
935        let expected = "[media attachment] [media attachment] [media attachment]";
936        assert_eq!(strip_media_markers(input), expected);
937    }
938
939    #[test]
940    fn strip_media_markers_leaves_plain_text_untouched() {
941        let input = "No markers here, just text with [brackets] and (parens).";
942        assert_eq!(strip_media_markers(input), input);
943    }
944
945    #[test]
946    fn strip_media_markers_preserves_unrelated_brackets() {
947        // Markers that don't match the media kinds are left alone.
948        let input = "Use [TODO:foo] and [NOTE:bar] but replace [IMAGE:/x.jpg]";
949        assert_eq!(
950            strip_media_markers(input),
951            "Use [TODO:foo] and [NOTE:bar] but replace [media attachment]"
952        );
953    }
954
955    #[test]
956    fn parse_image_markers_extracts_multiple_markers() {
957        let input = "Check this [IMAGE:/tmp/a.png] and this [IMAGE:https://example.com/b.jpg]";
958        let (cleaned, refs) = parse_image_markers(input);
959
960        assert_eq!(cleaned, "Check this  and this");
961        assert_eq!(refs.len(), 2);
962        assert_eq!(refs[0], "/tmp/a.png");
963        assert_eq!(refs[1], "https://example.com/b.jpg");
964    }
965
966    #[test]
967    fn parse_image_markers_collapses_line_wrapped_path() {
968        // Terminal-wrapped paste: a long path split across two rows with
969        // leading indentation should be recovered into the original path.
970        let input = "from the logs whether the agent emits\n  [IMAGE:/home/zeroclaw_user/.zeroclaw/workspace/signal_i\n  nbound/attachment.jpg] (which the\n  channel resolves)";
971        let (_, refs) = parse_image_markers(input);
972        assert_eq!(refs.len(), 1);
973        assert_eq!(
974            refs[0],
975            "/home/zeroclaw_user/.zeroclaw/workspace/signal_inbound/attachment.jpg"
976        );
977    }
978
979    #[test]
980    fn parse_image_markers_leaves_placeholder_markers_as_literal_text() {
981        // Illustrative markdown like `[IMAGE:...]` or `[IMAGE:<path>]`
982        // (e.g. in agent-authored prose the user quotes back) is not a
983        // loadable reference and must stay as literal text — otherwise the
984        // multimodal loader errors every turn the conversation replays.
985        let input = "example: `[IMAGE:...]` or `[IMAGE:<path>]` or `[IMAGE:example.png]`";
986        let (cleaned, refs) = parse_image_markers(input);
987        assert!(
988            refs.is_empty(),
989            "no placeholder should be treated as a loadable ref, got: {refs:?}"
990        );
991        assert!(cleaned.contains("[IMAGE:...]"));
992        assert!(cleaned.contains("[IMAGE:<path>]"));
993        assert!(cleaned.contains("[IMAGE:example.png]"));
994    }
995
996    #[test]
997    fn parse_image_markers_preserves_spaces_in_path() {
998        // Spaces within a single-line marker are legitimate (paths can
999        // contain spaces) and must survive unchanged.
1000        let input = "look at [IMAGE:/tmp/my photos/beetle.png] please";
1001        let (_, refs) = parse_image_markers(input);
1002        assert_eq!(refs.len(), 1);
1003        assert_eq!(refs[0], "/tmp/my photos/beetle.png");
1004    }
1005
1006    #[test]
1007    fn parse_image_markers_keeps_invalid_empty_marker() {
1008        let input = "hello [IMAGE:] world";
1009        let (cleaned, refs) = parse_image_markers(input);
1010
1011        assert_eq!(cleaned, "hello [IMAGE:] world");
1012        assert!(refs.is_empty());
1013    }
1014
1015    #[tokio::test]
1016    async fn prepare_messages_normalizes_local_image_to_data_uri() {
1017        let temp = tempfile::tempdir().unwrap();
1018        let image_path = temp.path().join("sample.png");
1019
1020        // Minimal PNG signature bytes are enough for MIME detection.
1021        std::fs::write(
1022            &image_path,
1023            [0x89, b'P', b'N', b'G', b'\r', b'\n', 0x1a, b'\n'],
1024        )
1025        .unwrap();
1026
1027        let messages = vec![ChatMessage::user(format!(
1028            "Please inspect this screenshot [IMAGE:{}]",
1029            image_path.display()
1030        ))];
1031
1032        let prepared = prepare_messages_for_provider(&messages, &MultimodalConfig::default())
1033            .await
1034            .unwrap();
1035
1036        assert!(prepared.contains_images);
1037        assert_eq!(prepared.messages.len(), 1);
1038
1039        let (cleaned, refs) = parse_image_markers(&prepared.messages[0].content);
1040        assert_eq!(cleaned, "Please inspect this screenshot");
1041        assert_eq!(refs.len(), 1);
1042        assert!(refs[0].starts_with("data:image/png;base64,"));
1043    }
1044
1045    #[tokio::test]
1046    // Covers the plain-text fallback path for `role == "tool"` messages
1047    // whose `content` is not a native-dispatcher JSON payload (e.g.
1048    // synthetic XML-shaped input or future non-JSON tool transports). The
1049    // JSON-shaped native contract is exercised by
1050    // `prepare_messages_preserves_native_tool_result_json_shape` below.
1051    async fn prepare_messages_normalizes_tool_message_local_image_to_data_uri() {
1052        let temp = tempfile::tempdir().unwrap();
1053        let image_path = temp.path().join("tool-sample.png");
1054
1055        std::fs::write(
1056            &image_path,
1057            [0x89, b'P', b'N', b'G', b'\r', b'\n', 0x1a, b'\n'],
1058        )
1059        .unwrap();
1060
1061        let messages = vec![ChatMessage::tool(format!(
1062            "<tool_result name=\"image_gen\">\nGenerated image [IMAGE:{}]\n</tool_result>",
1063            image_path.display()
1064        ))];
1065
1066        let prepared = prepare_messages_for_provider(&messages, &MultimodalConfig::default())
1067            .await
1068            .unwrap();
1069
1070        assert!(prepared.contains_images);
1071        assert_eq!(prepared.messages.len(), 1);
1072        assert_eq!(prepared.messages[0].role, "tool");
1073
1074        let (cleaned, refs) = parse_image_markers(&prepared.messages[0].content);
1075        assert!(cleaned.contains("<tool_result name=\"image_gen\">"));
1076        assert!(cleaned.contains("Generated image"));
1077        assert_eq!(refs.len(), 1);
1078        assert!(refs[0].starts_with("data:image/png;base64,"));
1079    }
1080
1081    // Regression for the JSON-clobber bug surfaced on PR #6183: native tool
1082    // dispatchers serialize tool results as `{"tool_call_id":"…","content":"…"}`
1083    // and downstream adapters (e.g. `OpenAiCompatibleProvider::convert_messages_for_native`)
1084    // recover `tool_call_id` via `serde_json::from_str` on the message
1085    // content. The multimodal preprocessor must keep that JSON intact while
1086    // still inlining any `[IMAGE:/path]` markers inside the inner `content`
1087    // field. Asserts:
1088    //   1. Prepared content is still valid JSON.
1089    //   2. `tool_call_id` survives unchanged.
1090    //   3. The inner `content` field carries `data:image/png;base64,…`
1091    //      (marker rewritten) and keeps surrounding text.
1092    #[tokio::test]
1093    async fn prepare_messages_preserves_native_tool_result_json_shape() {
1094        let temp = tempfile::tempdir().unwrap();
1095        let image_path = temp.path().join("native-tool-result.png");
1096        std::fs::write(
1097            &image_path,
1098            [0x89, b'P', b'N', b'G', b'\r', b'\n', 0x1a, b'\n'],
1099        )
1100        .unwrap();
1101
1102        let native_tool_content = serde_json::json!({
1103            "tool_call_id": "tc1",
1104            "content": format!("see attached [IMAGE:{}]", image_path.display().to_string()),
1105        })
1106        .to_string();
1107
1108        let messages = vec![ChatMessage::tool(native_tool_content)];
1109
1110        let prepared = prepare_messages_for_provider(&messages, &MultimodalConfig::default())
1111            .await
1112            .expect("preparation should succeed for native tool-result JSON");
1113
1114        assert!(prepared.contains_images);
1115        assert_eq!(prepared.messages.len(), 1);
1116        assert_eq!(prepared.messages[0].role, "tool");
1117
1118        let value: serde_json::Value = serde_json::from_str(&prepared.messages[0].content)
1119            .expect("prepared tool message must remain valid JSON");
1120
1121        assert_eq!(
1122            value.get("tool_call_id").and_then(|v| v.as_str()),
1123            Some("tc1"),
1124            "tool_call_id must survive multimodal preprocessing unchanged"
1125        );
1126
1127        let inner = value
1128            .get("content")
1129            .and_then(|v| v.as_str())
1130            .expect("content must remain a JSON string");
1131        assert!(
1132            inner.contains("see attached"),
1133            "surrounding text in tool content should survive normalization"
1134        );
1135        assert!(
1136            inner.contains("data:image/png;base64,"),
1137            "local image path inside tool content should be rewritten to a data URI"
1138        );
1139        assert!(
1140            !inner.contains("native-tool-result.png"),
1141            "raw local path must not leak after normalization"
1142        );
1143    }
1144
1145    #[tokio::test]
1146    async fn prepare_messages_preserves_native_tool_json_when_image_is_skipped() {
1147        let native_tool_content = serde_json::json!({
1148            "tool_call_id": "tc1",
1149            "content": "generated screenshot [IMAGE:https://example.com/missing.png]",
1150        })
1151        .to_string();
1152
1153        let prepared = prepare_messages_for_provider(
1154            &[ChatMessage::tool(native_tool_content)],
1155            &MultimodalConfig::default(),
1156        )
1157        .await
1158        .expect("skipped native tool image should not fail message preparation");
1159
1160        assert!(!prepared.contains_images);
1161        assert_eq!(prepared.messages.len(), 1);
1162
1163        let value: serde_json::Value = serde_json::from_str(&prepared.messages[0].content)
1164            .expect("native tool result must remain valid JSON");
1165        assert_eq!(
1166            value.get("tool_call_id").and_then(|v| v.as_str()),
1167            Some("tc1")
1168        );
1169
1170        let inner = value
1171            .get("content")
1172            .and_then(|v| v.as_str())
1173            .expect("content should remain a JSON string");
1174        assert!(inner.contains("generated screenshot"));
1175        assert!(inner.contains("1 attached image(s) could not be loaded"));
1176        assert!(!inner.contains("[IMAGE:"));
1177        assert!(!inner.contains("https://example.com/missing.png"));
1178    }
1179
1180    #[tokio::test]
1181    async fn prepare_messages_preserves_native_tool_json_with_mixed_images() {
1182        let temp = tempfile::tempdir().unwrap();
1183        let image_path = temp.path().join("mixed-native-tool-result.png");
1184        std::fs::write(
1185            &image_path,
1186            [0x89, b'P', b'N', b'G', b'\r', b'\n', 0x1a, b'\n'],
1187        )
1188        .unwrap();
1189
1190        let native_tool_content = serde_json::json!({
1191            "tool_call_id": "tc1",
1192            "content": format!(
1193                "generated [IMAGE:{}] and [IMAGE:https://example.com/missing.png]",
1194                image_path.display()
1195            ),
1196        })
1197        .to_string();
1198
1199        let prepared = prepare_messages_for_provider(
1200            &[ChatMessage::tool(native_tool_content)],
1201            &MultimodalConfig::default(),
1202        )
1203        .await
1204        .expect("valid native tool image should survive while bad ref is skipped");
1205
1206        assert!(prepared.contains_images);
1207        assert_eq!(prepared.messages.len(), 1);
1208
1209        let value: serde_json::Value = serde_json::from_str(&prepared.messages[0].content)
1210            .expect("native tool result must remain valid JSON");
1211        assert_eq!(
1212            value.get("tool_call_id").and_then(|v| v.as_str()),
1213            Some("tc1")
1214        );
1215
1216        let inner = value
1217            .get("content")
1218            .and_then(|v| v.as_str())
1219            .expect("content should remain a JSON string");
1220        assert!(inner.contains("generated"));
1221        assert!(inner.contains("data:image/png;base64,"));
1222        assert!(inner.contains("1 of 2 attached image(s) could not be loaded"));
1223        assert!(!inner.contains("mixed-native-tool-result.png"));
1224        assert!(!inner.contains("https://example.com/missing.png"));
1225    }
1226
1227    #[tokio::test]
1228    async fn prepare_messages_strips_stale_native_tool_result_images() {
1229        let temp = tempfile::tempdir().unwrap();
1230        let image_path = temp.path().join("stale-native-tool-result.png");
1231        std::fs::write(
1232            &image_path,
1233            [0x89, b'P', b'N', b'G', b'\r', b'\n', 0x1a, b'\n'],
1234        )
1235        .unwrap();
1236
1237        let native_tool_content = serde_json::json!({
1238            "tool_call_id": "tc1",
1239            "content": format!("generated screenshot [IMAGE:{}]", image_path.display().to_string()),
1240        })
1241        .to_string();
1242
1243        let messages = vec![
1244            ChatMessage::tool(native_tool_content),
1245            ChatMessage {
1246                role: "assistant".to_string(),
1247                content: "I generated the screenshot.".to_string(),
1248            },
1249            ChatMessage::user("What happened next?".to_string()),
1250        ];
1251
1252        let prepared = prepare_messages_for_provider(&messages, &MultimodalConfig::default())
1253            .await
1254            .expect("preparation should strip stale tool images without loading them");
1255
1256        assert!(
1257            !prepared.contains_images,
1258            "stale tool-result images should not keep the request in vision mode"
1259        );
1260
1261        let value: serde_json::Value = serde_json::from_str(&prepared.messages[0].content)
1262            .expect("stale native tool result should remain valid JSON");
1263        assert_eq!(
1264            value.get("tool_call_id").and_then(|v| v.as_str()),
1265            Some("tc1")
1266        );
1267
1268        let inner = value
1269            .get("content")
1270            .and_then(|v| v.as_str())
1271            .expect("content should remain a JSON string");
1272        assert!(inner.contains("generated screenshot"));
1273        assert!(!inner.contains("[IMAGE:"));
1274        assert!(!inner.contains("data:image"));
1275        assert!(!inner.contains("stale-native-tool-result.png"));
1276    }
1277
1278    #[tokio::test]
1279    async fn prepare_messages_strips_stale_prompt_tool_result_images() {
1280        let temp = tempfile::tempdir().unwrap();
1281        let image_path = temp.path().join("stale-prompt-tool-result.png");
1282        std::fs::write(
1283            &image_path,
1284            [0x89, b'P', b'N', b'G', b'\r', b'\n', 0x1a, b'\n'],
1285        )
1286        .unwrap();
1287
1288        let messages = vec![
1289            ChatMessage::user(format!(
1290                "[Tool results]\n<tool_result name=\"image_gen\">Generated [IMAGE:{}]</tool_result>",
1291                image_path.display()
1292            )),
1293            ChatMessage {
1294                role: "assistant".to_string(),
1295                content: "I generated the screenshot.".to_string(),
1296            },
1297            ChatMessage::user("Continue.".to_string()),
1298        ];
1299
1300        let prepared = prepare_messages_for_provider(&messages, &MultimodalConfig::default())
1301            .await
1302            .expect("preparation should strip stale prompt-mode tool images");
1303
1304        assert!(!prepared.contains_images);
1305        assert!(prepared.messages[0].content.contains("[Tool results]"));
1306        assert!(prepared.messages[0].content.contains("Generated"));
1307        assert!(!prepared.messages[0].content.contains("[IMAGE:"));
1308        assert!(!prepared.messages[0].content.contains("data:image"));
1309        assert!(
1310            !prepared.messages[0]
1311                .content
1312                .contains("stale-prompt-tool-result.png")
1313        );
1314    }
1315
1316    #[tokio::test]
1317    async fn prepare_messages_strips_stale_tool_image_while_normalizing_current_user_image() {
1318        let temp = tempfile::tempdir().unwrap();
1319        let stale_path = temp.path().join("stale-tool-result.png");
1320        let fresh_path = temp.path().join("fresh-user-image.png");
1321        let png = [0x89, b'P', b'N', b'G', b'\r', b'\n', 0x1a, b'\n'];
1322        std::fs::write(&stale_path, png).unwrap();
1323        std::fs::write(&fresh_path, png).unwrap();
1324
1325        let native_tool_content = serde_json::json!({
1326            "tool_call_id": "tc1",
1327            "content": format!("generated screenshot [IMAGE:{}]", stale_path.display().to_string()),
1328        })
1329        .to_string();
1330
1331        let messages = vec![
1332            ChatMessage::tool(native_tool_content),
1333            ChatMessage {
1334                role: "assistant".to_string(),
1335                content: "I generated the screenshot.".to_string(),
1336            },
1337            ChatMessage::user(format!(
1338                "Now inspect this [IMAGE:{}]",
1339                fresh_path.display().to_string()
1340            )),
1341        ];
1342
1343        let prepared = prepare_messages_for_provider(&messages, &MultimodalConfig::default())
1344            .await
1345            .expect("preparation should strip stale tool images and normalize current user image");
1346
1347        assert!(prepared.contains_images);
1348
1349        let value: serde_json::Value = serde_json::from_str(&prepared.messages[0].content)
1350            .expect("stale native tool result should remain valid JSON");
1351        let inner = value
1352            .get("content")
1353            .and_then(|v| v.as_str())
1354            .expect("content should remain a JSON string");
1355        assert!(inner.contains("generated screenshot"));
1356        assert!(!inner.contains("[IMAGE:"));
1357        assert!(!inner.contains("data:image"));
1358        assert!(!inner.contains("stale-tool-result.png"));
1359
1360        let (cleaned, refs) = parse_image_markers(&prepared.messages[2].content);
1361        assert_eq!(cleaned, "Now inspect this");
1362        assert_eq!(refs.len(), 1);
1363        assert!(refs[0].starts_with("data:image/png;base64,"));
1364        assert!(
1365            !prepared.messages[2]
1366                .content
1367                .contains("fresh-user-image.png")
1368        );
1369    }
1370
1371    #[test]
1372    fn count_image_markers_ignores_stale_tool_results() {
1373        let messages = vec![
1374            ChatMessage::tool("[IMAGE:/tmp/stale-tool.png]\nGenerated".to_string()),
1375            ChatMessage {
1376                role: "assistant".to_string(),
1377                content: "Done.".to_string(),
1378            },
1379            ChatMessage::user("Next question".to_string()),
1380        ];
1381
1382        assert_eq!(count_image_markers(&messages), 0);
1383
1384        let messages = vec![
1385            ChatMessage::user("Create an image".to_string()),
1386            ChatMessage::tool("[IMAGE:/tmp/latest-tool.png]\nGenerated".to_string()),
1387        ];
1388
1389        assert_eq!(count_image_markers(&messages), 1);
1390    }
1391
1392    #[tokio::test]
1393    async fn prepare_messages_trims_excess_images_from_older_messages() {
1394        // 3 messages, each with 1 image — max is 2.
1395        // The oldest message's image should be stripped.
1396        let messages = vec![
1397            ChatMessage::user("[IMAGE:/tmp/old.png]\nOld caption".to_string()),
1398            ChatMessage::user("[IMAGE:/tmp/mid.png]\nMid caption".to_string()),
1399            ChatMessage::user("[IMAGE:/tmp/new.png]\nNew caption".to_string()),
1400        ];
1401
1402        // Should not error — instead trims oldest.
1403        // (Will error on normalize_image_reference for the surviving images
1404        //  since /tmp/mid.png and /tmp/new.png don't exist, but the trimming
1405        //  itself should succeed.)
1406        let trimmed = trim_old_images(&messages, 2);
1407        assert_eq!(trimmed.len(), 3);
1408
1409        // Oldest message should have image stripped
1410        let (_, refs0) = parse_image_markers(&trimmed[0].content);
1411        assert!(refs0.is_empty(), "oldest image should be stripped");
1412        assert!(trimmed[0].content.contains("Old caption"));
1413
1414        // Newer messages keep their images
1415        let (_, refs1) = parse_image_markers(&trimmed[1].content);
1416        assert_eq!(refs1.len(), 1);
1417        let (_, refs2) = parse_image_markers(&trimmed[2].content);
1418        assert_eq!(refs2.len(), 1);
1419    }
1420
1421    #[test]
1422    fn trim_old_images_replaces_image_only_message() {
1423        // A message with only an image and no text should get a placeholder.
1424        let messages = vec![
1425            ChatMessage::user("[IMAGE:/tmp/old.png]".to_string()),
1426            ChatMessage::user("[IMAGE:/tmp/new.png]\nKeep this".to_string()),
1427        ];
1428
1429        let trimmed = trim_old_images(&messages, 1);
1430        assert_eq!(trimmed[0].content, "[image removed from history]");
1431        assert!(trimmed[1].content.contains("[IMAGE:/tmp/new.png]"));
1432    }
1433
1434    #[test]
1435    fn trim_old_images_multi_image_message_stripped_as_unit() {
1436        // A single message has 3 images. We need to drop 2 to reach max=1.
1437        // But trimming works at message granularity — the entire message gets
1438        // stripped (all 3 images removed), which over-trims to 0. The newest
1439        // message (text-only) is untouched.
1440        let messages = vec![
1441            ChatMessage::user(
1442                "[IMAGE:/tmp/a.png]\n[IMAGE:/tmp/b.png]\n[IMAGE:/tmp/c.png]\nThree pics"
1443                    .to_string(),
1444            ),
1445            ChatMessage::user("Just text, no images".to_string()),
1446        ];
1447
1448        let trimmed = trim_old_images(&messages, 1);
1449        assert_eq!(trimmed.len(), 2);
1450        // All images in the first message are gone, but text remains
1451        let (_, refs0) = parse_image_markers(&trimmed[0].content);
1452        assert!(refs0.is_empty());
1453        assert!(trimmed[0].content.contains("Three pics"));
1454        // Second message unchanged
1455        assert_eq!(trimmed[1].content, "Just text, no images");
1456    }
1457
1458    #[test]
1459    fn trim_old_images_skips_assistant_messages() {
1460        // Assistant messages with image markers should not be counted or stripped.
1461        let messages = vec![
1462            ChatMessage {
1463                role: "assistant".to_string(),
1464                content: "[IMAGE:/tmp/assistant.png]\nAssistant generated".to_string(),
1465            },
1466            ChatMessage::user("[IMAGE:/tmp/user1.png]\nFirst".to_string()),
1467            ChatMessage::user("[IMAGE:/tmp/user2.png]\nSecond".to_string()),
1468        ];
1469
1470        let trimmed = trim_old_images(&messages, 1);
1471        // Assistant message untouched (not counted toward limit)
1472        assert!(trimmed[0].content.contains("[IMAGE:/tmp/assistant.png]"));
1473        // Oldest user image stripped
1474        let (_, refs1) = parse_image_markers(&trimmed[1].content);
1475        assert!(refs1.is_empty());
1476        assert!(trimmed[1].content.contains("First"));
1477        // Newest user image kept
1478        let (_, refs2) = parse_image_markers(&trimmed[2].content);
1479        assert_eq!(refs2.len(), 1);
1480    }
1481
1482    #[test]
1483    fn trim_old_images_counts_latest_tool_messages() {
1484        let messages = vec![
1485            ChatMessage::user("[IMAGE:/tmp/user-old.png]\nOldest".to_string()),
1486            ChatMessage::tool("[IMAGE:/tmp/tool-new.png]\nGenerated".to_string()),
1487        ];
1488
1489        let trimmed = trim_old_images(&messages, 1);
1490        let (_, refs0) = parse_image_markers(&trimmed[0].content);
1491        assert!(refs0.is_empty(), "oldest user image should be stripped");
1492        assert!(trimmed[0].content.contains("Oldest"));
1493
1494        let (_, refs1) = parse_image_markers(&trimmed[1].content);
1495        assert_eq!(refs1.len(), 1);
1496    }
1497
1498    #[test]
1499    fn trim_old_images_no_trimming_when_under_limit() {
1500        let messages = vec![
1501            ChatMessage::user("[IMAGE:/tmp/a.png]\nCaption A".to_string()),
1502            ChatMessage::user("[IMAGE:/tmp/b.png]\nCaption B".to_string()),
1503        ];
1504
1505        let trimmed = trim_old_images(&messages, 5);
1506        // Nothing should change — both images are under the limit
1507        assert_eq!(trimmed[0].content, messages[0].content);
1508        assert_eq!(trimmed[1].content, messages[1].content);
1509    }
1510
1511    #[test]
1512    fn trim_old_images_no_trimming_when_exactly_at_limit() {
1513        let messages = vec![
1514            ChatMessage::user("[IMAGE:/tmp/a.png]\nA".to_string()),
1515            ChatMessage::user("[IMAGE:/tmp/b.png]\nB".to_string()),
1516        ];
1517
1518        let trimmed = trim_old_images(&messages, 2);
1519        assert_eq!(trimmed[0].content, messages[0].content);
1520        assert_eq!(trimmed[1].content, messages[1].content);
1521    }
1522
1523    #[test]
1524    fn trim_old_images_empty_messages() {
1525        let trimmed = trim_old_images(&[], 4);
1526        assert!(trimmed.is_empty());
1527    }
1528
1529    #[test]
1530    fn trim_old_images_interleaved_roles() {
1531        // Realistic conversation: user sends image, assistant replies, user sends
1532        // another image, etc. Only user messages should be candidates for trimming.
1533        let messages = vec![
1534            ChatMessage::user("[IMAGE:/tmp/1.png]\nLook at this".to_string()),
1535            ChatMessage {
1536                role: "assistant".to_string(),
1537                content: "I see a photo.".to_string(),
1538            },
1539            ChatMessage::user("[IMAGE:/tmp/2.png]\nWhat about this?".to_string()),
1540            ChatMessage {
1541                role: "assistant".to_string(),
1542                content: "That's a chart.".to_string(),
1543            },
1544            ChatMessage::user("[IMAGE:/tmp/3.png]\nAnd this one".to_string()),
1545        ];
1546
1547        let trimmed = trim_old_images(&messages, 2);
1548        assert_eq!(trimmed.len(), 5);
1549        // Oldest user image stripped
1550        let (_, refs0) = parse_image_markers(&trimmed[0].content);
1551        assert!(refs0.is_empty());
1552        assert!(trimmed[0].content.contains("Look at this"));
1553        // Assistant messages untouched
1554        assert_eq!(trimmed[1].content, "I see a photo.");
1555        assert_eq!(trimmed[3].content, "That's a chart.");
1556        // Two newest user images kept
1557        let (_, refs2) = parse_image_markers(&trimmed[2].content);
1558        assert_eq!(refs2.len(), 1);
1559        let (_, refs4) = parse_image_markers(&trimmed[4].content);
1560        assert_eq!(refs4.len(), 1);
1561    }
1562
1563    #[test]
1564    fn trim_old_images_strips_multiple_oldest_messages() {
1565        // 5 user images, max 1 — should strip the first 4 messages' images.
1566        let messages: Vec<ChatMessage> = (1..=5)
1567            .map(|i| ChatMessage::user(format!("[IMAGE:/tmp/{i}.png]\nCaption {i}")))
1568            .collect();
1569
1570        let trimmed = trim_old_images(&messages, 1);
1571        assert_eq!(trimmed.len(), 5);
1572        for (i, msg) in trimmed.iter().enumerate().take(4) {
1573            let (_, refs) = parse_image_markers(&msg.content);
1574            assert!(refs.is_empty(), "message {i} should have images stripped");
1575            assert!(msg.content.contains(&format!("Caption {}", i + 1)));
1576        }
1577        // Only the last message keeps its image
1578        let (_, refs_last) = parse_image_markers(&trimmed[4].content);
1579        assert_eq!(refs_last.len(), 1);
1580    }
1581
1582    #[tokio::test]
1583    async fn prepare_messages_trims_then_normalizes_surviving_images() {
1584        // End-to-end: 3 images, max 2. After trimming the oldest, the two
1585        // surviving images should be normalized (base64-encoded) successfully.
1586        let temp = tempfile::tempdir().unwrap();
1587        let mut paths = Vec::new();
1588        for name in ["old.png", "mid.png", "new.png"] {
1589            let p = temp.path().join(name);
1590            // Minimal valid PNG (1x1 white pixel)
1591            let png_data = [
1592                0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A, // PNG signature
1593                0x00, 0x00, 0x00, 0x0D, 0x49, 0x48, 0x44, 0x52, // IHDR chunk
1594                0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x08, 0x02, 0x00, 0x00, 0x00, 0x90,
1595                0x77, 0x53, 0xDE, // 1x1 RGB
1596                0x00, 0x00, 0x00, 0x0C, 0x49, 0x44, 0x41, 0x54, // IDAT chunk
1597                0x08, 0xD7, 0x63, 0xF8, 0xCF, 0xC0, 0x00, 0x00, 0x00, 0x02, 0x00, 0x01, 0xE2, 0x21,
1598                0xBC, 0x33, // IDAT data + CRC
1599                0x00, 0x00, 0x00, 0x00, 0x49, 0x45, 0x4E, 0x44, // IEND chunk
1600                0xAE, 0x42, 0x60, 0x82,
1601            ];
1602            std::fs::write(&p, png_data).unwrap();
1603            paths.push(p);
1604        }
1605
1606        let messages = vec![
1607            ChatMessage::user(format!("[IMAGE:{}]\nOld", paths[0].display().to_string())),
1608            ChatMessage::user(format!("[IMAGE:{}]\nMid", paths[1].display().to_string())),
1609            ChatMessage::user(format!("[IMAGE:{}]\nNew", paths[2].display().to_string())),
1610        ];
1611
1612        let config = MultimodalConfig {
1613            max_images: 2,
1614            max_image_size_mb: 5,
1615            allow_remote_fetch: false,
1616            ..Default::default()
1617        };
1618
1619        let result = prepare_messages_for_provider(&messages, &config)
1620            .await
1621            .expect("should succeed after trimming");
1622
1623        assert!(result.contains_images);
1624        assert_eq!(result.messages.len(), 3);
1625        // First message should have image stripped, text preserved
1626        assert!(!result.messages[0].content.contains("data:image"));
1627        assert!(result.messages[0].content.contains("Old"));
1628        // Second and third should have base64-encoded images
1629        assert!(result.messages[1].content.contains("data:image"));
1630        assert!(result.messages[2].content.contains("data:image"));
1631    }
1632
1633    #[tokio::test]
1634    async fn prepare_messages_skips_remote_url_when_disabled() {
1635        let messages = vec![ChatMessage::user(
1636            "Look [IMAGE:https://example.com/img.png]".to_string(),
1637        )];
1638
1639        let result = prepare_messages_for_provider(&messages, &MultimodalConfig::default())
1640            .await
1641            .expect("disabled remote image should be skipped");
1642
1643        assert!(!result.contains_images);
1644        assert_eq!(result.messages.len(), 1);
1645        assert!(result.messages[0].content.contains("Look"));
1646        assert!(
1647            result.messages[0]
1648                .content
1649                .contains("1 attached image(s) could not be loaded")
1650        );
1651        assert!(
1652            !result.messages[0]
1653                .content
1654                .contains("https://example.com/img.png")
1655        );
1656    }
1657
1658    #[tokio::test]
1659    async fn prepare_messages_skips_oversized_local_image() {
1660        let temp = tempfile::tempdir().unwrap();
1661        let image_path = temp.path().join("big.png");
1662
1663        let bytes = vec![0u8; 1024 * 1024 + 1];
1664        std::fs::write(&image_path, bytes).unwrap();
1665
1666        let messages = vec![ChatMessage::user(format!(
1667            "[IMAGE:{}]",
1668            image_path.display()
1669        ))];
1670        let config = MultimodalConfig {
1671            max_images: 4,
1672            max_image_size_mb: 1,
1673            allow_remote_fetch: false,
1674            ..Default::default()
1675        };
1676
1677        let result = prepare_messages_for_provider(&messages, &config)
1678            .await
1679            .expect("oversized local image should be skipped");
1680
1681        assert!(!result.contains_images);
1682        assert_eq!(result.messages.len(), 1);
1683        assert!(
1684            result.messages[0]
1685                .content
1686                .contains("1 attached image(s) could not be loaded")
1687        );
1688        assert!(
1689            !result.messages[0]
1690                .content
1691                .contains(image_path.to_string_lossy().as_ref())
1692        );
1693    }
1694
1695    #[tokio::test]
1696    async fn prepare_messages_keeps_successful_images_when_some_are_skipped() {
1697        let temp = tempfile::tempdir().unwrap();
1698        let image_path = temp.path().join("ok.png");
1699        std::fs::write(
1700            &image_path,
1701            [0x89, b'P', b'N', b'G', b'\r', b'\n', 0x1a, b'\n'],
1702        )
1703        .unwrap();
1704
1705        let messages = vec![ChatMessage::user(format!(
1706            "Look [IMAGE:{}] and [IMAGE:https://example.com/missing.png]",
1707            image_path.display()
1708        ))];
1709
1710        let result = prepare_messages_for_provider(&messages, &MultimodalConfig::default())
1711            .await
1712            .expect("valid local image should survive while remote image is skipped");
1713
1714        assert!(result.contains_images);
1715        assert!(
1716            result.messages[0]
1717                .content
1718                .contains("data:image/png;base64,")
1719        );
1720        assert!(
1721            result.messages[0]
1722                .content
1723                .contains("1 of 2 attached image(s) could not be loaded")
1724        );
1725        assert!(
1726            !result.messages[0]
1727                .content
1728                .contains("https://example.com/missing.png")
1729        );
1730    }
1731
1732    #[tokio::test]
1733    async fn skipped_images_do_not_consume_image_budget() {
1734        let temp = tempfile::tempdir().unwrap();
1735        let image_path = temp.path().join("older-valid.png");
1736        std::fs::write(
1737            &image_path,
1738            [0x89, b'P', b'N', b'G', b'\r', b'\n', 0x1a, b'\n'],
1739        )
1740        .unwrap();
1741
1742        let messages = vec![
1743            ChatMessage::user(format!(
1744                "Older valid image [IMAGE:{}]",
1745                image_path.display()
1746            )),
1747            ChatMessage::user(
1748                "Newer broken image [IMAGE:https://example.com/missing.png]".to_string(),
1749            ),
1750        ];
1751        let config = MultimodalConfig {
1752            max_images: 1,
1753            max_image_size_mb: 5,
1754            allow_remote_fetch: false,
1755            ..Default::default()
1756        };
1757
1758        let result = prepare_messages_for_provider(&messages, &config)
1759            .await
1760            .expect("broken image should not evict an older valid image");
1761
1762        assert!(result.contains_images);
1763        assert!(
1764            result.messages[0]
1765                .content
1766                .contains("data:image/png;base64,")
1767        );
1768        assert!(result.messages[1].content.contains("Newer broken image"));
1769        assert!(
1770            result.messages[1]
1771                .content
1772                .contains("1 attached image(s) could not be loaded")
1773        );
1774        assert!(
1775            !result.messages[1]
1776                .content
1777                .contains("https://example.com/missing.png")
1778        );
1779    }
1780
1781    #[test]
1782    fn extract_ollama_image_payload_supports_data_uris() {
1783        let payload = extract_ollama_image_payload("data:image/png;base64,abcd==")
1784            .expect("payload should be extracted");
1785        assert_eq!(payload, "abcd==");
1786    }
1787
1788    /// Stripping `[IMAGE:]` markers from history messages leaves only the text
1789    /// portion, which is the behaviour needed for non-vision model_providers.
1790    #[test]
1791    fn parse_image_markers_strips_markers_leaving_caption() {
1792        let input = "[IMAGE:/tmp/photo.jpg]\n\nDescribe this screenshot";
1793        let (cleaned, refs) = parse_image_markers(input);
1794        assert_eq!(cleaned, "Describe this screenshot");
1795        assert_eq!(refs.len(), 1);
1796        assert_eq!(refs[0], "/tmp/photo.jpg");
1797    }
1798
1799    /// An image-only message (no caption) should produce an empty string after
1800    /// marker stripping, so callers can drop it from history.
1801    #[test]
1802    fn parse_image_markers_image_only_message_becomes_empty() {
1803        let input = "[IMAGE:/tmp/photo.jpg]";
1804        let (cleaned, refs) = parse_image_markers(input);
1805        assert!(
1806            cleaned.is_empty(),
1807            "expected empty string, got: {cleaned:?}"
1808        );
1809        assert_eq!(refs.len(), 1);
1810    }
1811}