Skip to main content

zeroclaw_providers/
multimodal.rs

1use base64::{Engine as _, engine::general_purpose::STANDARD};
2use reqwest::Client;
3use std::collections::{HashMap, HashSet};
4use std::path::Path;
5use zeroclaw_api::model_provider::ChatMessage;
6use zeroclaw_config::schema::{MultimodalConfig, build_runtime_proxy_client_with_timeouts};
7
/// Literal opening of an inline image marker. A complete marker is this
/// prefix, a payload, and a closing `]`.
const IMAGE_MARKER_PREFIX: &str = "[IMAGE:";
// MIME types we will inline for vision providers. Deliberately excludes
// `image/bmp`: no major vision provider (Anthropic, OpenAI) accepts BMP, so
// inlining it would make the *entire* provider request fail rather than just
// dropping the one image. Rejecting it here instead surfaces a clean
// "could not be loaded" note while the request (and any accompanying text or
// metadata) still goes through. BMP is still detected by `detect_mime` so the
// skip is logged as an explicit unsupported-MIME event rather than "unknown".
const ALLOWED_IMAGE_MIME_TYPES: &[&str] = &["image/png", "image/jpeg", "image/webp", "image/gif"];
17
/// Per-path cache for resolved local image data URIs. Keyed by absolute
/// path; stores `(len, mtime)` for freshness checks (`(0, 0)` sentinel
/// = immutable upload). LRU evicts by both entry count and total bytes.
#[derive(Debug, Default)]
pub struct LocalImageCache {
    /// `path -> (len, mtime, data_uri)`.
    entries: HashMap<String, (u64, i64, String)>,
    /// Keys of `entries` in recency order: front is least recently used.
    order: std::collections::VecDeque<String>,
    /// Sum of `data_uri.len()` over every cached entry.
    bytes: usize,
}

/// Maximum number of data URIs kept in a [`LocalImageCache`].
const LOCAL_IMAGE_CACHE_MAX_ENTRIES: usize = 32;
/// Maximum combined length, in bytes, of all cached data URIs (64 MiB).
const LOCAL_IMAGE_CACHE_MAX_BYTES: usize = 64 * 1024 * 1024;

impl LocalImageCache {
    /// Creates an empty cache.
    pub fn new() -> Self {
        Self::default()
    }

    /// Returns the cached data URI for `path` when the entry is still valid.
    ///
    /// An entry is valid when it was stored with the `(0, 0)` sentinel or
    /// when its stored `(len, mtime)` equals the pair passed in. A hit marks
    /// the entry as most recently used; a stale entry is left in place and
    /// reported as a miss.
    fn get(&mut self, path: &str, len: u64, mtime: i64) -> Option<&str> {
        let (cached_len, cached_mtime) = self
            .entries
            .get(path)
            .map(|(cached_len, cached_mtime, _)| (*cached_len, *cached_mtime))?;
        let immutable = cached_len == 0 && cached_mtime == 0;
        let fresh = cached_len == len && cached_mtime == mtime;
        if !immutable && !fresh {
            return None;
        }
        self.touch(path);
        self.entries.get(path).map(|(_, _, uri)| uri.as_str())
    }

    /// Stores `data_uri` for `path`, replacing any previous entry, then
    /// evicts least-recently-used entries until both limits hold again.
    ///
    /// A data URI longer than [`LOCAL_IMAGE_CACHE_MAX_BYTES`] can never fit,
    /// so it is not stored at all. Storing it first would make the eviction
    /// loop discard every other entry before discarding the new one too.
    /// The previous entry for `path` is still removed in that case so a
    /// stale URI is never served.
    fn insert(&mut self, path: String, len: u64, mtime: i64, data_uri: String) {
        self.forget(&path);
        if data_uri.len() > LOCAL_IMAGE_CACHE_MAX_BYTES {
            return;
        }

        self.bytes += data_uri.len();
        self.entries.insert(path.clone(), (len, mtime, data_uri));
        self.order.push_back(path);

        while self.entries.len() > LOCAL_IMAGE_CACHE_MAX_ENTRIES
            || self.bytes > LOCAL_IMAGE_CACHE_MAX_BYTES
        {
            let Some(victim) = self.order.pop_front() else {
                break;
            };
            if let Some((_, _, uri)) = self.entries.remove(&victim) {
                self.bytes = self.bytes.saturating_sub(uri.len());
            }
        }
    }

    /// Number of cached entries.
    pub fn len(&self) -> usize {
        self.entries.len()
    }

    /// `true` when nothing is cached.
    pub fn is_empty(&self) -> bool {
        self.entries.is_empty()
    }

    /// Moves `path` to the most-recently-used end of the recency queue.
    fn touch(&mut self, path: &str) {
        if let Some(pos) = self.order.iter().position(|p| p == path) {
            if let Some(key) = self.order.remove(pos) {
                self.order.push_back(key);
            }
        }
    }

    /// Removes `path` from the map and the recency queue, keeping the byte
    /// total in sync. Does nothing when `path` is not cached.
    fn forget(&mut self, path: &str) {
        if let Some((_, _, old)) = self.entries.remove(path) {
            self.bytes = self.bytes.saturating_sub(old.len());
            if let Some(pos) = self.order.iter().position(|p| p == path) {
                self.order.remove(pos);
            }
        }
    }
}
80
/// Output of preparing a conversation for a provider call.
#[derive(Debug, Clone)]
pub struct PreparedMessages {
    /// The prepared messages, in the same order as the input history.
    pub messages: Vec<ChatMessage>,
    /// `true` when `messages` still carries at least one counted image
    /// marker after preparation.
    pub contains_images: bool,
}
86
/// Reasons an image reference could not be used. In every variant `input`
/// is the reference the error is about.
#[derive(Debug, thiserror::Error)]
pub enum MultimodalError {
    /// More image markers were found than `max_images` allows.
    #[error("multimodal image limit exceeded: max_images={max_images}, found={found}")]
    TooManyImages { max_images: usize, found: usize },

    /// The image is `size_bytes` long, which exceeds `max_bytes`.
    #[error(
        "multimodal image size limit exceeded for '{input}': {size_bytes} bytes > {max_bytes} bytes"
    )]
    ImageTooLarge {
        input: String,
        size_bytes: usize,
        max_bytes: usize,
    },

    /// The image's MIME type (`mime`) is not an allowed one.
    #[error("multimodal image MIME type is not allowed for '{input}': {mime}")]
    UnsupportedMime { input: String, mime: String },

    /// The reference is remote but remote fetching is disabled.
    #[error("multimodal remote image fetch is disabled for '{input}'")]
    RemoteFetchDisabled { input: String },

    /// The image source does not exist or cannot be read.
    #[error("multimodal image source not found or unreadable: '{input}'")]
    ImageSourceNotFound { input: String },

    /// The marker payload is malformed; `reason` says why.
    #[error("invalid multimodal image marker '{input}': {reason}")]
    InvalidMarker { input: String, reason: String },

    /// Downloading a remote image failed; `reason` says why.
    #[error("failed to download remote image '{input}': {reason}")]
    RemoteFetchFailed { input: String, reason: String },

    /// Reading a local image failed; `reason` says why.
    #[error("failed to read local image '{input}': {reason}")]
    LocalReadFailed { input: String, reason: String },
}
119
120/// Returns true for payloads that are plausibly loadable image references:
121/// absolute filesystem paths, `http(s)://` URLs, or base64 `data:` URIs.
122/// Placeholder-style payloads like `...`, `<path>`, or `example.png` fail
123/// this check and are left as literal text by [`parse_image_markers`], so
124/// illustrative markdown in a conversation does not trigger loader errors.
125fn is_loadable_image_reference(candidate: &str) -> bool {
126    candidate.starts_with('/')
127        || candidate.starts_with("http://")
128        || candidate.starts_with("https://")
129        || candidate.starts_with("data:")
130        || is_windows_path(candidate)
131        || is_windows_unc_path(candidate)
132}
133
/// Detects a Windows drive-rooted absolute path such as `C:\…` or `D:/…`:
/// one ASCII letter, a colon, then either slash flavour.
fn is_windows_path(candidate: &str) -> bool {
    let mut chars = candidate.chars();
    // Tuple fields are evaluated left to right, so this reads the first
    // three characters in order.
    matches!(
        (chars.next(), chars.next(), chars.next()),
        (Some(drive), Some(':'), Some('\\' | '/')) if drive.is_ascii_alphabetic()
    )
}
151
/// Detects a plain Windows UNC share path of the form `\\server\share\…`.
///
/// `image_info` emits these after unwrapping the verbatim-UNC prefix
/// (`\\?\UNC\…`) that `canonicalize` produces on Windows. Such a path is
/// neither `/`-rooted nor a drive path, so without this check
/// [`is_loadable_image_reference`] would reject it and
/// [`parse_image_markers`] would leave the marker as literal text.
///
/// The path must start with two backslashes, followed by a non-empty
/// server name, one separator (`\` or `/`), and a non-empty remainder.
/// Verbatim and device prefixes (`\\?\…`, `\\.\…`) are rejected because
/// they are not plain shares.
fn is_windows_unc_path(candidate: &str) -> bool {
    match candidate.strip_prefix(r"\\") {
        Some(rest) if !rest.starts_with(['?', '.']) => rest
            .split_once(['\\', '/'])
            .is_some_and(|(server, share)| !server.is_empty() && !share.is_empty()),
        _ => false,
    }
}
174
/// Repairs a marker payload that was line-wrapped, e.g. when a long path in
/// a terminal log was broken across rows with leading indentation.
///
/// Every `\n` / `\r` is removed together with the whitespace that directly
/// follows it, and the result is trimmed at both ends. Whitespace elsewhere
/// is kept, so paths containing spaces survive. Legitimate paths never
/// contain newlines, so only corrupted markers are altered.
fn collapse_wrapped_marker(raw: &str) -> String {
    // Splitting on either line-break character yields the wrapped rows.
    // Every row after the first began right after a break, so its leading
    // indentation is part of the wrapping and is dropped.
    let mut rows = raw.split(['\n', '\r']);
    let mut joined = String::with_capacity(raw.len());

    if let Some(first_row) = rows.next() {
        joined.push_str(first_row);
    }
    for continuation in rows {
        joined.push_str(continuation.trim_start());
    }

    joined.trim().to_string()
}
202
203pub fn parse_image_markers(content: &str) -> (String, Vec<String>) {
204    let mut refs = Vec::new();
205    let mut cleaned = String::with_capacity(content.len());
206    let mut cursor = 0usize;
207
208    while let Some(rel_start) = content[cursor..].find(IMAGE_MARKER_PREFIX) {
209        let start = cursor + rel_start;
210        cleaned.push_str(&content[cursor..start]);
211
212        let marker_start = start + IMAGE_MARKER_PREFIX.len();
213        let Some(rel_end) = content[marker_start..].find(']') else {
214            cleaned.push_str(&content[start..]);
215            cursor = content.len();
216            break;
217        };
218
219        let end = marker_start + rel_end;
220        let candidate = collapse_wrapped_marker(&content[marker_start..end]);
221
222        if candidate.is_empty() || !is_loadable_image_reference(&candidate) {
223            // Preserve the original marker text (placeholders like
224            // `[IMAGE:...]` or `[IMAGE:<path>]` should survive as prose
225            // rather than triggering a loader error).
226            cleaned.push_str(&content[start..=end]);
227        } else {
228            refs.push(candidate);
229        }
230
231        cursor = end + 1;
232    }
233
234    if cursor < content.len() {
235        cleaned.push_str(&content[cursor..]);
236    }
237
238    (cleaned.trim().to_string(), refs)
239}
240
241pub fn count_image_markers(messages: &[ChatMessage]) -> usize {
242    let latest_tool_indices = latest_tool_result_indices(messages);
243    count_image_markers_with_latest_tool_results(messages, &latest_tool_indices)
244}
245
246fn count_image_markers_with_latest_tool_results(
247    messages: &[ChatMessage],
248    latest_tool_result_indices: &HashSet<usize>,
249) -> usize {
250    messages
251        .iter()
252        .enumerate()
253        .filter(|(index, message)| {
254            should_normalize_message_images(*index, message, latest_tool_result_indices)
255        })
256        .map(|(_, message)| parse_image_markers(&message.content).1.len())
257        .sum()
258}
259
260pub fn contains_image_markers(messages: &[ChatMessage]) -> bool {
261    count_image_markers(messages) > 0
262}
263
264/// Count image markers that originate from genuine **user** messages (i.e.
265/// inbound attachments), excluding tool-result carriers (`role == "tool"` and
266/// `[Tool results]` user messages).
267///
268/// Callers use this to distinguish "the user sent an image we cannot see"
269/// (which should surface a user-facing capability error so the attachment is
270/// not silently ignored) from "an image marker arrived only via a tool result"
271/// (e.g. `image_info`/`screenshot`/`image_gen`), which can degrade to text-only
272/// on a non-vision provider without misleading the user.
273pub fn count_user_image_markers(messages: &[ChatMessage]) -> usize {
274    messages
275        .iter()
276        .filter(|message| message.role == "user" && !is_prompt_tool_result_message(message))
277        .map(|message| parse_image_markers(&message.content).1.len())
278        .sum()
279}
280
281/// Replace media markers (`[IMAGE:...]`, `[PHOTO:...]`, `[DOCUMENT:...]`,
282/// `[FILE:...]`, `[VIDEO:...]`, `[VOICE:...]`, `[AUDIO:...]`) with
283/// `[media attachment]`. Match is case-insensitive to align with the channel
284/// attachment parsers, which all uppercase the kind before comparing
285/// (`crates/zeroclaw-channels/src/util.rs::ATTACHMENT_KINDS`,
286/// `telegram.rs`, `discord.rs`, `qq.rs`, `whatsapp_web.rs`).
287///
288/// Use before passing user-facing text to auxiliary `chat_with_system` calls
289/// (intent classification, summarization, delegation) so that local file
290/// paths from inbound channels do not leak to the upstream provider — the
291/// upstream API would otherwise receive a filesystem path as `image_url.url`
292/// and reject the request.
293///
294/// Auxiliary calls do not need to *see* the media content; they only route
295/// or summarize, so the placeholder is sufficient. The main agent loop
296/// continues to call `prepare_messages_for_provider` for full normalization.
297pub fn strip_media_markers(text: &str) -> String {
298    static RE: std::sync::LazyLock<regex::Regex> = std::sync::LazyLock::new(|| {
299        regex::Regex::new(r"(?i)\[(?:IMAGE|PHOTO|DOCUMENT|FILE|VIDEO|VOICE|AUDIO):[^\]]*\]")
300            .unwrap()
301    });
302    RE.replace_all(text, "[media attachment]").into_owned()
303}
304
/// Extracts the raw payload from an image reference.
///
/// For a `data:` URI the payload is everything after the first comma; a
/// `data:` URI without a comma yields `None`. Any other reference is used
/// as-is. The payload is trimmed, and an empty result yields `None`.
pub fn extract_ollama_image_payload(image_ref: &str) -> Option<String> {
    let payload = if image_ref.starts_with("data:") {
        let (_header, body) = image_ref.split_once(',')?;
        body
    } else {
        image_ref
    };

    let payload = payload.trim();
    (!payload.is_empty()).then(|| payload.to_string())
}
319
320fn is_prompt_tool_result_message(message: &ChatMessage) -> bool {
321    message.role == "user" && message.content.trim_start().starts_with("[Tool results]")
322}
323
324fn is_tool_result_carrier(message: &ChatMessage) -> bool {
325    message.role == "tool" || is_prompt_tool_result_message(message)
326}
327
328fn latest_tool_result_indices(messages: &[ChatMessage]) -> HashSet<usize> {
329    let mut indices = HashSet::new();
330    let Some((last_index, last_message)) = messages.iter().enumerate().next_back() else {
331        return indices;
332    };
333
334    if is_prompt_tool_result_message(last_message) {
335        indices.insert(last_index);
336        return indices;
337    }
338
339    if last_message.role == "tool" {
340        for (index, message) in messages.iter().enumerate().rev() {
341            if message.role != "tool" {
342                break;
343            }
344            indices.insert(index);
345        }
346    }
347
348    indices
349}
350
351fn should_normalize_message_images(
352    index: usize,
353    message: &ChatMessage,
354    latest_tool_result_indices: &HashSet<usize>,
355) -> bool {
356    if is_tool_result_carrier(message) {
357        return latest_tool_result_indices.contains(&index);
358    }
359
360    message.role == "user"
361}
362
363fn stripped_image_marker_text(content: &str) -> String {
364    let (cleaned, refs) = parse_image_markers(content);
365    if refs.is_empty() {
366        return content.to_string();
367    }
368
369    if cleaned.trim().is_empty() {
370        "[image removed from history]".to_string()
371    } else {
372        cleaned
373    }
374}
375
376fn strip_tool_result_image_markers(message: &ChatMessage) -> ChatMessage {
377    if !message.content.contains(IMAGE_MARKER_PREFIX) {
378        return message.clone();
379    }
380
381    if message.role == "tool"
382        && let Ok(serde_json::Value::Object(mut obj)) =
383            serde_json::from_str::<serde_json::Value>(&message.content)
384        && let Some(serde_json::Value::String(inner)) = obj.get("content").cloned()
385    {
386        let stripped = stripped_image_marker_text(&inner);
387        if stripped == inner {
388            return message.clone();
389        }
390
391        obj.insert("content".to_string(), serde_json::Value::String(stripped));
392        return ChatMessage {
393            role: message.role.clone(),
394            content: serde_json::Value::Object(obj).to_string(),
395        };
396    }
397
398    ChatMessage {
399        role: message.role.clone(),
400        content: stripped_image_marker_text(&message.content),
401    }
402}
403
404fn replay_message_without_stale_tool_images(
405    index: usize,
406    message: &ChatMessage,
407    latest_tool_result_indices: &HashSet<usize>,
408) -> ChatMessage {
409    if is_tool_result_carrier(message) && !latest_tool_result_indices.contains(&index) {
410        strip_tool_result_image_markers(message)
411    } else {
412        message.clone()
413    }
414}
415
416/// Attempt to normalize image markers inside a native tool-result JSON
417/// payload produced by `NativeToolDispatcher::to_provider_messages`. On
418/// success, returns the reserialized JSON string with the inner `content`
419/// field rewritten to inline `[IMAGE:data:…]` markers (data URIs). Returns
420/// `Ok(None)` when the payload is not a JSON object with a string `content`
421/// field, when the inner content has no normalizable markers, or when no
422/// rewriting is needed — letting the caller fall through to the existing
423/// plain-text path. The returned JSON preserves `tool_call_id` and any
424/// other top-level fields so downstream native adapters
425/// (e.g. `OpenAiCompatibleProvider::convert_messages_for_native`) can keep
426/// recovering the tool-call linkage via `serde_json::from_str`.
427async fn normalize_native_tool_result_json(
428    content: &str,
429    config: &MultimodalConfig,
430    max_bytes: usize,
431    remote_client: &Client,
432    ctx: &ImageNormalizeCtx<'_>,
433    cache: Option<&mut LocalImageCache>,
434) -> Option<(String, bool)> {
435    let Ok(serde_json::Value::Object(mut obj)) = serde_json::from_str::<serde_json::Value>(content)
436    else {
437        return None;
438    };
439
440    let Some(serde_json::Value::String(inner)) = obj.get("content").cloned() else {
441        return None;
442    };
443
444    let (cleaned_text, refs) = parse_image_markers(&inner);
445    if refs.is_empty() {
446        return None;
447    }
448
449    let normalized =
450        normalize_image_references(&refs, config, max_bytes, remote_client, ctx, cache).await;
451    let new_inner = compose_multimodal_content(
452        &cleaned_text,
453        &normalized.data_uris,
454        normalized.skipped_count,
455        refs.len(),
456    );
457    obj.insert("content".to_string(), serde_json::Value::String(new_inner));
458
459    Some((
460        serde_json::Value::Object(obj).to_string(),
461        !normalized.data_uris.is_empty(),
462    ))
463}
464
465pub async fn prepare_messages_for_provider(
466    messages: &[ChatMessage],
467    config: &MultimodalConfig,
468) -> anyhow::Result<PreparedMessages> {
469    prepare_messages_inner(messages, config, None).await
470}
471
472/// Like [`prepare_messages_for_provider`] but reuses a [`LocalImageCache`]
473/// across calls so each unique local image file is read from disk at most
474/// once per session (or once per modification for mutable files).
475pub async fn prepare_messages_for_provider_cached(
476    messages: &[ChatMessage],
477    config: &MultimodalConfig,
478    cache: &mut LocalImageCache,
479) -> anyhow::Result<PreparedMessages> {
480    prepare_messages_inner(messages, config, Some(cache)).await
481}
482
483async fn prepare_messages_inner(
484    messages: &[ChatMessage],
485    config: &MultimodalConfig,
486    mut cache: Option<&mut LocalImageCache>,
487) -> anyhow::Result<PreparedMessages> {
488    let (max_images, max_image_size_mb) = config.effective_limits();
489    let max_bytes = max_image_size_mb.saturating_mul(1024 * 1024);
490
491    let latest_tool_indices = latest_tool_result_indices(messages);
492    let total_images = count_image_markers_with_latest_tool_results(messages, &latest_tool_indices);
493
494    if total_images == 0 {
495        return Ok(PreparedMessages {
496            messages: messages
497                .iter()
498                .enumerate()
499                .map(|(index, message)| {
500                    replay_message_without_stale_tool_images(index, message, &latest_tool_indices)
501                })
502                .collect(),
503            contains_images: false,
504        });
505    }
506
507    // When image count exceeds the limit, strip markers from oldest messages
508    // first so that the most recent (most relevant) images survive. This
509    // prevents conversations from becoming permanently stuck once the
510    // cumulative image count crosses the threshold.
511    let trimmed = if total_images > max_images {
512        ::zeroclaw_log::record!(
513            WARN,
514            ::zeroclaw_log::Event::new(module_path!(), ::zeroclaw_log::Action::Note)
515                .with_outcome(::zeroclaw_log::EventOutcome::Unknown)
516                .with_attrs(::serde_json::json!({
517                    "total_images": total_images,
518                    "max_images": max_images,
519                    "trimmed_to": max_images,
520                })),
521            "multimodal: trimming oldest images — conversation exceeds image limit"
522        );
523        trim_old_images(messages, max_images)
524    } else {
525        messages.to_vec()
526    };
527
528    let remote_client = build_runtime_proxy_client_with_timeouts("model_provider.ollama", 30, 10);
529    let latest_tool_indices = latest_tool_result_indices(&trimmed);
530
531    let mut normalized_messages = Vec::with_capacity(messages.len());
532    let mut has_successful_images = false;
533    for (index, message) in messages.iter().enumerate() {
534        if !should_normalize_message_images(index, message, &latest_tool_indices) {
535            normalized_messages.push(replay_message_without_stale_tool_images(
536                index,
537                message,
538                &latest_tool_indices,
539            ));
540            continue;
541        }
542
543        // Native tool dispatchers wrap tool results as a JSON object
544        // (`{"tool_call_id":"…","content":"…"}`) so that provider adapters
545        // can recover `tool_call_id` via `serde_json::from_str` on
546        // `message.content`. Treating that JSON blob as plain text would
547        // strip markers out of the `content` field and append the data URI
548        // outside the JSON object, breaking the native tool-result contract
549        // and dropping `tool_call_id`. When we recognise that shape,
550        // normalize only the inner `content` string and reserialize the
551        // JSON so adapters keep seeing the structure they expect. Falls
552        // through to the plain-text path for non-JSON tool messages.
553        if message.role == "tool"
554            && let Some((prepared, contains_images)) = normalize_native_tool_result_json(
555                &message.content,
556                config,
557                max_bytes,
558                &remote_client,
559                &ImageNormalizeCtx {
560                    message_index: index,
561                    role: &message.role,
562                },
563                cache.as_deref_mut(),
564            )
565            .await
566        {
567            normalized_messages.push(ChatMessage {
568                role: message.role.clone(),
569                content: prepared,
570            });
571            has_successful_images |= contains_images;
572            continue;
573        }
574
575        let (cleaned_text, refs) = parse_image_markers(&message.content);
576        if refs.is_empty() {
577            normalized_messages.push(message.clone());
578            continue;
579        }
580
581        let normalized = normalize_image_references(
582            &refs,
583            config,
584            max_bytes,
585            &remote_client,
586            &ImageNormalizeCtx {
587                message_index: index,
588                role: &message.role,
589            },
590            cache.as_deref_mut(),
591        )
592        .await;
593        let content = compose_multimodal_content(
594            &cleaned_text,
595            &normalized.data_uris,
596            normalized.skipped_count,
597            refs.len(),
598        );
599        has_successful_images |= !normalized.data_uris.is_empty();
600        normalized_messages.push(ChatMessage {
601            role: message.role.clone(),
602            content,
603        });
604    }
605
606    // Apply age-based trimming when configured: strip images from user messages
607    // older than `max_image_turns` turns back from the end of history.
608    // `max_image_turns == 0` means disabled — no age trimming.
609    let age_trimmed = if config.max_image_turns > 0 {
610        let before = count_image_markers(&normalized_messages);
611        let trimmed = trim_images_by_age(&normalized_messages, config.max_image_turns);
612        let after = count_image_markers(&trimmed);
613        if after < before {
614            ::zeroclaw_log::record!(
615                INFO,
616                ::zeroclaw_log::Event::new(module_path!(), ::zeroclaw_log::Action::Note)
617                    .with_attrs(::serde_json::json!({
618                        "max_image_turns": config.max_image_turns,
619                        "images_before": before,
620                        "images_after": after,
621                        "images_dropped": before - after,
622                    })),
623                "multimodal: age-trimmed old images from conversation history"
624            );
625        }
626        trimmed
627    } else {
628        normalized_messages
629    };
630
631    // Apply the per-request image cap after normalization so failed image refs
632    // do not consume budget and evict older images that could still be sent.
633    let capped_messages = if has_successful_images && count_image_markers(&age_trimmed) > max_images
634    {
635        ::zeroclaw_log::record!(
636            WARN,
637            ::zeroclaw_log::Event::new(module_path!(), ::zeroclaw_log::Action::Note)
638                .with_outcome(::zeroclaw_log::EventOutcome::Unknown)
639                .with_attrs(::serde_json::json!({
640                    "images_after_normalization": count_image_markers(&age_trimmed),
641                    "max_images": max_images,
642                })),
643            "multimodal: post-normalization image cap exceeded — trimming oldest images"
644        );
645        trim_old_images(&age_trimmed, max_images)
646    } else {
647        age_trimmed
648    };
649
650    Ok(PreparedMessages {
651        contains_images: count_image_markers(&capped_messages) > 0,
652        messages: capped_messages,
653    })
654}
655/// Strip images from user messages that are more than `max_turns` turns back
656/// from the end of `messages`.  A "turn" here is counted as a user-role
657/// message, so `max_turns = 2` keeps images in the two most recent user
658/// messages and strips them from all earlier ones.  Tool-result images are
659/// handled by the stale-tool-result mechanism and are left untouched.
660fn trim_images_by_age(messages: &[ChatMessage], max_turns: usize) -> Vec<ChatMessage> {
661    // Count user messages from the end to find the cutoff index.
662    let mut user_turn_count = 0usize;
663    let mut cutoff = 0usize; // messages at index < cutoff are "too old"
664    for (i, m) in messages.iter().enumerate().rev() {
665        if m.role == "user" {
666            user_turn_count += 1;
667            if user_turn_count > max_turns {
668                // Everything up to and including this index is too old.
669                cutoff = i + 1;
670                break;
671            }
672        }
673    }
674
675    if cutoff == 0 {
676        return messages.to_vec();
677    }
678
679    messages
680        .iter()
681        .enumerate()
682        .map(|(i, m)| {
683            if i < cutoff && m.role == "user" {
684                let (cleaned, refs) = parse_image_markers(&m.content);
685                if refs.is_empty() {
686                    return m.clone();
687                }
688                let text = if cleaned.trim().is_empty() {
689                    "[image removed from history]".to_string()
690                } else {
691                    cleaned
692                };
693                ChatMessage {
694                    role: m.role.clone(),
695                    content: text,
696                }
697            } else {
698                m.clone()
699            }
700        })
701        .collect()
702}
703
704/// Strip image markers from older messages (oldest first) until total image
705/// count is within `max_images`. Keeps the text content of each message.
706fn trim_old_images(messages: &[ChatMessage], max_images: usize) -> Vec<ChatMessage> {
707    let latest_tool_indices = latest_tool_result_indices(messages);
708    // Find which messages (by index) contain images, oldest first.
709    let image_positions: Vec<(usize, usize)> = messages
710        .iter()
711        .enumerate()
712        .filter(|(index, message)| {
713            should_normalize_message_images(*index, message, &latest_tool_indices)
714        })
715        .filter_map(|(i, m)| {
716            let count = parse_image_markers(&m.content).1.len();
717            if count > 0 { Some((i, count)) } else { None }
718        })
719        .collect();
720
721    // Determine how many images to drop (from the oldest messages).
722    let total: usize = image_positions.iter().map(|(_, c)| c).sum();
723    let mut to_drop = total.saturating_sub(max_images);
724
725    // Collect indices of messages whose images should be stripped.
726    let mut strip_indices = std::collections::HashSet::new();
727    for &(idx, count) in &image_positions {
728        if to_drop == 0 {
729            break;
730        }
731        strip_indices.insert(idx);
732        to_drop = to_drop.saturating_sub(count);
733    }
734
735    messages
736        .iter()
737        .enumerate()
738        .map(|(i, m)| {
739            if strip_indices.contains(&i) {
740                let (cleaned, _) = parse_image_markers(&m.content);
741                let text = if cleaned.trim().is_empty() {
742                    "[image removed from history]".to_string()
743                } else {
744                    cleaned
745                };
746                ChatMessage {
747                    role: m.role.clone(),
748                    content: text,
749                }
750            } else {
751                replay_message_without_stale_tool_images(i, m, &latest_tool_indices)
752            }
753        })
754        .collect()
755}
756
757fn compose_multimodal_message(text: &str, data_uris: &[String]) -> String {
758    let mut content = String::new();
759    let trimmed = text.trim();
760
761    if !trimmed.is_empty() {
762        content.push_str(trimmed);
763        content.push_str("\n\n");
764    }
765
766    for (index, data_uri) in data_uris.iter().enumerate() {
767        if index > 0 {
768            content.push('\n');
769        }
770        content.push_str(IMAGE_MARKER_PREFIX);
771        content.push_str(data_uri);
772        content.push(']');
773    }
774
775    content
776}
777
/// Outcome of normalizing the image references of one message.
struct NormalizedImageReferences {
    /// Data URIs of the references that were normalized successfully, in
    /// input order.
    data_uris: Vec<String>,
    /// Number of references that failed to normalize and were skipped.
    skipped_count: usize,
}
782
/// Context attached to image-skip log events so callers can be identified.
/// It is reported as the `message_index` and `message_role` log attributes.
struct ImageNormalizeCtx<'a> {
    /// Zero-based index of this message in the conversation history.
    message_index: usize,
    /// Role of the message containing the image reference. A `"tool"` role
    /// is also used to pick the log severity for a skipped reference.
    role: &'a str,
}
790
/// Resolves every reference in `refs` to a data URI, skipping those that fail.
///
/// Each reference is handed to `normalize_image_reference`. Successes are
/// collected in input order; a failure never aborts the batch — it bumps
/// `skipped_count` and emits one log event describing the failure, tagged
/// with the message index and role from `ctx`.
async fn normalize_image_references(
    refs: &[String],
    config: &MultimodalConfig,
    max_bytes: usize,
    remote_client: &Client,
    ctx: &ImageNormalizeCtx<'_>,
    mut cache: Option<&mut LocalImageCache>,
) -> NormalizedImageReferences {
    let mut data_uris = Vec::with_capacity(refs.len());
    let mut skipped_count = 0usize;

    for reference in refs {
        match normalize_image_reference(
            reference,
            config,
            max_bytes,
            remote_client,
            // Reborrow so the same `&mut` cache can be lent out on every iteration.
            cache.as_deref_mut(),
        )
        .await
        {
            Ok(data_uri) => data_uris.push(data_uri),
            Err(error) => {
                skipped_count += 1;
                let error_reason = multimodal_error_reason(&error);
                // Truncate the raw reference so we don't dump a full base64
                // payload into the log, but keep enough to identify the source.
                let marker_preview: String = reference.chars().take(120).collect();
                let error_kind = multimodal_error_kind(&error);
                let attrs = ::serde_json::json!({
                    "message_index": ctx.message_index,
                    "message_role": ctx.role,
                    "source_kind": image_reference_kind(reference),
                    "error_kind": error_kind,
                    "reason": error_reason.as_deref().unwrap_or(""),
                    "marker_preview": marker_preview,
                });
                // Severity rules:
                //   - For inbound user attachments, any failure is a real
                //     loss the operator cares about → WARN.
                //   - For tool-result content, marker-looking strings often
                //     come from tool output that just happened to contain
                //     `[IMAGE:...]` patterns (e.g. an agent reading a test
                //     fixture, a code search hitting an assertion, log
                //     snippets). Treat best-effort recoverable failures as
                //     DEBUG so they stop drowning real signal. Keep WARN
                //     only for configuration/limit problems that the
                //     operator can actually act on.
                let is_tool_role = ctx.role == "tool";
                let is_recoverable_load_failure = matches!(
                    error_kind,
                    "image_source_not_found"
                        | "local_read_failed"
                        | "remote_fetch_failed"
                        | "invalid_marker"
                );
                if is_tool_role && is_recoverable_load_failure {
                    ::zeroclaw_log::record!(
                        DEBUG,
                        ::zeroclaw_log::Event::new(module_path!(), ::zeroclaw_log::Action::Note)
                            .with_attrs(attrs),
                        "skipping multimodal marker in tool result (likely not a real attachment)"
                    );
                } else {
                    ::zeroclaw_log::record!(
                        WARN,
                        ::zeroclaw_log::Event::new(module_path!(), ::zeroclaw_log::Action::Note)
                            .with_outcome(::zeroclaw_log::EventOutcome::Unknown)
                            .with_attrs(attrs),
                        "skipping multimodal image that could not be loaded"
                    );
                }
            }
        }
    }

    NormalizedImageReferences {
        data_uris,
        skipped_count,
    }
}
872
873fn compose_multimodal_content(
874    text: &str,
875    data_uris: &[String],
876    skipped_count: usize,
877    total_refs: usize,
878) -> String {
879    if skipped_count == 0 {
880        return compose_multimodal_message(text, data_uris);
881    }
882
883    let text_with_note = append_skipped_image_note(text, skipped_count, total_refs);
884    if data_uris.is_empty() {
885        text_with_note.trim().to_string()
886    } else {
887        compose_multimodal_message(&text_with_note, data_uris)
888    }
889}
890
/// Appends a "could not be loaded" note to `text` when images were skipped.
///
/// With `skipped_count == 0` the text is returned untouched (not trimmed).
/// Otherwise the text is trimmed and followed by a `Note:` sentence that says
/// either how many images failed, or how many of `total_refs` failed when
/// only part of the batch was lost.
fn append_skipped_image_note(text: &str, skipped_count: usize, total_refs: usize) -> String {
    if skipped_count == 0 {
        return text.to_owned();
    }

    // This note is model-facing provider context, not direct localized UI text.
    let every_image_failed = skipped_count == total_refs;
    let note = if every_image_failed {
        format!("{skipped_count} attached image(s) could not be loaded")
    } else {
        format!("{skipped_count} of {total_refs} attached image(s) could not be loaded")
    };

    match text.trim() {
        "" => format!("Note: {note}."),
        trimmed => format!("{trimmed}\n\nNote: {note}."),
    }
}
910
/// Classifies an image reference by its textual prefix.
///
/// Returns `"data"` for `data:` URIs, `"remote"` for `http://` / `https://`
/// URLs, and `"local"` for everything else. Matching is case-sensitive.
fn image_reference_kind(reference: &str) -> &'static str {
    if reference.starts_with("data:") {
        return "data";
    }

    let is_remote = ["http://", "https://"]
        .iter()
        .any(|scheme| reference.starts_with(*scheme));

    if is_remote { "remote" } else { "local" }
}
920
921fn multimodal_error_kind(error: &anyhow::Error) -> &'static str {
922    match error.downcast_ref::<MultimodalError>() {
923        Some(MultimodalError::TooManyImages { .. }) => "too_many_images",
924        Some(MultimodalError::ImageTooLarge { .. }) => "image_too_large",
925        Some(MultimodalError::UnsupportedMime { .. }) => "unsupported_mime",
926        Some(MultimodalError::RemoteFetchDisabled { .. }) => "remote_fetch_disabled",
927        Some(MultimodalError::ImageSourceNotFound { .. }) => "image_source_not_found",
928        Some(MultimodalError::InvalidMarker { .. }) => "invalid_marker",
929        Some(MultimodalError::RemoteFetchFailed { .. }) => "remote_fetch_failed",
930        Some(MultimodalError::LocalReadFailed { .. }) => "local_read_failed",
931        None => "unknown",
932    }
933}
934
935fn multimodal_error_reason(error: &anyhow::Error) -> Option<String> {
936    match error.downcast_ref::<MultimodalError>() {
937        Some(MultimodalError::InvalidMarker { input, reason })
938        | Some(MultimodalError::RemoteFetchFailed { input, reason })
939        | Some(MultimodalError::LocalReadFailed { input, reason }) => {
940            Some(reason.replace(input, "<source>"))
941        }
942        _ => None,
943    }
944}
945
946async fn normalize_image_reference(
947    source: &str,
948    config: &MultimodalConfig,
949    max_bytes: usize,
950    remote_client: &Client,
951    cache: Option<&mut LocalImageCache>,
952) -> anyhow::Result<String> {
953    if source.starts_with("data:") {
954        return normalize_data_uri(source, max_bytes);
955    }
956
957    if source.starts_with("http://") || source.starts_with("https://") {
958        if !config.allow_remote_fetch {
959            return Err(MultimodalError::RemoteFetchDisabled {
960                input: source.to_string(),
961            }
962            .into());
963        }
964
965        return normalize_remote_image(source, max_bytes, remote_client).await;
966    }
967
968    match cache {
969        Some(c) => normalize_local_image_cached(source, max_bytes, c).await,
970        None => normalize_local_image(source, max_bytes).await,
971    }
972}
973
974fn normalize_data_uri(source: &str, max_bytes: usize) -> anyhow::Result<String> {
975    let Some(comma_idx) = source.find(',') else {
976        return Err(MultimodalError::InvalidMarker {
977            input: source.to_string(),
978            reason: "expected data URI payload".to_string(),
979        }
980        .into());
981    };
982
983    let header = &source[..comma_idx];
984    let payload = source[comma_idx + 1..].trim();
985
986    if !header.contains(";base64") {
987        return Err(MultimodalError::InvalidMarker {
988            input: source.to_string(),
989            reason: "only base64 data URIs are supported".to_string(),
990        }
991        .into());
992    }
993
994    let mime = header
995        .trim_start_matches("data:")
996        .split(';')
997        .next()
998        .unwrap_or_default()
999        .trim()
1000        .to_ascii_lowercase();
1001
1002    validate_mime(source, &mime)?;
1003
1004    let decoded = STANDARD
1005        .decode(payload)
1006        .map_err(|error| MultimodalError::InvalidMarker {
1007            input: source.to_string(),
1008            reason: format!("invalid base64 payload: {error}"),
1009        })?;
1010
1011    validate_size(source, decoded.len(), max_bytes)?;
1012
1013    Ok(format!("data:{mime};base64,{}", STANDARD.encode(decoded)))
1014}
1015
1016async fn normalize_remote_image(
1017    source: &str,
1018    max_bytes: usize,
1019    remote_client: &Client,
1020) -> anyhow::Result<String> {
1021    let response = remote_client.get(source).send().await.map_err(|error| {
1022        MultimodalError::RemoteFetchFailed {
1023            input: source.to_string(),
1024            reason: error.to_string(),
1025        }
1026    })?;
1027
1028    let status = response.status();
1029    if !status.is_success() {
1030        return Err(MultimodalError::RemoteFetchFailed {
1031            input: source.to_string(),
1032            reason: format!("HTTP {status}"),
1033        }
1034        .into());
1035    }
1036
1037    if let Some(content_length) = response.content_length() {
1038        let content_length = usize::try_from(content_length).unwrap_or(usize::MAX);
1039        validate_size(source, content_length, max_bytes)?;
1040    }
1041
1042    let content_type = response
1043        .headers()
1044        .get(reqwest::header::CONTENT_TYPE)
1045        .and_then(|value| value.to_str().ok())
1046        .map(ToString::to_string);
1047
1048    let bytes = response
1049        .bytes()
1050        .await
1051        .map_err(|error| MultimodalError::RemoteFetchFailed {
1052            input: source.to_string(),
1053            reason: error.to_string(),
1054        })?;
1055
1056    validate_size(source, bytes.len(), max_bytes)?;
1057
1058    let mime = detect_mime(None, bytes.as_ref(), content_type.as_deref()).ok_or_else(|| {
1059        MultimodalError::UnsupportedMime {
1060            input: source.to_string(),
1061            mime: "unknown".to_string(),
1062        }
1063    })?;
1064
1065    validate_mime(source, &mime)?;
1066
1067    Ok(format!("data:{mime};base64,{}", STANDARD.encode(bytes)))
1068}
1069
1070async fn normalize_local_image(source: &str, max_bytes: usize) -> anyhow::Result<String> {
1071    let path = Path::new(source);
1072    if !path.exists() || !path.is_file() {
1073        return Err(MultimodalError::ImageSourceNotFound {
1074            input: source.to_string(),
1075        }
1076        .into());
1077    }
1078
1079    let metadata =
1080        tokio::fs::metadata(path)
1081            .await
1082            .map_err(|error| MultimodalError::LocalReadFailed {
1083                input: source.to_string(),
1084                reason: error.to_string(),
1085            })?;
1086
1087    validate_size(
1088        source,
1089        usize::try_from(metadata.len()).unwrap_or(usize::MAX),
1090        max_bytes,
1091    )?;
1092
1093    let bytes = tokio::fs::read(path)
1094        .await
1095        .map_err(|error| MultimodalError::LocalReadFailed {
1096            input: source.to_string(),
1097            reason: error.to_string(),
1098        })?;
1099
1100    validate_size(source, bytes.len(), max_bytes)?;
1101
1102    let mime =
1103        detect_mime(Some(path), &bytes, None).ok_or_else(|| MultimodalError::UnsupportedMime {
1104            input: source.to_string(),
1105            mime: "unknown".to_string(),
1106        })?;
1107
1108    validate_mime(source, &mime)?;
1109
1110    Ok(format!("data:{mime};base64,{}", STANDARD.encode(bytes)))
1111}
1112
1113/// Cache-aware local image loader. On a hit (path + metadata unchanged) returns
1114/// the stored data URI without touching the filesystem. Files under `/uploads/`
1115/// are content-addressed and treated as immutable — checked once, never re-read.
1116async fn normalize_local_image_cached(
1117    source: &str,
1118    max_bytes: usize,
1119    cache: &mut LocalImageCache,
1120) -> anyhow::Result<String> {
1121    let path = Path::new(source);
1122    if !path.exists() || !path.is_file() {
1123        return Err(MultimodalError::ImageSourceNotFound {
1124            input: source.to_string(),
1125        }
1126        .into());
1127    }
1128
1129    let metadata =
1130        tokio::fs::metadata(path)
1131            .await
1132            .map_err(|error| MultimodalError::LocalReadFailed {
1133                input: source.to_string(),
1134                reason: error.to_string(),
1135            })?;
1136
1137    let file_len = metadata.len();
1138    let is_immutable = source.contains("/uploads/");
1139    let mtime: i64 = if is_immutable {
1140        0
1141    } else {
1142        metadata
1143            .modified()
1144            .ok()
1145            .and_then(|t| {
1146                t.duration_since(std::time::UNIX_EPOCH)
1147                    .ok()
1148                    .map(|d| d.as_secs() as i64)
1149            })
1150            .unwrap_or(0)
1151    };
1152    let cache_len = if is_immutable { 0 } else { file_len };
1153
1154    if let Some(cached) = cache.get(source, cache_len, mtime) {
1155        return Ok(cached.to_string());
1156    }
1157
1158    validate_size(
1159        source,
1160        usize::try_from(file_len).unwrap_or(usize::MAX),
1161        max_bytes,
1162    )?;
1163
1164    let bytes = tokio::fs::read(path)
1165        .await
1166        .map_err(|error| MultimodalError::LocalReadFailed {
1167            input: source.to_string(),
1168            reason: error.to_string(),
1169        })?;
1170
1171    validate_size(source, bytes.len(), max_bytes)?;
1172
1173    let mime =
1174        detect_mime(Some(path), &bytes, None).ok_or_else(|| MultimodalError::UnsupportedMime {
1175            input: source.to_string(),
1176            mime: "unknown".to_string(),
1177        })?;
1178
1179    validate_mime(source, &mime)?;
1180
1181    let data_uri = format!("data:{mime};base64,{}", STANDARD.encode(&bytes));
1182    cache.insert(source.to_string(), cache_len, mtime, data_uri.clone());
1183    Ok(data_uri)
1184}
1185
1186fn validate_size(source: &str, size_bytes: usize, max_bytes: usize) -> anyhow::Result<()> {
1187    if size_bytes > max_bytes {
1188        return Err(MultimodalError::ImageTooLarge {
1189            input: source.to_string(),
1190            size_bytes,
1191            max_bytes,
1192        }
1193        .into());
1194    }
1195
1196    Ok(())
1197}
1198
1199fn validate_mime(source: &str, mime: &str) -> anyhow::Result<()> {
1200    if ALLOWED_IMAGE_MIME_TYPES.contains(&mime) {
1201        return Ok(());
1202    }
1203
1204    Err(MultimodalError::UnsupportedMime {
1205        input: source.to_string(),
1206        mime: mime.to_string(),
1207    }
1208    .into())
1209}
1210
1211fn detect_mime(
1212    path: Option<&Path>,
1213    bytes: &[u8],
1214    header_content_type: Option<&str>,
1215) -> Option<String> {
1216    if let Some(header_mime) = header_content_type.and_then(normalize_content_type) {
1217        return Some(header_mime);
1218    }
1219
1220    if let Some(path) = path
1221        && let Some(ext) = path.extension().and_then(|value| value.to_str())
1222        && let Some(mime) = mime_from_extension(ext)
1223    {
1224        return Some(mime.to_string());
1225    }
1226
1227    mime_from_magic(bytes).map(ToString::to_string)
1228}
1229
/// Extracts the bare MIME type from a `Content-Type` header value.
///
/// Everything from the first `;` onward (parameters such as `charset`) is
/// dropped, surrounding whitespace is trimmed and the result is lowercased.
/// Returns `None` when nothing is left.
fn normalize_content_type(content_type: &str) -> Option<String> {
    let essence = content_type.split(';').next().unwrap_or_default().trim();
    if essence.is_empty() {
        return None;
    }
    Some(essence.to_ascii_lowercase())
}
1234
/// Maps a file extension (without the dot) to an image MIME type.
///
/// Matching ignores ASCII case. BMP is recognised here even though it is not
/// an allowed upload type, so callers can report it as a known-but-unsupported
/// format. Unknown extensions yield `None`.
fn mime_from_extension(ext: &str) -> Option<&'static str> {
    const KNOWN_EXTENSIONS: [(&str, &str); 6] = [
        ("png", "image/png"),
        ("jpg", "image/jpeg"),
        ("jpeg", "image/jpeg"),
        ("webp", "image/webp"),
        ("gif", "image/gif"),
        ("bmp", "image/bmp"),
    ];

    KNOWN_EXTENSIONS
        .iter()
        .find(|(known, _)| ext.eq_ignore_ascii_case(known))
        .map(|&(_, mime)| mime)
}
1245
/// Sniffs an image MIME type from the leading "magic" bytes of a file.
///
/// Recognises PNG, JPEG, GIF (87a and 89a), WebP (a `RIFF` container whose
/// bytes 8..12 are `WEBP`) and BMP. Returns `None` for anything else,
/// including inputs too short to hold the relevant signature.
fn mime_from_magic(bytes: &[u8]) -> Option<&'static str> {
    const PNG_SIGNATURE: &[u8] = &[0x89, b'P', b'N', b'G', b'\r', b'\n', 0x1a, b'\n'];
    const JPEG_SIGNATURE: &[u8] = &[0xff, 0xd8, 0xff];

    // `starts_with` and `get` are length-safe, so no explicit length guards.
    let is_gif = || bytes.starts_with(b"GIF87a") || bytes.starts_with(b"GIF89a");
    let is_webp = || bytes.starts_with(b"RIFF") && bytes.get(8..12) == Some(&b"WEBP"[..]);

    if bytes.starts_with(PNG_SIGNATURE) {
        Some("image/png")
    } else if bytes.starts_with(JPEG_SIGNATURE) {
        Some("image/jpeg")
    } else if is_gif() {
        Some("image/gif")
    } else if is_webp() {
        Some("image/webp")
    } else if bytes.starts_with(b"BM") {
        Some("image/bmp")
    } else {
        None
    }
}
1269
1270#[cfg(test)]
1271mod tests {
1272    use super::*;
1273
1274    #[test]
1275    fn strip_media_markers_replaces_image_local_path() {
1276        let input = "Look at [IMAGE:/zeroclaw-data/workspace/telegram_files/photo_1.jpg]";
1277        assert_eq!(strip_media_markers(input), "Look at [media attachment]");
1278    }
1279
1280    #[test]
1281    fn strip_media_markers_replaces_image_data_uri() {
1282        let input = "Inline [IMAGE:data:image/png;base64,abcd]";
1283        assert_eq!(strip_media_markers(input), "Inline [media attachment]");
1284    }
1285
1286    #[test]
1287    fn strip_media_markers_replaces_all_supported_kinds() {
1288        // Mirrors `ATTACHMENT_KINDS` in
1289        // `crates/zeroclaw-channels/src/util.rs`, which is the source of
1290        // truth for which marker spellings inbound channels can produce.
1291        let input = "[IMAGE:/a.jpg] [PHOTO:/b.jpg] [DOCUMENT:/c.pdf] [FILE:/d.zip] [VIDEO:/e.mp4] [VOICE:/f.ogg] [AUDIO:/g.wav]";
1292        let expected = "[media attachment] [media attachment] [media attachment] [media attachment] [media attachment] [media attachment] [media attachment]";
1293        assert_eq!(strip_media_markers(input), expected);
1294    }
1295
1296    #[test]
1297    fn strip_media_markers_is_case_insensitive() {
1298        // Channel parsers uppercase the kind before comparing, so by the time
1299        // a marker reaches conversation history it is normally upper-case —
1300        // but accept lower/mixed case too so we don't depend on that
1301        // invariant downstream.
1302        let input = "[image:/a.jpg] [Photo:/b.jpg] [video:/c.mp4]";
1303        let expected = "[media attachment] [media attachment] [media attachment]";
1304        assert_eq!(strip_media_markers(input), expected);
1305    }
1306
1307    #[test]
1308    fn strip_media_markers_leaves_plain_text_untouched() {
1309        let input = "No markers here, just text with [brackets] and (parens).";
1310        assert_eq!(strip_media_markers(input), input);
1311    }
1312
1313    #[test]
1314    fn strip_media_markers_preserves_unrelated_brackets() {
1315        // Markers that don't match the media kinds are left alone.
1316        let input = "Use [TODO:foo] and [NOTE:bar] but replace [IMAGE:/x.jpg]";
1317        assert_eq!(
1318            strip_media_markers(input),
1319            "Use [TODO:foo] and [NOTE:bar] but replace [media attachment]"
1320        );
1321    }
1322
1323    #[test]
1324    fn parse_image_markers_extracts_multiple_markers() {
1325        let input = "Check this [IMAGE:/tmp/a.png] and this [IMAGE:https://example.com/b.jpg]";
1326        let (cleaned, refs) = parse_image_markers(input);
1327
1328        assert_eq!(cleaned, "Check this  and this");
1329        assert_eq!(refs.len(), 2);
1330        assert_eq!(refs[0], "/tmp/a.png");
1331        assert_eq!(refs[1], "https://example.com/b.jpg");
1332    }
1333
1334    #[test]
1335    fn is_windows_unc_path_accepts_shares_and_rejects_others() {
1336        assert!(is_windows_unc_path(r"\\server\share\pic.png"));
1337        assert!(is_windows_unc_path(r"\\server\share\sub\pic.png"));
1338        // Verbatim / device prefixes are not plain shares.
1339        assert!(!is_windows_unc_path(r"\\?\C:\Users\me\a.png"));
1340        assert!(!is_windows_unc_path(r"\\?\UNC\server\share\a.png"));
1341        assert!(!is_windows_unc_path(r"\\.\PhysicalDrive0"));
1342        // Needs both a server and a further segment.
1343        assert!(!is_windows_unc_path(r"\\server"));
1344        assert!(!is_windows_unc_path(r"\\"));
1345        // Non-UNC inputs.
1346        assert!(!is_windows_unc_path("/home/me/a.png"));
1347        assert!(!is_windows_unc_path(r"C:\Users\me\a.png"));
1348    }
1349
1350    #[test]
1351    fn parse_image_markers_extracts_unc_path() {
1352        // Regression for the #7446 Windows follow-up: `image_info` unwraps the
1353        // verbatim-UNC prefix (`\\?\UNC\…`) to a plain `\\server\share\…`
1354        // path, which must be treated as a loadable image reference (not left
1355        // as literal text) so the image reaches vision models.
1356        let input = r"File: [IMAGE:\\server\share\pic.png]";
1357        let (_, refs) = parse_image_markers(input);
1358        assert_eq!(refs.len(), 1, "UNC marker should be extracted as a ref");
1359        assert_eq!(refs[0], r"\\server\share\pic.png");
1360    }
1361
1362    #[test]
1363    fn validate_mime_rejects_bmp_but_accepts_provider_supported_types() {
1364        for mime in ["image/png", "image/jpeg", "image/webp", "image/gif"] {
1365            assert!(
1366                validate_mime("src", mime).is_ok(),
1367                "{mime} should be allowed"
1368            );
1369        }
1370        // BMP is detectable but unsupported by vision providers; it must be
1371        // rejected here so it never breaks the whole provider request.
1372        let err = validate_mime("src", "image/bmp").unwrap_err();
1373        assert_eq!(multimodal_error_kind(&err), "unsupported_mime");
1374    }
1375
1376    #[test]
1377    fn parse_image_markers_collapses_line_wrapped_path() {
1378        // Terminal-wrapped paste: a long path split across two rows with
1379        // leading indentation should be recovered into the original path.
1380        let input = "from the logs whether the agent emits\n  [IMAGE:/home/zeroclaw_user/.zeroclaw/workspace/signal_i\n  nbound/attachment.jpg] (which the\n  channel resolves)";
1381        let (_, refs) = parse_image_markers(input);
1382        assert_eq!(refs.len(), 1);
1383        assert_eq!(
1384            refs[0],
1385            "/home/zeroclaw_user/.zeroclaw/workspace/signal_inbound/attachment.jpg"
1386        );
1387    }
1388
1389    #[test]
1390    fn parse_image_markers_leaves_placeholder_markers_as_literal_text() {
1391        // Illustrative markdown like `[IMAGE:...]` or `[IMAGE:<path>]`
1392        // (e.g. in agent-authored prose the user quotes back) is not a
1393        // loadable reference and must stay as literal text — otherwise the
1394        // multimodal loader errors every turn the conversation replays.
1395        let input = "example: `[IMAGE:...]` or `[IMAGE:<path>]` or `[IMAGE:example.png]`";
1396        let (cleaned, refs) = parse_image_markers(input);
1397        assert!(
1398            refs.is_empty(),
1399            "no placeholder should be treated as a loadable ref, got: {refs:?}"
1400        );
1401        assert!(cleaned.contains("[IMAGE:...]"));
1402        assert!(cleaned.contains("[IMAGE:<path>]"));
1403        assert!(cleaned.contains("[IMAGE:example.png]"));
1404    }
1405
1406    #[test]
1407    fn parse_image_markers_preserves_spaces_in_path() {
1408        // Spaces within a single-line marker are legitimate (paths can
1409        // contain spaces) and must survive unchanged.
1410        let input = "look at [IMAGE:/tmp/my photos/beetle.png] please";
1411        let (_, refs) = parse_image_markers(input);
1412        assert_eq!(refs.len(), 1);
1413        assert_eq!(refs[0], "/tmp/my photos/beetle.png");
1414    }
1415
1416    #[test]
1417    fn parse_image_markers_keeps_invalid_empty_marker() {
1418        let input = "hello [IMAGE:] world";
1419        let (cleaned, refs) = parse_image_markers(input);
1420
1421        assert_eq!(cleaned, "hello [IMAGE:] world");
1422        assert!(refs.is_empty());
1423    }
1424
1425    #[tokio::test]
1426    async fn prepare_messages_normalizes_local_image_to_data_uri() {
1427        let temp = tempfile::tempdir().unwrap();
1428        let image_path = temp.path().join("sample.png");
1429
1430        // Minimal PNG signature bytes are enough for MIME detection.
1431        std::fs::write(
1432            &image_path,
1433            [0x89, b'P', b'N', b'G', b'\r', b'\n', 0x1a, b'\n'],
1434        )
1435        .unwrap();
1436
1437        let messages = vec![ChatMessage::user(format!(
1438            "Please inspect this screenshot [IMAGE:{}]",
1439            image_path.display()
1440        ))];
1441
1442        let prepared = prepare_messages_for_provider(&messages, &MultimodalConfig::default())
1443            .await
1444            .unwrap();
1445
1446        assert!(prepared.contains_images);
1447        assert_eq!(prepared.messages.len(), 1);
1448
1449        let (cleaned, refs) = parse_image_markers(&prepared.messages[0].content);
1450        assert_eq!(cleaned, "Please inspect this screenshot");
1451        assert_eq!(refs.len(), 1);
1452        assert!(refs[0].starts_with("data:image/png;base64,"));
1453    }
1454
1455    #[tokio::test]
1456    // Covers the plain-text fallback path for `role == "tool"` messages
1457    // whose `content` is not a native-dispatcher JSON payload (e.g.
1458    // synthetic XML-shaped input or future non-JSON tool transports). The
1459    // JSON-shaped native contract is exercised by
1460    // `prepare_messages_preserves_native_tool_result_json_shape` below.
1461    async fn prepare_messages_normalizes_tool_message_local_image_to_data_uri() {
1462        let temp = tempfile::tempdir().unwrap();
1463        let image_path = temp.path().join("tool-sample.png");
1464
1465        std::fs::write(
1466            &image_path,
1467            [0x89, b'P', b'N', b'G', b'\r', b'\n', 0x1a, b'\n'],
1468        )
1469        .unwrap();
1470
1471        let messages = vec![ChatMessage::tool(format!(
1472            "<tool_result name=\"image_gen\">\nGenerated image [IMAGE:{}]\n</tool_result>",
1473            image_path.display()
1474        ))];
1475
1476        let prepared = prepare_messages_for_provider(&messages, &MultimodalConfig::default())
1477            .await
1478            .unwrap();
1479
1480        assert!(prepared.contains_images);
1481        assert_eq!(prepared.messages.len(), 1);
1482        assert_eq!(prepared.messages[0].role, "tool");
1483
1484        let (cleaned, refs) = parse_image_markers(&prepared.messages[0].content);
1485        assert!(cleaned.contains("<tool_result name=\"image_gen\">"));
1486        assert!(cleaned.contains("Generated image"));
1487        assert_eq!(refs.len(), 1);
1488        assert!(refs[0].starts_with("data:image/png;base64,"));
1489    }
1490
1491    // Regression for the JSON-clobber bug surfaced on PR #6183: native tool
1492    // dispatchers serialize tool results as `{"tool_call_id":"…","content":"…"}`
1493    // and downstream adapters (e.g. `OpenAiCompatibleProvider::convert_messages_for_native`)
1494    // recover `tool_call_id` via `serde_json::from_str` on the message
1495    // content. The multimodal preprocessor must keep that JSON intact while
1496    // still inlining any `[IMAGE:/path]` markers inside the inner `content`
1497    // field. Asserts:
1498    //   1. Prepared content is still valid JSON.
1499    //   2. `tool_call_id` survives unchanged.
1500    //   3. The inner `content` field carries `data:image/png;base64,…`
1501    //      (marker rewritten) and keeps surrounding text.
1502    #[tokio::test]
1503    async fn prepare_messages_preserves_native_tool_result_json_shape() {
1504        let temp = tempfile::tempdir().unwrap();
1505        let image_path = temp.path().join("native-tool-result.png");
1506        std::fs::write(
1507            &image_path,
1508            [0x89, b'P', b'N', b'G', b'\r', b'\n', 0x1a, b'\n'],
1509        )
1510        .unwrap();
1511
1512        let native_tool_content = serde_json::json!({
1513            "tool_call_id": "tc1",
1514            "content": format!("see attached [IMAGE:{}]", image_path.display().to_string()),
1515        })
1516        .to_string();
1517
1518        let messages = vec![ChatMessage::tool(native_tool_content)];
1519
1520        let prepared = prepare_messages_for_provider(&messages, &MultimodalConfig::default())
1521            .await
1522            .expect("preparation should succeed for native tool-result JSON");
1523
1524        assert!(prepared.contains_images);
1525        assert_eq!(prepared.messages.len(), 1);
1526        assert_eq!(prepared.messages[0].role, "tool");
1527
1528        let value: serde_json::Value = serde_json::from_str(&prepared.messages[0].content)
1529            .expect("prepared tool message must remain valid JSON");
1530
1531        assert_eq!(
1532            value.get("tool_call_id").and_then(|v| v.as_str()),
1533            Some("tc1"),
1534            "tool_call_id must survive multimodal preprocessing unchanged"
1535        );
1536
1537        let inner = value
1538            .get("content")
1539            .and_then(|v| v.as_str())
1540            .expect("content must remain a JSON string");
1541        assert!(
1542            inner.contains("see attached"),
1543            "surrounding text in tool content should survive normalization"
1544        );
1545        assert!(
1546            inner.contains("data:image/png;base64,"),
1547            "local image path inside tool content should be rewritten to a data URI"
1548        );
1549        assert!(
1550            !inner.contains("native-tool-result.png"),
1551            "raw local path must not leak after normalization"
1552        );
1553    }
1554
1555    #[tokio::test]
1556    async fn prepare_messages_preserves_native_tool_json_when_image_is_skipped() {
1557        let native_tool_content = serde_json::json!({
1558            "tool_call_id": "tc1",
1559            "content": "generated screenshot [IMAGE:https://example.com/missing.png]",
1560        })
1561        .to_string();
1562
1563        let prepared = prepare_messages_for_provider(
1564            &[ChatMessage::tool(native_tool_content)],
1565            &MultimodalConfig::default(),
1566        )
1567        .await
1568        .expect("skipped native tool image should not fail message preparation");
1569
1570        assert!(!prepared.contains_images);
1571        assert_eq!(prepared.messages.len(), 1);
1572
1573        let value: serde_json::Value = serde_json::from_str(&prepared.messages[0].content)
1574            .expect("native tool result must remain valid JSON");
1575        assert_eq!(
1576            value.get("tool_call_id").and_then(|v| v.as_str()),
1577            Some("tc1")
1578        );
1579
1580        let inner = value
1581            .get("content")
1582            .and_then(|v| v.as_str())
1583            .expect("content should remain a JSON string");
1584        assert!(inner.contains("generated screenshot"));
1585        assert!(inner.contains("1 attached image(s) could not be loaded"));
1586        assert!(!inner.contains("[IMAGE:"));
1587        assert!(!inner.contains("https://example.com/missing.png"));
1588    }
1589
1590    #[tokio::test]
1591    async fn prepare_messages_preserves_native_tool_json_with_mixed_images() {
1592        let temp = tempfile::tempdir().unwrap();
1593        let image_path = temp.path().join("mixed-native-tool-result.png");
1594        std::fs::write(
1595            &image_path,
1596            [0x89, b'P', b'N', b'G', b'\r', b'\n', 0x1a, b'\n'],
1597        )
1598        .unwrap();
1599
1600        let native_tool_content = serde_json::json!({
1601            "tool_call_id": "tc1",
1602            "content": format!(
1603                "generated [IMAGE:{}] and [IMAGE:https://example.com/missing.png]",
1604                image_path.display()
1605            ),
1606        })
1607        .to_string();
1608
1609        let prepared = prepare_messages_for_provider(
1610            &[ChatMessage::tool(native_tool_content)],
1611            &MultimodalConfig::default(),
1612        )
1613        .await
1614        .expect("valid native tool image should survive while bad ref is skipped");
1615
1616        assert!(prepared.contains_images);
1617        assert_eq!(prepared.messages.len(), 1);
1618
1619        let value: serde_json::Value = serde_json::from_str(&prepared.messages[0].content)
1620            .expect("native tool result must remain valid JSON");
1621        assert_eq!(
1622            value.get("tool_call_id").and_then(|v| v.as_str()),
1623            Some("tc1")
1624        );
1625
1626        let inner = value
1627            .get("content")
1628            .and_then(|v| v.as_str())
1629            .expect("content should remain a JSON string");
1630        assert!(inner.contains("generated"));
1631        assert!(inner.contains("data:image/png;base64,"));
1632        assert!(inner.contains("1 of 2 attached image(s) could not be loaded"));
1633        assert!(!inner.contains("mixed-native-tool-result.png"));
1634        assert!(!inner.contains("https://example.com/missing.png"));
1635    }
1636
1637    #[tokio::test]
1638    async fn prepare_messages_strips_stale_native_tool_result_images() {
1639        let temp = tempfile::tempdir().unwrap();
1640        let image_path = temp.path().join("stale-native-tool-result.png");
1641        std::fs::write(
1642            &image_path,
1643            [0x89, b'P', b'N', b'G', b'\r', b'\n', 0x1a, b'\n'],
1644        )
1645        .unwrap();
1646
1647        let native_tool_content = serde_json::json!({
1648            "tool_call_id": "tc1",
1649            "content": format!("generated screenshot [IMAGE:{}]", image_path.display().to_string()),
1650        })
1651        .to_string();
1652
1653        let messages = vec![
1654            ChatMessage::tool(native_tool_content),
1655            ChatMessage {
1656                role: "assistant".to_string(),
1657                content: "I generated the screenshot.".to_string(),
1658            },
1659            ChatMessage::user("What happened next?".to_string()),
1660        ];
1661
1662        let prepared = prepare_messages_for_provider(&messages, &MultimodalConfig::default())
1663            .await
1664            .expect("preparation should strip stale tool images without loading them");
1665
1666        assert!(
1667            !prepared.contains_images,
1668            "stale tool-result images should not keep the request in vision mode"
1669        );
1670
1671        let value: serde_json::Value = serde_json::from_str(&prepared.messages[0].content)
1672            .expect("stale native tool result should remain valid JSON");
1673        assert_eq!(
1674            value.get("tool_call_id").and_then(|v| v.as_str()),
1675            Some("tc1")
1676        );
1677
1678        let inner = value
1679            .get("content")
1680            .and_then(|v| v.as_str())
1681            .expect("content should remain a JSON string");
1682        assert!(inner.contains("generated screenshot"));
1683        assert!(!inner.contains("[IMAGE:"));
1684        assert!(!inner.contains("data:image"));
1685        assert!(!inner.contains("stale-native-tool-result.png"));
1686    }
1687
1688    #[tokio::test]
1689    async fn prepare_messages_strips_stale_prompt_tool_result_images() {
1690        let temp = tempfile::tempdir().unwrap();
1691        let image_path = temp.path().join("stale-prompt-tool-result.png");
1692        std::fs::write(
1693            &image_path,
1694            [0x89, b'P', b'N', b'G', b'\r', b'\n', 0x1a, b'\n'],
1695        )
1696        .unwrap();
1697
1698        let messages = vec![
1699            ChatMessage::user(format!(
1700                "[Tool results]\n<tool_result name=\"image_gen\">Generated [IMAGE:{}]</tool_result>",
1701                image_path.display()
1702            )),
1703            ChatMessage {
1704                role: "assistant".to_string(),
1705                content: "I generated the screenshot.".to_string(),
1706            },
1707            ChatMessage::user("Continue.".to_string()),
1708        ];
1709
1710        let prepared = prepare_messages_for_provider(&messages, &MultimodalConfig::default())
1711            .await
1712            .expect("preparation should strip stale prompt-mode tool images");
1713
1714        assert!(!prepared.contains_images);
1715        assert!(prepared.messages[0].content.contains("[Tool results]"));
1716        assert!(prepared.messages[0].content.contains("Generated"));
1717        assert!(!prepared.messages[0].content.contains("[IMAGE:"));
1718        assert!(!prepared.messages[0].content.contains("data:image"));
1719        assert!(
1720            !prepared.messages[0]
1721                .content
1722                .contains("stale-prompt-tool-result.png")
1723        );
1724    }
1725
1726    #[tokio::test]
1727    async fn prepare_messages_strips_stale_tool_image_while_normalizing_current_user_image() {
1728        let temp = tempfile::tempdir().unwrap();
1729        let stale_path = temp.path().join("stale-tool-result.png");
1730        let fresh_path = temp.path().join("fresh-user-image.png");
1731        let png = [0x89, b'P', b'N', b'G', b'\r', b'\n', 0x1a, b'\n'];
1732        std::fs::write(&stale_path, png).unwrap();
1733        std::fs::write(&fresh_path, png).unwrap();
1734
1735        let native_tool_content = serde_json::json!({
1736            "tool_call_id": "tc1",
1737            "content": format!("generated screenshot [IMAGE:{}]", stale_path.display().to_string()),
1738        })
1739        .to_string();
1740
1741        let messages = vec![
1742            ChatMessage::tool(native_tool_content),
1743            ChatMessage {
1744                role: "assistant".to_string(),
1745                content: "I generated the screenshot.".to_string(),
1746            },
1747            ChatMessage::user(format!(
1748                "Now inspect this [IMAGE:{}]",
1749                fresh_path.display().to_string()
1750            )),
1751        ];
1752
1753        let prepared = prepare_messages_for_provider(&messages, &MultimodalConfig::default())
1754            .await
1755            .expect("preparation should strip stale tool images and normalize current user image");
1756
1757        assert!(prepared.contains_images);
1758
1759        let value: serde_json::Value = serde_json::from_str(&prepared.messages[0].content)
1760            .expect("stale native tool result should remain valid JSON");
1761        let inner = value
1762            .get("content")
1763            .and_then(|v| v.as_str())
1764            .expect("content should remain a JSON string");
1765        assert!(inner.contains("generated screenshot"));
1766        assert!(!inner.contains("[IMAGE:"));
1767        assert!(!inner.contains("data:image"));
1768        assert!(!inner.contains("stale-tool-result.png"));
1769
1770        let (cleaned, refs) = parse_image_markers(&prepared.messages[2].content);
1771        assert_eq!(cleaned, "Now inspect this");
1772        assert_eq!(refs.len(), 1);
1773        assert!(refs[0].starts_with("data:image/png;base64,"));
1774        assert!(
1775            !prepared.messages[2]
1776                .content
1777                .contains("fresh-user-image.png")
1778        );
1779    }
1780
1781    #[test]
1782    fn count_image_markers_ignores_stale_tool_results() {
1783        let messages = vec![
1784            ChatMessage::tool("[IMAGE:/tmp/stale-tool.png]\nGenerated".to_string()),
1785            ChatMessage {
1786                role: "assistant".to_string(),
1787                content: "Done.".to_string(),
1788            },
1789            ChatMessage::user("Next question".to_string()),
1790        ];
1791
1792        assert_eq!(count_image_markers(&messages), 0);
1793
1794        let messages = vec![
1795            ChatMessage::user("Create an image".to_string()),
1796            ChatMessage::tool("[IMAGE:/tmp/latest-tool.png]\nGenerated".to_string()),
1797        ];
1798
1799        assert_eq!(count_image_markers(&messages), 1);
1800    }
1801
1802    #[tokio::test]
1803    async fn prepare_messages_trims_excess_images_from_older_messages() {
1804        // 3 messages, each with 1 image — max is 2.
1805        // The oldest message's image should be stripped.
1806        let messages = vec![
1807            ChatMessage::user("[IMAGE:/tmp/old.png]\nOld caption".to_string()),
1808            ChatMessage::user("[IMAGE:/tmp/mid.png]\nMid caption".to_string()),
1809            ChatMessage::user("[IMAGE:/tmp/new.png]\nNew caption".to_string()),
1810        ];
1811
1812        // Should not error — instead trims oldest.
1813        // (Will error on normalize_image_reference for the surviving images
1814        //  since /tmp/mid.png and /tmp/new.png don't exist, but the trimming
1815        //  itself should succeed.)
1816        let trimmed = trim_old_images(&messages, 2);
1817        assert_eq!(trimmed.len(), 3);
1818
1819        // Oldest message should have image stripped
1820        let (_, refs0) = parse_image_markers(&trimmed[0].content);
1821        assert!(refs0.is_empty(), "oldest image should be stripped");
1822        assert!(trimmed[0].content.contains("Old caption"));
1823
1824        // Newer messages keep their images
1825        let (_, refs1) = parse_image_markers(&trimmed[1].content);
1826        assert_eq!(refs1.len(), 1);
1827        let (_, refs2) = parse_image_markers(&trimmed[2].content);
1828        assert_eq!(refs2.len(), 1);
1829    }
1830
1831    #[test]
1832    fn trim_old_images_replaces_image_only_message() {
1833        // A message with only an image and no text should get a placeholder.
1834        let messages = vec![
1835            ChatMessage::user("[IMAGE:/tmp/old.png]".to_string()),
1836            ChatMessage::user("[IMAGE:/tmp/new.png]\nKeep this".to_string()),
1837        ];
1838
1839        let trimmed = trim_old_images(&messages, 1);
1840        assert_eq!(trimmed[0].content, "[image removed from history]");
1841        assert!(trimmed[1].content.contains("[IMAGE:/tmp/new.png]"));
1842    }
1843
1844    #[test]
1845    fn trim_old_images_multi_image_message_stripped_as_unit() {
1846        // A single message has 3 images. We need to drop 2 to reach max=1.
1847        // But trimming works at message granularity — the entire message gets
1848        // stripped (all 3 images removed), which over-trims to 0. The newest
1849        // message (text-only) is untouched.
1850        let messages = vec![
1851            ChatMessage::user(
1852                "[IMAGE:/tmp/a.png]\n[IMAGE:/tmp/b.png]\n[IMAGE:/tmp/c.png]\nThree pics"
1853                    .to_string(),
1854            ),
1855            ChatMessage::user("Just text, no images".to_string()),
1856        ];
1857
1858        let trimmed = trim_old_images(&messages, 1);
1859        assert_eq!(trimmed.len(), 2);
1860        // All images in the first message are gone, but text remains
1861        let (_, refs0) = parse_image_markers(&trimmed[0].content);
1862        assert!(refs0.is_empty());
1863        assert!(trimmed[0].content.contains("Three pics"));
1864        // Second message unchanged
1865        assert_eq!(trimmed[1].content, "Just text, no images");
1866    }
1867
1868    #[test]
1869    fn trim_old_images_skips_assistant_messages() {
1870        // Assistant messages with image markers should not be counted or stripped.
1871        let messages = vec![
1872            ChatMessage {
1873                role: "assistant".to_string(),
1874                content: "[IMAGE:/tmp/assistant.png]\nAssistant generated".to_string(),
1875            },
1876            ChatMessage::user("[IMAGE:/tmp/user1.png]\nFirst".to_string()),
1877            ChatMessage::user("[IMAGE:/tmp/user2.png]\nSecond".to_string()),
1878        ];
1879
1880        let trimmed = trim_old_images(&messages, 1);
1881        // Assistant message untouched (not counted toward limit)
1882        assert!(trimmed[0].content.contains("[IMAGE:/tmp/assistant.png]"));
1883        // Oldest user image stripped
1884        let (_, refs1) = parse_image_markers(&trimmed[1].content);
1885        assert!(refs1.is_empty());
1886        assert!(trimmed[1].content.contains("First"));
1887        // Newest user image kept
1888        let (_, refs2) = parse_image_markers(&trimmed[2].content);
1889        assert_eq!(refs2.len(), 1);
1890    }
1891
1892    #[test]
1893    fn trim_old_images_counts_latest_tool_messages() {
1894        let messages = vec![
1895            ChatMessage::user("[IMAGE:/tmp/user-old.png]\nOldest".to_string()),
1896            ChatMessage::tool("[IMAGE:/tmp/tool-new.png]\nGenerated".to_string()),
1897        ];
1898
1899        let trimmed = trim_old_images(&messages, 1);
1900        let (_, refs0) = parse_image_markers(&trimmed[0].content);
1901        assert!(refs0.is_empty(), "oldest user image should be stripped");
1902        assert!(trimmed[0].content.contains("Oldest"));
1903
1904        let (_, refs1) = parse_image_markers(&trimmed[1].content);
1905        assert_eq!(refs1.len(), 1);
1906    }
1907
1908    #[test]
1909    fn trim_old_images_no_trimming_when_under_limit() {
1910        let messages = vec![
1911            ChatMessage::user("[IMAGE:/tmp/a.png]\nCaption A".to_string()),
1912            ChatMessage::user("[IMAGE:/tmp/b.png]\nCaption B".to_string()),
1913        ];
1914
1915        let trimmed = trim_old_images(&messages, 5);
1916        // Nothing should change — both images are under the limit
1917        assert_eq!(trimmed[0].content, messages[0].content);
1918        assert_eq!(trimmed[1].content, messages[1].content);
1919    }
1920
1921    #[test]
1922    fn trim_old_images_no_trimming_when_exactly_at_limit() {
1923        let messages = vec![
1924            ChatMessage::user("[IMAGE:/tmp/a.png]\nA".to_string()),
1925            ChatMessage::user("[IMAGE:/tmp/b.png]\nB".to_string()),
1926        ];
1927
1928        let trimmed = trim_old_images(&messages, 2);
1929        assert_eq!(trimmed[0].content, messages[0].content);
1930        assert_eq!(trimmed[1].content, messages[1].content);
1931    }
1932
1933    #[test]
1934    fn trim_old_images_empty_messages() {
1935        let trimmed = trim_old_images(&[], 4);
1936        assert!(trimmed.is_empty());
1937    }
1938
1939    #[test]
1940    fn trim_old_images_interleaved_roles() {
1941        // Realistic conversation: user sends image, assistant replies, user sends
1942        // another image, etc. Only user messages should be candidates for trimming.
1943        let messages = vec![
1944            ChatMessage::user("[IMAGE:/tmp/1.png]\nLook at this".to_string()),
1945            ChatMessage {
1946                role: "assistant".to_string(),
1947                content: "I see a photo.".to_string(),
1948            },
1949            ChatMessage::user("[IMAGE:/tmp/2.png]\nWhat about this?".to_string()),
1950            ChatMessage {
1951                role: "assistant".to_string(),
1952                content: "That's a chart.".to_string(),
1953            },
1954            ChatMessage::user("[IMAGE:/tmp/3.png]\nAnd this one".to_string()),
1955        ];
1956
1957        let trimmed = trim_old_images(&messages, 2);
1958        assert_eq!(trimmed.len(), 5);
1959        // Oldest user image stripped
1960        let (_, refs0) = parse_image_markers(&trimmed[0].content);
1961        assert!(refs0.is_empty());
1962        assert!(trimmed[0].content.contains("Look at this"));
1963        // Assistant messages untouched
1964        assert_eq!(trimmed[1].content, "I see a photo.");
1965        assert_eq!(trimmed[3].content, "That's a chart.");
1966        // Two newest user images kept
1967        let (_, refs2) = parse_image_markers(&trimmed[2].content);
1968        assert_eq!(refs2.len(), 1);
1969        let (_, refs4) = parse_image_markers(&trimmed[4].content);
1970        assert_eq!(refs4.len(), 1);
1971    }
1972
1973    #[test]
1974    fn trim_old_images_strips_multiple_oldest_messages() {
1975        // 5 user images, max 1 — should strip the first 4 messages' images.
1976        let messages: Vec<ChatMessage> = (1..=5)
1977            .map(|i| ChatMessage::user(format!("[IMAGE:/tmp/{i}.png]\nCaption {i}")))
1978            .collect();
1979
1980        let trimmed = trim_old_images(&messages, 1);
1981        assert_eq!(trimmed.len(), 5);
1982        for (i, msg) in trimmed.iter().enumerate().take(4) {
1983            let (_, refs) = parse_image_markers(&msg.content);
1984            assert!(refs.is_empty(), "message {i} should have images stripped");
1985            assert!(msg.content.contains(&format!("Caption {}", i + 1)));
1986        }
1987        // Only the last message keeps its image
1988        let (_, refs_last) = parse_image_markers(&trimmed[4].content);
1989        assert_eq!(refs_last.len(), 1);
1990    }
1991
1992    #[tokio::test]
1993    async fn prepare_messages_trims_then_normalizes_surviving_images() {
1994        // End-to-end: 3 images, max 2. After trimming the oldest, the two
1995        // surviving images should be normalized (base64-encoded) successfully.
1996        let temp = tempfile::tempdir().unwrap();
1997        let mut paths = Vec::new();
1998        for name in ["old.png", "mid.png", "new.png"] {
1999            let p = temp.path().join(name);
2000            // Minimal valid PNG (1x1 white pixel)
2001            let png_data = [
2002                0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A, // PNG signature
2003                0x00, 0x00, 0x00, 0x0D, 0x49, 0x48, 0x44, 0x52, // IHDR chunk
2004                0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x08, 0x02, 0x00, 0x00, 0x00, 0x90,
2005                0x77, 0x53, 0xDE, // 1x1 RGB
2006                0x00, 0x00, 0x00, 0x0C, 0x49, 0x44, 0x41, 0x54, // IDAT chunk
2007                0x08, 0xD7, 0x63, 0xF8, 0xCF, 0xC0, 0x00, 0x00, 0x00, 0x02, 0x00, 0x01, 0xE2, 0x21,
2008                0xBC, 0x33, // IDAT data + CRC
2009                0x00, 0x00, 0x00, 0x00, 0x49, 0x45, 0x4E, 0x44, // IEND chunk
2010                0xAE, 0x42, 0x60, 0x82,
2011            ];
2012            std::fs::write(&p, png_data).unwrap();
2013            paths.push(p);
2014        }
2015
2016        let messages = vec![
2017            ChatMessage::user(format!("[IMAGE:{}]\nOld", paths[0].display().to_string())),
2018            ChatMessage::user(format!("[IMAGE:{}]\nMid", paths[1].display().to_string())),
2019            ChatMessage::user(format!("[IMAGE:{}]\nNew", paths[2].display().to_string())),
2020        ];
2021
2022        let config = MultimodalConfig {
2023            max_images: 2,
2024            max_image_size_mb: 5,
2025            allow_remote_fetch: false,
2026            ..Default::default()
2027        };
2028
2029        let result = prepare_messages_for_provider(&messages, &config)
2030            .await
2031            .expect("should succeed after trimming");
2032
2033        assert!(result.contains_images);
2034        assert_eq!(result.messages.len(), 3);
2035        // First message should have image stripped, text preserved
2036        assert!(!result.messages[0].content.contains("data:image"));
2037        assert!(result.messages[0].content.contains("Old"));
2038        // Second and third should have base64-encoded images
2039        assert!(result.messages[1].content.contains("data:image"));
2040        assert!(result.messages[2].content.contains("data:image"));
2041    }
2042
2043    #[tokio::test]
2044    async fn prepare_messages_skips_remote_url_when_disabled() {
2045        let messages = vec![ChatMessage::user(
2046            "Look [IMAGE:https://example.com/img.png]".to_string(),
2047        )];
2048
2049        let result = prepare_messages_for_provider(&messages, &MultimodalConfig::default())
2050            .await
2051            .expect("disabled remote image should be skipped");
2052
2053        assert!(!result.contains_images);
2054        assert_eq!(result.messages.len(), 1);
2055        assert!(result.messages[0].content.contains("Look"));
2056        assert!(
2057            result.messages[0]
2058                .content
2059                .contains("1 attached image(s) could not be loaded")
2060        );
2061        assert!(
2062            !result.messages[0]
2063                .content
2064                .contains("https://example.com/img.png")
2065        );
2066    }
2067
2068    #[tokio::test]
2069    async fn prepare_messages_skips_oversized_local_image() {
2070        let temp = tempfile::tempdir().unwrap();
2071        let image_path = temp.path().join("big.png");
2072
2073        let bytes = vec![0u8; 1024 * 1024 + 1];
2074        std::fs::write(&image_path, bytes).unwrap();
2075
2076        let messages = vec![ChatMessage::user(format!(
2077            "[IMAGE:{}]",
2078            image_path.display()
2079        ))];
2080        let config = MultimodalConfig {
2081            max_images: 4,
2082            max_image_size_mb: 1,
2083            allow_remote_fetch: false,
2084            ..Default::default()
2085        };
2086
2087        let result = prepare_messages_for_provider(&messages, &config)
2088            .await
2089            .expect("oversized local image should be skipped");
2090
2091        assert!(!result.contains_images);
2092        assert_eq!(result.messages.len(), 1);
2093        assert!(
2094            result.messages[0]
2095                .content
2096                .contains("1 attached image(s) could not be loaded")
2097        );
2098        assert!(
2099            !result.messages[0]
2100                .content
2101                .contains(image_path.to_string_lossy().as_ref())
2102        );
2103    }
2104
2105    #[tokio::test]
2106    async fn prepare_messages_keeps_successful_images_when_some_are_skipped() {
2107        let temp = tempfile::tempdir().unwrap();
2108        let image_path = temp.path().join("ok.png");
2109        std::fs::write(
2110            &image_path,
2111            [0x89, b'P', b'N', b'G', b'\r', b'\n', 0x1a, b'\n'],
2112        )
2113        .unwrap();
2114
2115        let messages = vec![ChatMessage::user(format!(
2116            "Look [IMAGE:{}] and [IMAGE:https://example.com/missing.png]",
2117            image_path.display()
2118        ))];
2119
2120        let result = prepare_messages_for_provider(&messages, &MultimodalConfig::default())
2121            .await
2122            .expect("valid local image should survive while remote image is skipped");
2123
2124        assert!(result.contains_images);
2125        assert!(
2126            result.messages[0]
2127                .content
2128                .contains("data:image/png;base64,")
2129        );
2130        assert!(
2131            result.messages[0]
2132                .content
2133                .contains("1 of 2 attached image(s) could not be loaded")
2134        );
2135        assert!(
2136            !result.messages[0]
2137                .content
2138                .contains("https://example.com/missing.png")
2139        );
2140    }
2141
2142    #[tokio::test]
2143    async fn skipped_images_do_not_consume_image_budget() {
2144        let temp = tempfile::tempdir().unwrap();
2145        let image_path = temp.path().join("older-valid.png");
2146        std::fs::write(
2147            &image_path,
2148            [0x89, b'P', b'N', b'G', b'\r', b'\n', 0x1a, b'\n'],
2149        )
2150        .unwrap();
2151
2152        let messages = vec![
2153            ChatMessage::user(format!(
2154                "Older valid image [IMAGE:{}]",
2155                image_path.display()
2156            )),
2157            ChatMessage::user(
2158                "Newer broken image [IMAGE:https://example.com/missing.png]".to_string(),
2159            ),
2160        ];
2161        let config = MultimodalConfig {
2162            max_images: 1,
2163            max_image_size_mb: 5,
2164            allow_remote_fetch: false,
2165            ..Default::default()
2166        };
2167
2168        let result = prepare_messages_for_provider(&messages, &config)
2169            .await
2170            .expect("broken image should not evict an older valid image");
2171
2172        assert!(result.contains_images);
2173        assert!(
2174            result.messages[0]
2175                .content
2176                .contains("data:image/png;base64,")
2177        );
2178        assert!(result.messages[1].content.contains("Newer broken image"));
2179        assert!(
2180            result.messages[1]
2181                .content
2182                .contains("1 attached image(s) could not be loaded")
2183        );
2184        assert!(
2185            !result.messages[1]
2186                .content
2187                .contains("https://example.com/missing.png")
2188        );
2189    }
2190
2191    #[test]
2192    fn extract_ollama_image_payload_supports_data_uris() {
2193        let payload = extract_ollama_image_payload("data:image/png;base64,abcd==")
2194            .expect("payload should be extracted");
2195        assert_eq!(payload, "abcd==");
2196    }
2197
2198    /// Stripping `[IMAGE:]` markers from history messages leaves only the text
2199    /// portion, which is the behaviour needed for non-vision model_providers.
2200    #[test]
2201    fn parse_image_markers_strips_markers_leaving_caption() {
2202        let input = "[IMAGE:/tmp/photo.jpg]\n\nDescribe this screenshot";
2203        let (cleaned, refs) = parse_image_markers(input);
2204        assert_eq!(cleaned, "Describe this screenshot");
2205        assert_eq!(refs.len(), 1);
2206        assert_eq!(refs[0], "/tmp/photo.jpg");
2207    }
2208
2209    /// An image-only message (no caption) should produce an empty string after
2210    /// marker stripping, so callers can drop it from history.
2211    #[test]
2212    fn parse_image_markers_image_only_message_becomes_empty() {
2213        let input = "[IMAGE:/tmp/photo.jpg]";
2214        let (cleaned, refs) = parse_image_markers(input);
2215        assert!(
2216            cleaned.is_empty(),
2217            "expected empty string, got: {cleaned:?}"
2218        );
2219        assert_eq!(refs.len(), 1);
2220    }
2221}