zeroclaw_channels/
link_enricher.rs

1//! Link enricher: auto-detects URLs in inbound messages, fetches their content,
2//! and prepends summaries so the agent has link context without explicit tool calls.
3
4use regex::Regex;
5use std::net::IpAddr;
6use std::sync::LazyLock;
7use std::time::Duration;
8
/// Settings that control the link-enricher stage of the message pipeline.
#[derive(Debug, Clone)]
pub struct LinkEnricherConfig {
    /// Master switch; when `false`, `enrich_message` returns its input unchanged.
    pub enabled: bool,
    /// Upper bound on the number of distinct URLs taken from one message.
    pub max_links: usize,
    /// Overall per-request timeout, in seconds, used when fetching a link.
    pub timeout_secs: u64,
}

impl Default for LinkEnricherConfig {
    /// Builds the stock configuration: enrichment switched off, at most three
    /// links per message, ten-second fetch timeout.
    fn default() -> Self {
        const DEFAULT_MAX_LINKS: usize = 3;
        const DEFAULT_TIMEOUT_SECS: u64 = 10;

        LinkEnricherConfig {
            timeout_secs: DEFAULT_TIMEOUT_SECS,
            max_links: DEFAULT_MAX_LINKS,
            enabled: false,
        }
    }
}
26
/// URL regex: matches `http://` and `https://` URLs, stopping at whitespace, angle
/// brackets, double-quotes, or single-quotes.
///
/// The scheme is matched case-sensitively (lowercase only). Every other character is
/// accepted greedily, so punctuation that directly follows a URL in running text
/// (for example a closing `.` or `)`) is part of the match.
///
/// Compiled once on first use; the pattern is a constant, so the `expect` can only
/// fire if the literal itself is malformed.
static URL_RE: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r#"https?://[^\s<>"']+"#).expect("URL regex must compile"));
31
32/// Extract URLs from message text, returning up to `max` unique URLs.
33pub fn extract_urls(text: &str, max: usize) -> Vec<String> {
34    let mut seen = Vec::new();
35    for m in URL_RE.find_iter(text) {
36        let url = m.as_str().to_string();
37        if !seen.contains(&url) {
38            seen.push(url);
39            if seen.len() >= max {
40                break;
41            }
42        }
43    }
44    seen
45}
46
47/// Returns `true` if the URL points to a private/local address that should be
48/// blocked for SSRF protection.
49pub fn is_ssrf_target(url: &str) -> bool {
50    let host = match extract_host(url) {
51        Some(h) => h,
52        None => return true, // unparseable URLs are rejected
53    };
54
55    // Check hostname-based locals
56    if host == "localhost"
57        || host.ends_with(".localhost")
58        || host.ends_with(".local")
59        || host == "local"
60    {
61        return true;
62    }
63
64    // Check IP-based private ranges
65    if let Ok(ip) = host.parse::<IpAddr>() {
66        return is_private_ip(ip);
67    }
68
69    false
70}
71
/// Extract the host portion from a URL string.
///
/// Returns the lowercased host with userinfo and port removed, or `None` when:
/// * the URL does not start with `http://` or `https://`,
/// * the authority or host is empty, or
/// * the host is a bracketed IPv6 literal (rejected for simplicity).
///
/// The authority ends at the first `/`, `?`, `#` or `\`. The backslash is included
/// because HTTP URL parsers following the WHATWG rules treat it like `/`. Anything
/// up to the last `@` in the authority is userinfo and is discarded; otherwise
/// `http://user@127.0.0.1/` would be reported with host `user@127.0.0.1`, and
/// `http://a:b@127.0.0.1/` with host `a`.
fn extract_host(url: &str) -> Option<String> {
    let rest = url
        .strip_prefix("https://")
        .or_else(|| url.strip_prefix("http://"))?;
    let authority = rest.split(['/', '?', '#', '\\']).next()?;

    // Drop `user[:password]@` so the real connection target is what gets inspected.
    let host_port = authority
        .rsplit_once('@')
        .map_or(authority, |(_userinfo, host_port)| host_port);
    if host_port.is_empty() {
        return None;
    }
    if host_port.starts_with('[') {
        // IPv6 in brackets — reject for simplicity
        return None;
    }

    // Strip port
    let host = host_port.split(':').next().unwrap_or(host_port);
    if host.is_empty() {
        return None;
    }
    Some(host.to_lowercase())
}
90
/// Check if an IP address falls within private/reserved ranges.
///
/// Returns `true` for addresses that must not be fetched on behalf of a remote
/// user: loopback, RFC 1918 private, link-local, unspecified, broadcast, multicast,
/// "this network" (`0.0.0.0/8`), carrier-grade NAT shared space (`100.64.0.0/10`),
/// IPv6 unique-local (`fc00::/7`), IPv6 link-local (`fe80::/10`), and IPv4-mapped
/// IPv6 addresses whose embedded IPv4 address is in any of the IPv4 ranges above.
fn is_private_ip(ip: IpAddr) -> bool {
    /// IPv4 range checks, shared by plain IPv4 and IPv4-mapped IPv6 addresses.
    fn is_blocked_v4(v4: std::net::Ipv4Addr) -> bool {
        let [first, second, ..] = v4.octets();
        v4.is_loopback()           // 127.0.0.0/8
            || v4.is_private()     // 10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16
            || v4.is_link_local()  // 169.254.0.0/16
            || v4.is_unspecified() // 0.0.0.0
            || v4.is_broadcast()   // 255.255.255.255
            || v4.is_multicast()   // 224.0.0.0/4
            || first == 0          // 0.0.0.0/8
            || (first == 100 && (second & 0xc0) == 64) // 100.64.0.0/10
    }

    match ip {
        IpAddr::V4(v4) => is_blocked_v4(v4),
        IpAddr::V6(v6) => {
            let leading = v6.segments()[0];
            v6.is_loopback()       // ::1
                || v6.is_unspecified() // ::
                || v6.is_multicast()
                || (leading & 0xfe00) == 0xfc00 // fc00::/7 unique local
                || (leading & 0xffc0) == 0xfe80 // fe80::/10 link-local
                // IPv4-mapped addresses get the full IPv4 rule set.
                || v6.to_ipv4_mapped().is_some_and(is_blocked_v4)
        }
    }
}
116
117/// Extract the `<title>` tag content from HTML.
118pub fn extract_title(html: &str) -> Option<String> {
119    // Case-insensitive search for <title>...</title>
120    let lower = html.to_lowercase();
121    let start = lower.find("<title")? + "<title".len();
122    // Skip attributes if any (e.g. <title lang="en">)
123    let start = lower[start..].find('>')? + start + 1;
124    let end = lower[start..].find("</title")? + start;
125    let title = lower[start..end].trim().to_string();
126    if title.is_empty() {
127        None
128    } else {
129        Some(html_entity_decode_basic(&title))
130    }
131}
132
133/// Extract the first `max_chars` of visible body text from HTML.
134pub fn extract_body_text(html: &str, max_chars: usize) -> String {
135    let text = nanohtml2text::html2text(html);
136    let trimmed = text.trim();
137    if trimmed.len() <= max_chars {
138        trimmed.to_string()
139    } else {
140        let mut result: String = trimmed.chars().take(max_chars).collect();
141        result.push_str("...");
142        result
143    }
144}
145
/// Basic HTML entity decoding for title content.
///
/// Decodes `&lt;`, `&gt;`, `&quot;`, `&#39;`, `&apos;` and `&amp;`. All other
/// entities are left untouched.
///
/// `&amp;` is decoded last on purpose: decoding it first would turn an escaped
/// entity such as `&amp;lt;` into `&lt;` and then into `<`, decoding the text twice.
fn html_entity_decode_basic(s: &str) -> String {
    s.replace("&lt;", "<")
        .replace("&gt;", ">")
        .replace("&quot;", "\"")
        .replace("&#39;", "'")
        .replace("&apos;", "'")
        .replace("&amp;", "&")
}
155
/// Summary of a fetched link.
///
/// Produced by `fetch_link_summary` and rendered by `enrich_message` as
/// `[Link: <title> — <snippet>]`.
struct LinkSummary {
    /// Page title taken from the `<title>` element, or `"Untitled"` when the page
    /// has none.
    title: String,
    /// Leading excerpt of the page's visible text (truncated by `extract_body_text`).
    snippet: String,
}
161
162/// Fetch a single URL and extract a summary. Returns `None` on any failure.
163async fn fetch_link_summary(url: &str, timeout_secs: u64) -> Option<LinkSummary> {
164    let client = reqwest::Client::builder()
165        .timeout(Duration::from_secs(timeout_secs))
166        .connect_timeout(Duration::from_secs(5))
167        .redirect(reqwest::redirect::Policy::limited(5))
168        .user_agent("ZeroClaw/0.1 (link-enricher)")
169        .build()
170        .ok()?;
171
172    let response = client.get(url).send().await.ok()?;
173    if !response.status().is_success() {
174        return None;
175    }
176
177    // Only process text/html responses
178    let content_type = response
179        .headers()
180        .get(reqwest::header::CONTENT_TYPE)
181        .and_then(|v| v.to_str().ok())
182        .unwrap_or("")
183        .to_lowercase();
184
185    if !content_type.contains("text/html") && !content_type.is_empty() {
186        return None;
187    }
188
189    // Read up to 256KB to extract title and snippet
190    let max_bytes: usize = 256 * 1024;
191    let bytes = response.bytes().await.ok()?;
192    let body = if bytes.len() > max_bytes {
193        String::from_utf8_lossy(&bytes[..max_bytes]).into_owned()
194    } else {
195        String::from_utf8_lossy(&bytes).into_owned()
196    };
197
198    let title = extract_title(&body).unwrap_or_else(|| "Untitled".to_string());
199    let snippet = extract_body_text(&body, 200);
200
201    Some(LinkSummary { title, snippet })
202}
203
204/// Enrich a message by prepending link summaries for any URLs found in the text.
205///
206/// This is the main entry point called from the channel message processing pipeline.
207/// If the enricher is disabled or no URLs are found, the original message is returned
208/// unchanged.
209pub async fn enrich_message(content: &str, config: &LinkEnricherConfig) -> String {
210    if !config.enabled || config.max_links == 0 {
211        return content.to_string();
212    }
213
214    let urls = extract_urls(content, config.max_links);
215    if urls.is_empty() {
216        return content.to_string();
217    }
218
219    // Filter out SSRF targets
220    let safe_urls: Vec<&str> = urls
221        .iter()
222        .filter(|u| !is_ssrf_target(u))
223        .map(|u| u.as_str())
224        .collect();
225    if safe_urls.is_empty() {
226        return content.to_string();
227    }
228
229    let mut enrichments = Vec::new();
230    for url in safe_urls {
231        match fetch_link_summary(url, config.timeout_secs).await {
232            Some(summary) => {
233                enrichments.push(format!("[Link: {} — {}]", summary.title, summary.snippet));
234            }
235            None => {
236                ::zeroclaw_log::record!(
237                    DEBUG,
238                    ::zeroclaw_log::Event::new(module_path!(), ::zeroclaw_log::Action::Note)
239                        .with_attrs(::serde_json::json!({"url": url})),
240                    "Link enricher: failed to fetch or extract summary"
241                );
242            }
243        }
244    }
245
246    if enrichments.is_empty() {
247        return content.to_string();
248    }
249
250    let prefix = enrichments.join("\n");
251    format!("{prefix}\n{content}")
252}
253
// Unit tests for the link enricher. None of them performs a network request: the
// async `enrich_message` tests only cover paths that return before any fetch
// (disabled config, no URLs, all URLs rejected by the SSRF filter).
#[cfg(test)]
mod tests {
    use super::*;

    // ── URL extraction ──────────────────────────────────────────────

    #[test]
    fn extract_urls_finds_http_and_https() {
        let text = "Check https://example.com and http://test.org/page for info";
        let urls = extract_urls(text, 10);
        assert_eq!(urls, vec!["https://example.com", "http://test.org/page",]);
    }

    #[test]
    fn extract_urls_respects_max() {
        let text = "https://a.com https://b.com https://c.com https://d.com";
        let urls = extract_urls(text, 2);
        // Only the first two URLs, in message order, are kept.
        assert_eq!(urls.len(), 2);
        assert_eq!(urls[0], "https://a.com");
        assert_eq!(urls[1], "https://b.com");
    }

    #[test]
    fn extract_urls_deduplicates() {
        let text = "Visit https://example.com and https://example.com again";
        let urls = extract_urls(text, 10);
        assert_eq!(urls.len(), 1);
    }

    #[test]
    fn extract_urls_handles_no_urls() {
        let text = "Just a normal message without links";
        let urls = extract_urls(text, 10);
        assert!(urls.is_empty());
    }

    #[test]
    fn extract_urls_stops_at_angle_brackets() {
        let text = "Link: <https://example.com/path> done";
        let urls = extract_urls(text, 10);
        assert_eq!(urls, vec!["https://example.com/path"]);
    }

    #[test]
    fn extract_urls_stops_at_quotes() {
        let text = r#"href="https://example.com/page" end"#;
        let urls = extract_urls(text, 10);
        assert_eq!(urls, vec!["https://example.com/page"]);
    }

    // ── SSRF protection ─────────────────────────────────────────────

    #[test]
    fn ssrf_blocks_localhost() {
        assert!(is_ssrf_target("http://localhost/admin"));
        assert!(is_ssrf_target("https://localhost:8080/api"));
    }

    #[test]
    fn ssrf_blocks_loopback_ip() {
        assert!(is_ssrf_target("http://127.0.0.1/secret"));
        assert!(is_ssrf_target("http://127.0.0.2:9090"));
    }

    #[test]
    fn ssrf_blocks_private_10_network() {
        assert!(is_ssrf_target("http://10.0.0.1/internal"));
        assert!(is_ssrf_target("http://10.255.255.255"));
    }

    #[test]
    fn ssrf_blocks_private_172_network() {
        assert!(is_ssrf_target("http://172.16.0.1/admin"));
        assert!(is_ssrf_target("http://172.31.255.255"));
    }

    #[test]
    fn ssrf_blocks_private_192_168_network() {
        assert!(is_ssrf_target("http://192.168.1.1/router"));
        assert!(is_ssrf_target("http://192.168.0.100:3000"));
    }

    #[test]
    fn ssrf_blocks_link_local() {
        assert!(is_ssrf_target("http://169.254.0.1/metadata"));
        assert!(is_ssrf_target("http://169.254.169.254/latest"));
    }

    #[test]
    fn ssrf_blocks_ipv6_loopback() {
        // IPv6 in brackets is rejected by extract_host
        assert!(is_ssrf_target("http://[::1]/admin"));
    }

    #[test]
    fn ssrf_blocks_dot_local() {
        assert!(is_ssrf_target("http://myhost.local/api"));
    }

    #[test]
    fn ssrf_allows_public_urls() {
        assert!(!is_ssrf_target("https://example.com/page"));
        assert!(!is_ssrf_target("https://www.google.com"));
        assert!(!is_ssrf_target("http://93.184.216.34/resource"));
    }

    // ── Title extraction ────────────────────────────────────────────
    //
    // `extract_title` slices the lowercased copy of the document, so the expected
    // titles below are lowercase.

    #[test]
    fn extract_title_basic() {
        let html = "<html><head><title>My Page Title</title></head><body>Hello</body></html>";
        assert_eq!(extract_title(html), Some("my page title".to_string()));
    }

    #[test]
    fn extract_title_with_entities() {
        let html = "<title>Tom &amp; Jerry&#39;s Page</title>";
        assert_eq!(extract_title(html), Some("tom & jerry's page".to_string()));
    }

    #[test]
    fn extract_title_case_insensitive() {
        let html = "<HTML><HEAD><TITLE>Upper Case</TITLE></HEAD></HTML>";
        assert_eq!(extract_title(html), Some("upper case".to_string()));
    }

    #[test]
    fn extract_title_multibyte_chars_no_panic() {
        // İ (U+0130) lowercases to 2 chars, changing byte length.
        // This must not panic or produce wrong offsets.
        let html = "<title>İstanbul Guide</title>";
        let result = extract_title(html);
        assert!(result.is_some());
        let title = result.unwrap();
        assert!(title.contains("stanbul"));
    }

    #[test]
    fn extract_title_missing() {
        let html = "<html><body>No title here</body></html>";
        assert_eq!(extract_title(html), None);
    }

    #[test]
    fn extract_title_empty() {
        // A whitespace-only title counts as no title.
        let html = "<title>   </title>";
        assert_eq!(extract_title(html), None);
    }

    // ── Body text extraction ────────────────────────────────────────

    #[test]
    fn extract_body_text_strips_html() {
        let html = "<html><body><h1>Header</h1><p>Some content here</p></body></html>";
        let text = extract_body_text(html, 200);
        assert!(text.contains("Header"));
        assert!(text.contains("Some content"));
        assert!(!text.contains("<h1>"));
    }

    #[test]
    fn extract_body_text_truncates() {
        let html = "<p>A very long paragraph that should be truncated to fit within the limit.</p>";
        let text = extract_body_text(html, 20);
        assert!(text.len() <= 25); // 20 chars + "..."
        assert!(text.ends_with("..."));
    }

    // ── Config toggle ───────────────────────────────────────────────

    #[tokio::test]
    async fn enrich_message_disabled_returns_original() {
        let config = LinkEnricherConfig {
            enabled: false,
            max_links: 3,
            timeout_secs: 10,
        };
        let msg = "Check https://example.com for details";
        let result = enrich_message(msg, &config).await;
        assert_eq!(result, msg);
    }

    #[tokio::test]
    async fn enrich_message_no_urls_returns_original() {
        let config = LinkEnricherConfig {
            enabled: true,
            max_links: 3,
            timeout_secs: 10,
        };
        let msg = "No links in this message";
        let result = enrich_message(msg, &config).await;
        assert_eq!(result, msg);
    }

    #[tokio::test]
    async fn enrich_message_ssrf_urls_returns_original() {
        let config = LinkEnricherConfig {
            enabled: true,
            max_links: 3,
            timeout_secs: 10,
        };
        // Both URLs are filtered out before any fetch is attempted.
        let msg = "Try http://127.0.0.1/admin and http://192.168.1.1/router";
        let result = enrich_message(msg, &config).await;
        assert_eq!(result, msg);
    }

    #[test]
    fn default_config_is_disabled() {
        let config = LinkEnricherConfig::default();
        assert!(!config.enabled);
        assert_eq!(config.max_links, 3);
        assert_eq!(config.timeout_secs, 10);
    }
}
zeroclaw_channels/link_enricher.rs

zeroclaw_channels/
link_enricher.rs