zeroclaw_channels/
link_enricher.rs1use regex::Regex;
5use std::net::IpAddr;
6use std::sync::LazyLock;
7use std::time::Duration;
8
/// Settings that control link enrichment of incoming messages.
#[derive(Debug, Clone)]
pub struct LinkEnricherConfig {
    /// Master switch; when `false`, messages are returned unchanged.
    pub enabled: bool,
    /// Upper bound on how many distinct URLs are extracted from one message.
    pub max_links: usize,
    /// Overall per-request timeout, in seconds, used when fetching a link.
    pub timeout_secs: u64,
}
16
17impl Default for LinkEnricherConfig {
18 fn default() -> Self {
19 Self {
20 enabled: false,
21 max_links: 3,
22 timeout_secs: 10,
23 }
24 }
25}
26
27static URL_RE: LazyLock<Regex> =
30 LazyLock::new(|| Regex::new(r#"https?://[^\s<>"']+"#).expect("URL regex must compile"));
31
32pub fn extract_urls(text: &str, max: usize) -> Vec<String> {
34 let mut seen = Vec::new();
35 for m in URL_RE.find_iter(text) {
36 let url = m.as_str().to_string();
37 if !seen.contains(&url) {
38 seen.push(url);
39 if seen.len() >= max {
40 break;
41 }
42 }
43 }
44 seen
45}
46
47pub fn is_ssrf_target(url: &str) -> bool {
50 let host = match extract_host(url) {
51 Some(h) => h,
52 None => return true, };
54
55 if host == "localhost"
57 || host.ends_with(".localhost")
58 || host.ends_with(".local")
59 || host == "local"
60 {
61 return true;
62 }
63
64 if let Ok(ip) = host.parse::<IpAddr>() {
66 return is_private_ip(ip);
67 }
68
69 false
70}
71
/// Extracts the lower-cased host component from an `http://` or `https://`
/// URL, without the userinfo and port.
///
/// Returns `None` — which callers treat as "unsafe" — when:
/// * the scheme is not (lower-case) `http` or `https`;
/// * the authority or the host is empty;
/// * the host is a bracketed IPv6 literal (rejected outright rather than
///   validated);
/// * the host contains `%`, since a percent-encoded host could decode to an
///   internal name or address after this check has run.
fn extract_host(url: &str) -> Option<String> {
    let rest = url
        .strip_prefix("https://")
        .or_else(|| url.strip_prefix("http://"))?;

    // '\' is included because URL parsers following the WHATWG rules treat it
    // like '/' for http(s); otherwise "127.0.0.1\.example.com" would be read
    // here as a harmless domain name.
    let authority = rest.split(['/', '\\', '?', '#']).next()?;
    if authority.is_empty() {
        return None;
    }

    // Drop "user:pass@" so the credentials are not mistaken for the host
    // (e.g. "http://user:pass@127.0.0.1/"). The host follows the last '@'.
    let host_port = authority
        .rsplit_once('@')
        .map_or(authority, |(_userinfo, tail)| tail);

    if host_port.starts_with('[') {
        return None;
    }

    let host = host_port.split(':').next().unwrap_or(host_port);
    if host.is_empty() || host.contains('%') {
        return None;
    }
    Some(host.to_lowercase())
}
90
/// Returns `true` when `ip` belongs to a range that must never be fetched on
/// behalf of a user: loopback, private, link-local, unspecified, broadcast,
/// multicast, "this network" (0.0.0.0/8), carrier-grade NAT (100.64.0.0/10),
/// IPv6 unique-local (fc00::/7), IPv6 link-local (fe80::/10), or an
/// IPv4-mapped IPv6 address whose embedded IPv4 address is any of the above.
fn is_private_ip(ip: IpAddr) -> bool {
    /// Shared IPv4 rule set, used for native and IPv4-mapped addresses alike
    /// so the two paths cannot drift apart.
    fn is_internal_v4(v4: std::net::Ipv4Addr) -> bool {
        let [first, second, ..] = v4.octets();
        v4.is_loopback()
            || v4.is_private()
            || v4.is_link_local()
            || v4.is_unspecified()
            || v4.is_broadcast()
            || v4.is_multicast()
            // 0.0.0.0/8 — "this network"
            || first == 0
            // 100.64.0.0/10 — shared address space (carrier-grade NAT)
            || (first == 100 && (second & 0xc0) == 64)
    }

    match ip {
        IpAddr::V4(v4) => is_internal_v4(v4),
        IpAddr::V6(v6) => {
            let leading = v6.segments()[0];
            v6.is_loopback()
                || v6.is_unspecified()
                || v6.is_multicast()
                // fc00::/7 — unique local addresses
                || (leading & 0xfe00) == 0xfc00
                // fe80::/10 — link-local unicast
                || (leading & 0xffc0) == 0xfe80
                || v6.to_ipv4_mapped().is_some_and(is_internal_v4)
        }
    }
}
116
117pub fn extract_title(html: &str) -> Option<String> {
119 let lower = html.to_lowercase();
121 let start = lower.find("<title")? + "<title".len();
122 let start = lower[start..].find('>')? + start + 1;
124 let end = lower[start..].find("</title")? + start;
125 let title = lower[start..end].trim().to_string();
126 if title.is_empty() {
127 None
128 } else {
129 Some(html_entity_decode_basic(&title))
130 }
131}
132
133pub fn extract_body_text(html: &str, max_chars: usize) -> String {
135 let text = nanohtml2text::html2text(html);
136 let trimmed = text.trim();
137 if trimmed.len() <= max_chars {
138 trimmed.to_string()
139 } else {
140 let mut result: String = trimmed.chars().take(max_chars).collect();
141 result.push_str("...");
142 result
143 }
144}
145
/// Decodes the handful of HTML entities that commonly appear in page titles:
/// `&lt;`, `&gt;`, `&quot;`, `&#39;`, `&apos;` and `&amp;`.
///
/// Any other entity is left untouched.
fn html_entity_decode_basic(s: &str) -> String {
    // `&amp;` is decoded last: doing it first would turn the escaped text
    // "&amp;lt;" into "&lt;" and then wrongly into "<" (double decoding).
    s.replace("&lt;", "<")
        .replace("&gt;", ">")
        .replace("&quot;", "\"")
        .replace("&#39;", "'")
        .replace("&apos;", "'")
        .replace("&amp;", "&")
}
155
/// What was learned about a fetched link.
struct LinkSummary {
    /// Page title, or a placeholder chosen by the fetcher when none was found.
    title: String,
    /// Short plain-text excerpt of the page body.
    snippet: String,
}
161
162async fn fetch_link_summary(url: &str, timeout_secs: u64) -> Option<LinkSummary> {
164 let client = reqwest::Client::builder()
165 .timeout(Duration::from_secs(timeout_secs))
166 .connect_timeout(Duration::from_secs(5))
167 .redirect(reqwest::redirect::Policy::limited(5))
168 .user_agent("ZeroClaw/0.1 (link-enricher)")
169 .build()
170 .ok()?;
171
172 let response = client.get(url).send().await.ok()?;
173 if !response.status().is_success() {
174 return None;
175 }
176
177 let content_type = response
179 .headers()
180 .get(reqwest::header::CONTENT_TYPE)
181 .and_then(|v| v.to_str().ok())
182 .unwrap_or("")
183 .to_lowercase();
184
185 if !content_type.contains("text/html") && !content_type.is_empty() {
186 return None;
187 }
188
189 let max_bytes: usize = 256 * 1024;
191 let bytes = response.bytes().await.ok()?;
192 let body = if bytes.len() > max_bytes {
193 String::from_utf8_lossy(&bytes[..max_bytes]).into_owned()
194 } else {
195 String::from_utf8_lossy(&bytes).into_owned()
196 };
197
198 let title = extract_title(&body).unwrap_or_else(|| "Untitled".to_string());
199 let snippet = extract_body_text(&body, 200);
200
201 Some(LinkSummary { title, snippet })
202}
203
204pub async fn enrich_message(content: &str, config: &LinkEnricherConfig) -> String {
210 if !config.enabled || config.max_links == 0 {
211 return content.to_string();
212 }
213
214 let urls = extract_urls(content, config.max_links);
215 if urls.is_empty() {
216 return content.to_string();
217 }
218
219 let safe_urls: Vec<&str> = urls
221 .iter()
222 .filter(|u| !is_ssrf_target(u))
223 .map(|u| u.as_str())
224 .collect();
225 if safe_urls.is_empty() {
226 return content.to_string();
227 }
228
229 let mut enrichments = Vec::new();
230 for url in safe_urls {
231 match fetch_link_summary(url, config.timeout_secs).await {
232 Some(summary) => {
233 enrichments.push(format!("[Link: {} — {}]", summary.title, summary.snippet));
234 }
235 None => {
236 ::zeroclaw_log::record!(
237 DEBUG,
238 ::zeroclaw_log::Event::new(module_path!(), ::zeroclaw_log::Action::Note)
239 .with_attrs(::serde_json::json!({"url": url})),
240 "Link enricher: failed to fetch or extract summary"
241 );
242 }
243 }
244 }
245
246 if enrichments.is_empty() {
247 return content.to_string();
248 }
249
250 let prefix = enrichments.join("\n");
251 format!("{prefix}\n{content}")
252}
253
/// Unit tests for URL extraction, SSRF screening, HTML parsing helpers and
/// the early-return paths of `enrich_message` (no test performs network I/O).
#[cfg(test)]
mod tests {
    use super::*;

    // ── URL extraction ──────────────────────────────────────────────

    #[test]
    fn extract_urls_finds_http_and_https() {
        let text = "Check https://example.com and http://test.org/page for info";
        let urls = extract_urls(text, 10);
        assert_eq!(urls, vec!["https://example.com", "http://test.org/page",]);
    }

    #[test]
    fn extract_urls_respects_max() {
        let text = "https://a.com https://b.com https://c.com https://d.com";
        let urls = extract_urls(text, 2);
        assert_eq!(urls.len(), 2);
        assert_eq!(urls[0], "https://a.com");
        assert_eq!(urls[1], "https://b.com");
    }

    #[test]
    fn extract_urls_deduplicates() {
        let text = "Visit https://example.com and https://example.com again";
        let urls = extract_urls(text, 10);
        assert_eq!(urls.len(), 1);
    }

    #[test]
    fn extract_urls_handles_no_urls() {
        let text = "Just a normal message without links";
        let urls = extract_urls(text, 10);
        assert!(urls.is_empty());
    }

    #[test]
    fn extract_urls_stops_at_angle_brackets() {
        let text = "Link: <https://example.com/path> done";
        let urls = extract_urls(text, 10);
        assert_eq!(urls, vec!["https://example.com/path"]);
    }

    #[test]
    fn extract_urls_stops_at_quotes() {
        let text = r#"href="https://example.com/page" end"#;
        let urls = extract_urls(text, 10);
        assert_eq!(urls, vec!["https://example.com/page"]);
    }

    // ── SSRF screening ──────────────────────────────────────────────

    #[test]
    fn ssrf_blocks_localhost() {
        assert!(is_ssrf_target("http://localhost/admin"));
        assert!(is_ssrf_target("https://localhost:8080/api"));
    }

    #[test]
    fn ssrf_blocks_loopback_ip() {
        assert!(is_ssrf_target("http://127.0.0.1/secret"));
        assert!(is_ssrf_target("http://127.0.0.2:9090"));
    }

    #[test]
    fn ssrf_blocks_private_10_network() {
        assert!(is_ssrf_target("http://10.0.0.1/internal"));
        assert!(is_ssrf_target("http://10.255.255.255"));
    }

    #[test]
    fn ssrf_blocks_private_172_network() {
        assert!(is_ssrf_target("http://172.16.0.1/admin"));
        assert!(is_ssrf_target("http://172.31.255.255"));
    }

    #[test]
    fn ssrf_blocks_private_192_168_network() {
        assert!(is_ssrf_target("http://192.168.1.1/router"));
        assert!(is_ssrf_target("http://192.168.0.100:3000"));
    }

    #[test]
    fn ssrf_blocks_link_local() {
        assert!(is_ssrf_target("http://169.254.0.1/metadata"));
        assert!(is_ssrf_target("http://169.254.169.254/latest"));
    }

    #[test]
    fn ssrf_blocks_ipv6_loopback() {
        // Bracketed IPv6 literals are rejected wholesale by the host parser.
        assert!(is_ssrf_target("http://[::1]/admin"));
    }

    #[test]
    fn ssrf_blocks_dot_local() {
        assert!(is_ssrf_target("http://myhost.local/api"));
    }

    #[test]
    fn ssrf_allows_public_urls() {
        assert!(!is_ssrf_target("https://example.com/page"));
        assert!(!is_ssrf_target("https://www.google.com"));
        assert!(!is_ssrf_target("http://93.184.216.34/resource"));
    }

    // ── Title extraction ────────────────────────────────────────────

    #[test]
    fn extract_title_basic() {
        let html = "<html><head><title>My Page Title</title></head><body>Hello</body></html>";
        assert_eq!(extract_title(html), Some("my page title".to_string()));
    }

    #[test]
    fn extract_title_with_entities() {
        // The input must carry real entities, otherwise the decode step is
        // never exercised by this test.
        let html = "<title>Tom &amp; Jerry&#39;s Page</title>";
        assert_eq!(extract_title(html), Some("tom & jerry's page".to_string()));
    }

    #[test]
    fn extract_title_case_insensitive() {
        let html = "<HTML><HEAD><TITLE>Upper Case</TITLE></HEAD></HTML>";
        assert_eq!(extract_title(html), Some("upper case".to_string()));
    }

    #[test]
    fn extract_title_multibyte_chars_no_panic() {
        // Lower-casing 'İ' changes its byte length, which would break any
        // implementation that mixes offsets from the lowered and original text.
        let html = "<title>İstanbul Guide</title>";
        let result = extract_title(html);
        assert!(result.is_some());
        let title = result.unwrap();
        assert!(title.contains("stanbul"));
    }

    #[test]
    fn extract_title_missing() {
        let html = "<html><body>No title here</body></html>";
        assert_eq!(extract_title(html), None);
    }

    #[test]
    fn extract_title_empty() {
        let html = "<title> </title>";
        assert_eq!(extract_title(html), None);
    }

    // ── Body text extraction ────────────────────────────────────────

    #[test]
    fn extract_body_text_strips_html() {
        let html = "<html><body><h1>Header</h1><p>Some content here</p></body></html>";
        let text = extract_body_text(html, 200);
        assert!(text.contains("Header"));
        assert!(text.contains("Some content"));
        assert!(!text.contains("<h1>"));
    }

    #[test]
    fn extract_body_text_truncates() {
        let html = "<p>A very long paragraph that should be truncated to fit within the limit.</p>";
        let text = extract_body_text(html, 20);
        // 20 characters plus the "..." suffix, with a little slack.
        assert!(text.len() <= 25);
        assert!(text.ends_with("..."));
    }

    // ── enrich_message early returns ────────────────────────────────

    #[tokio::test]
    async fn enrich_message_disabled_returns_original() {
        let config = LinkEnricherConfig {
            enabled: false,
            max_links: 3,
            timeout_secs: 10,
        };
        let msg = "Check https://example.com for details";
        let result = enrich_message(msg, &config).await;
        assert_eq!(result, msg);
    }

    #[tokio::test]
    async fn enrich_message_no_urls_returns_original() {
        let config = LinkEnricherConfig {
            enabled: true,
            max_links: 3,
            timeout_secs: 10,
        };
        let msg = "No links in this message";
        let result = enrich_message(msg, &config).await;
        assert_eq!(result, msg);
    }

    #[tokio::test]
    async fn enrich_message_ssrf_urls_returns_original() {
        let config = LinkEnricherConfig {
            enabled: true,
            max_links: 3,
            timeout_secs: 10,
        };
        let msg = "Try http://127.0.0.1/admin and http://192.168.1.1/router";
        let result = enrich_message(msg, &config).await;
        assert_eq!(result, msg);
    }

    // ── Configuration ───────────────────────────────────────────────

    #[test]
    fn default_config_is_disabled() {
        let config = LinkEnricherConfig::default();
        assert!(!config.enabled);
        assert_eq!(config.max_links, 3);
        assert_eq!(config.timeout_secs, 10);
    }
}