Skip to main content

zeroclaw_runtime/security/
leak_detector.rs

1//! Credential leak detection for outbound content.
2//!
3//! Scans outbound messages for potential credential leaks before they are sent,
4//! preventing accidental exfiltration of API keys, tokens, passwords, and other
5//! sensitive values.
6//!
7//! Contributed from RustyClaw (MIT licensed).
8
9use regex::Regex;
10use std::collections::HashMap;
11use std::sync::OnceLock;
12
/// Shortest token, in bytes, that the high-entropy check will even consider;
/// anything shorter is skipped without computing its entropy.
const ENTROPY_TOKEN_MIN_LEN: usize = 24;
15
/// Outcome of scanning a piece of outbound content for credential leaks.
///
/// Implements `PartialEq`/`Eq` so callers and tests can compare results
/// directly, e.g. `assert_eq!(result, LeakResult::Clean)`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum LeakResult {
    /// Nothing resembling a credential was found.
    Clean,
    /// At least one check fired.
    Detected {
        /// Human-readable name of every check that fired, in the order the
        /// checks ran.
        patterns: Vec<String>,
        /// The scanned content with matched values replaced by
        /// `[REDACTED_*]` placeholders.
        redacted: String,
    },
}
29
/// Scanner that looks for credentials in content that is about to be sent out.
#[derive(Debug, Clone)]
pub struct LeakDetector {
    /// Detection sensitivity in `0.0..=1.0`; a higher value is meant to make
    /// detection more aggressive. It gates the generic secret patterns
    /// (reported only above `0.5`) and feeds the high-entropy threshold.
    sensitivity: f64,
}
36
37impl Default for LeakDetector {
38    fn default() -> Self {
39        Self::new()
40    }
41}
42
43impl LeakDetector {
44    /// Create a new leak detector with default sensitivity.
45    pub fn new() -> Self {
46        Self { sensitivity: 0.7 }
47    }
48
49    /// Create a detector with custom sensitivity.
50    pub fn with_sensitivity(sensitivity: f64) -> Self {
51        Self {
52            sensitivity: sensitivity.clamp(0.0, 1.0),
53        }
54    }
55
56    /// Scan content for potential credential leaks.
57    pub fn scan(&self, content: &str) -> LeakResult {
58        let mut patterns = Vec::new();
59        let mut redacted = content.to_string();
60
61        // Check each pattern type
62        self.check_api_keys(content, &mut patterns, &mut redacted);
63        self.check_aws_credentials(content, &mut patterns, &mut redacted);
64        self.check_generic_secrets(content, &mut patterns, &mut redacted);
65        self.check_private_keys(content, &mut patterns, &mut redacted);
66        self.check_jwt_tokens(content, &mut patterns, &mut redacted);
67        self.check_database_urls(content, &mut patterns, &mut redacted);
68        self.check_high_entropy_tokens(content, &mut patterns, &mut redacted);
69
70        if patterns.is_empty() {
71            LeakResult::Clean
72        } else {
73            LeakResult::Detected { patterns, redacted }
74        }
75    }
76
77    /// Check for common API key patterns.
78    fn check_api_keys(&self, content: &str, patterns: &mut Vec<String>, redacted: &mut String) {
79        static API_KEY_PATTERNS: OnceLock<Vec<(Regex, &'static str)>> = OnceLock::new();
80        let regexes = API_KEY_PATTERNS.get_or_init(|| {
81            vec![
82                // Stripe
83                (
84                    Regex::new(r"sk_(live|test)_[a-zA-Z0-9]{24,}").unwrap(),
85                    "Stripe secret key",
86                ),
87                (
88                    Regex::new(r"pk_(live|test)_[a-zA-Z0-9]{24,}").unwrap(),
89                    "Stripe publishable key",
90                ),
91                // OpenAI
92                (
93                    Regex::new(r"sk-[a-zA-Z0-9]{20,}T3BlbkFJ[a-zA-Z0-9]{20,}").unwrap(),
94                    "OpenAI API key",
95                ),
96                (
97                    Regex::new(r"sk-[a-zA-Z0-9]{48,}").unwrap(),
98                    "OpenAI-style API key",
99                ),
100                // Anthropic
101                (
102                    Regex::new(r"sk-ant-[a-zA-Z0-9-_]{32,}").unwrap(),
103                    "Anthropic API key",
104                ),
105                // Groq
106                (Regex::new(r"gsk_[a-zA-Z0-9]{20,}").unwrap(), "Groq API key"),
107                // Google
108                (
109                    Regex::new(r"AIza[a-zA-Z0-9_-]{35}").unwrap(),
110                    "Google API key",
111                ),
112                // GitHub
113                (
114                    Regex::new(r"gh[pousr]_[a-zA-Z0-9]{36,}").unwrap(),
115                    "GitHub token",
116                ),
117                (
118                    Regex::new(r"github_pat_[a-zA-Z0-9_]{22,}").unwrap(),
119                    "GitHub PAT",
120                ),
121                // Generic
122                (
123                    Regex::new(r#"api[_-]?key[=:]\s*['"]*[a-zA-Z0-9_-]{20,}"#).unwrap(),
124                    "Generic API key",
125                ),
126            ]
127        });
128
129        for (regex, name) in regexes {
130            if regex.is_match(content) {
131                patterns.push(String::from(*name));
132                *redacted = regex
133                    .replace_all(redacted, "[REDACTED_API_KEY]")
134                    .to_string();
135            }
136        }
137    }
138
139    /// Check for AWS credentials.
140    fn check_aws_credentials(
141        &self,
142        content: &str,
143        patterns: &mut Vec<String>,
144        redacted: &mut String,
145    ) {
146        static AWS_PATTERNS: OnceLock<Vec<(Regex, &'static str)>> = OnceLock::new();
147        let regexes = AWS_PATTERNS.get_or_init(|| {
148            vec![
149                (
150                    Regex::new(r"AKIA[A-Z0-9]{16}").unwrap(),
151                    "AWS Access Key ID",
152                ),
153                (
154                    Regex::new(
155                        r#"aws[_-]?secret[_-]?access[_-]?key[=:]\s*['"]*[a-zA-Z0-9/+=]{40}"#,
156                    )
157                    .unwrap(),
158                    "AWS Secret Access Key",
159                ),
160            ]
161        });
162
163        for (regex, name) in regexes {
164            if regex.is_match(content) {
165                patterns.push(String::from(*name));
166                *redacted = regex
167                    .replace_all(redacted, "[REDACTED_AWS_CREDENTIAL]")
168                    .to_string();
169            }
170        }
171    }
172
173    /// Check for generic secret patterns.
174    fn check_generic_secrets(
175        &self,
176        content: &str,
177        patterns: &mut Vec<String>,
178        redacted: &mut String,
179    ) {
180        static SECRET_PATTERNS: OnceLock<Vec<(Regex, &'static str)>> = OnceLock::new();
181        let regexes = SECRET_PATTERNS.get_or_init(|| {
182            vec![
183                (
184                    Regex::new(r#"(?i)password[=:]\s*['"]*[^\s'"]{8,}"#).unwrap(),
185                    "Password in config",
186                ),
187                (
188                    Regex::new(r#"(?i)secret[=:]\s*['"]*[a-zA-Z0-9_-]{16,}"#).unwrap(),
189                    "Secret value",
190                ),
191                (
192                    Regex::new(r#"(?i)token[=:]\s*['"]*[a-zA-Z0-9_.-]{20,}"#).unwrap(),
193                    "Token value",
194                ),
195            ]
196        });
197
198        for (regex, name) in regexes {
199            if regex.is_match(content) && self.sensitivity > 0.5 {
200                patterns.push(String::from(*name));
201                *redacted = regex.replace_all(redacted, "[REDACTED_SECRET]").to_string();
202            }
203        }
204    }
205
206    /// Check for private keys.
207    fn check_private_keys(&self, content: &str, patterns: &mut Vec<String>, redacted: &mut String) {
208        // PEM-encoded private keys
209        let key_patterns = [
210            (
211                "-----BEGIN RSA PRIVATE KEY-----",
212                "-----END RSA PRIVATE KEY-----",
213                "RSA private key",
214            ),
215            (
216                "-----BEGIN EC PRIVATE KEY-----",
217                "-----END EC PRIVATE KEY-----",
218                "EC private key",
219            ),
220            (
221                "-----BEGIN PRIVATE KEY-----",
222                "-----END PRIVATE KEY-----",
223                "Private key",
224            ),
225            (
226                "-----BEGIN OPENSSH PRIVATE KEY-----",
227                "-----END OPENSSH PRIVATE KEY-----",
228                "OpenSSH private key",
229            ),
230        ];
231
232        for (begin, end, name) in key_patterns {
233            if content.contains(begin) && content.contains(end) {
234                patterns.push(name.to_string());
235                // Redact the entire key block
236                if let Some(start_idx) = content.find(begin)
237                    && let Some(end_idx) = content.find(end)
238                {
239                    let key_block = &content[start_idx..end_idx + end.len()];
240                    *redacted = redacted.replace(key_block, "[REDACTED_PRIVATE_KEY]");
241                }
242            }
243        }
244    }
245
246    /// Check for JWT tokens.
247    fn check_jwt_tokens(&self, content: &str, patterns: &mut Vec<String>, redacted: &mut String) {
248        static JWT_PATTERN: OnceLock<Regex> = OnceLock::new();
249        let regex = JWT_PATTERN.get_or_init(|| {
250            // JWT: three base64url-encoded parts separated by dots
251            Regex::new(r"eyJ[a-zA-Z0-9_-]*\.eyJ[a-zA-Z0-9_-]*\.[a-zA-Z0-9_-]*").unwrap()
252        });
253
254        if regex.is_match(content) {
255            patterns.push("JWT token".to_string());
256            *redacted = regex.replace_all(redacted, "[REDACTED_JWT]").to_string();
257        }
258    }
259
260    /// Check for database connection URLs.
261    fn check_database_urls(
262        &self,
263        content: &str,
264        patterns: &mut Vec<String>,
265        redacted: &mut String,
266    ) {
267        static DB_PATTERNS: OnceLock<Vec<(Regex, &'static str)>> = OnceLock::new();
268        let regexes = DB_PATTERNS.get_or_init(|| {
269            vec![
270                (
271                    Regex::new(r"postgres(ql)?://[^:]+:[^@]+@[^\s]+").unwrap(),
272                    "PostgreSQL connection URL",
273                ),
274                (
275                    Regex::new(r"mysql://[^:]+:[^@]+@[^\s]+").unwrap(),
276                    "MySQL connection URL",
277                ),
278                (
279                    Regex::new(r"mongodb(\+srv)?://[^:]+:[^@]+@[^\s]+").unwrap(),
280                    "MongoDB connection URL",
281                ),
282                (
283                    Regex::new(r"redis://[^:]+:[^@]+@[^\s]+").unwrap(),
284                    "Redis connection URL",
285                ),
286            ]
287        });
288
289        for (regex, name) in regexes {
290            if regex.is_match(content) {
291                patterns.push(String::from(*name));
292                *redacted = regex
293                    .replace_all(redacted, "[REDACTED_DATABASE_URL]")
294                    .to_string();
295            }
296        }
297    }
298
299    /// Check for high-entropy tokens that may be leaked credentials.
300    ///
301    /// Extracts candidate tokens from content (after stripping URLs to avoid
302    /// false-positives on path segments) and flags any that exceed the Shannon
303    /// entropy threshold derived from the detector's sensitivity.
304    fn check_high_entropy_tokens(
305        &self,
306        content: &str,
307        patterns: &mut Vec<String>,
308        redacted: &mut String,
309    ) {
310        // Entropy threshold scales with sensitivity: at 0.7 this is ~4.37.
311        let entropy_threshold = 3.5 + self.sensitivity * 1.25;
312
313        // Strip URLs and media markers before extracting tokens so that path
314        // segments are not mistaken for high-entropy credentials.
315        // Media markers like [IMAGE:/path/to/file.png] contain filesystem paths
316        // that look like high-entropy tokens when `/` is included in the token
317        // character set.
318        static URL_PATTERN: OnceLock<Regex> = OnceLock::new();
319        let url_re = URL_PATTERN.get_or_init(|| Regex::new(r"https?://\S+").unwrap());
320        static MEDIA_MARKER_PATTERN: OnceLock<Regex> = OnceLock::new();
321        let media_re = MEDIA_MARKER_PATTERN.get_or_init(|| {
322            Regex::new(r"\[(IMAGE|VIDEO|VOICE|AUDIO|DOCUMENT|FILE):[^\]]*\]").unwrap()
323        });
324        // Tool receipts (zc-receipt-...) are runtime-generated HMAC tokens that
325        // intentionally appear in output. Strip them before entropy scanning so
326        // they are not redacted as leaked credentials.
327        static RECEIPT_PATTERN: OnceLock<Regex> = OnceLock::new();
328        let receipt_re =
329            RECEIPT_PATTERN.get_or_init(|| Regex::new(r"zc-receipt-\d+-[A-Za-z0-9_-]+").unwrap());
330        let content_stripped = url_re.replace_all(content, "");
331        let content_without_urls = media_re.replace_all(&content_stripped, "");
332        let content_without_receipts = receipt_re.replace_all(&content_without_urls, "");
333
334        let tokens = extract_candidate_tokens(&content_without_receipts);
335
336        for token in tokens {
337            if token.len() >= ENTROPY_TOKEN_MIN_LEN {
338                let entropy = shannon_entropy(token);
339                if entropy >= entropy_threshold && has_mixed_alpha_digit(token) {
340                    patterns.push("High-entropy token".to_string());
341                    *redacted = redacted.replace(token, "[REDACTED_HIGH_ENTROPY_TOKEN]");
342                }
343            }
344        }
345    }
346}
347
/// Split `content` into candidate credential tokens.
///
/// A token is a maximal run of ASCII letters, ASCII digits, `_`, `-`, `+` and
/// `/`; every other character acts as a separator. Empty pieces between
/// adjacent separators are dropped.
fn extract_candidate_tokens(content: &str) -> Vec<&str> {
    let is_token_char =
        |ch: char| ch.is_ascii_alphanumeric() || matches!(ch, '_' | '-' | '+' | '/');

    let mut tokens = Vec::new();
    for piece in content.split(|ch: char| !is_token_char(ch)) {
        if !piece.is_empty() {
            tokens.push(piece);
        }
    }
    tokens
}
356
/// Shannon entropy of `s` in bits per byte, computed over its byte frequencies.
///
/// Returns `0.0` for the empty string.
fn shannon_entropy(s: &str) -> f64 {
    if s.is_empty() {
        return 0.0;
    }
    let total = s.len() as f64;

    // Count how often each byte value occurs.
    let mut counts: HashMap<u8, usize> = HashMap::new();
    for byte in s.bytes() {
        *counts.entry(byte).or_default() += 1;
    }

    // H = -sum(p * log2(p)) over the observed byte values.
    let mut entropy = 0.0;
    for &occurrences in counts.values() {
        let p = occurrences as f64 / total;
        entropy -= p * p.log2();
    }
    entropy
}
372
/// Returns `true` when `s` contains at least one ASCII letter and at least one
/// ASCII digit.
fn has_mixed_alpha_digit(s: &str) -> bool {
    let mut seen_alpha = false;
    let mut seen_digit = false;
    for byte in s.bytes() {
        seen_alpha |= byte.is_ascii_alphabetic();
        seen_digit |= byte.is_ascii_digit();
        if seen_alpha && seen_digit {
            return true;
        }
    }
    false
}
379
#[cfg(test)]
mod tests {
    use super::*;

    /// Scan `content` and return `(patterns, redacted)`, panicking with
    /// `clean_msg` when the detector reports nothing.
    fn expect_detected(
        detector: &LeakDetector,
        content: &str,
        clean_msg: &str,
    ) -> (Vec<String>, String) {
        match detector.scan(content) {
            LeakResult::Detected { patterns, redacted } => (patterns, redacted),
            LeakResult::Clean => panic!("{}", clean_msg),
        }
    }

    /// Assert that scanning `content` reports nothing, failing with `failure_msg`.
    fn expect_clean(detector: &LeakDetector, content: &str, failure_msg: &str) {
        let result = detector.scan(content);
        assert!(matches!(result, LeakResult::Clean), "{}", failure_msg);
    }

    /// True when any reported pattern name contains `needle`.
    fn mentions(patterns: &[String], needle: &str) -> bool {
        patterns.iter().any(|p| p.contains(needle))
    }

    #[test]
    fn clean_content_passes() {
        let result = LeakDetector::new().scan("This is just some normal text");
        assert!(matches!(result, LeakResult::Clean));
    }

    #[test]
    fn detects_stripe_keys() {
        let (patterns, redacted) = expect_detected(
            &LeakDetector::new(),
            "My Stripe key is sk_test_1234567890abcdefghijklmnop",
            "Should detect Stripe key",
        );
        assert!(mentions(&patterns, "Stripe"));
        assert!(redacted.contains("[REDACTED"));
    }

    #[test]
    fn detects_aws_credentials() {
        let (patterns, _) = expect_detected(
            &LeakDetector::new(),
            "AWS key: AKIAIOSFODNN7EXAMPLE",
            "Should detect AWS key",
        );
        assert!(mentions(&patterns, "AWS"));
    }

    #[test]
    fn detects_groq_api_keys() {
        let (patterns, redacted) = expect_detected(
            &LeakDetector::new(),
            "Groq key: gsk_abcdefghijklmnopqrstuvwxyz123456",
            "Should detect Groq API key",
        );
        assert!(mentions(&patterns, "Groq"));
        assert!(redacted.contains("[REDACTED"));
        assert!(!redacted.contains("gsk_abcdefghijklmnopqrstuvwxyz123456"));
    }

    #[test]
    fn detects_private_keys() {
        let content = "\n-----BEGIN RSA PRIVATE KEY-----\nMIIEowIBAAKCAQEA0ZPr5JeyVDonXsKhfq...\n-----END RSA PRIVATE KEY-----\n";
        let (patterns, redacted) =
            expect_detected(&LeakDetector::new(), content, "Should detect private key");
        assert!(mentions(&patterns, "private key"));
        assert!(redacted.contains("[REDACTED_PRIVATE_KEY]"));
    }

    #[test]
    fn detects_jwt_tokens() {
        let content = "Bearer eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIn0.dozjgNryP4J3jVmNHl0w5N_XgL0n3I9PlFUP0THsR8U";
        let (patterns, redacted) =
            expect_detected(&LeakDetector::new(), content, "Should detect JWT");
        assert!(mentions(&patterns, "JWT"));
        assert!(redacted.contains("[REDACTED_JWT]"));
    }

    #[test]
    fn detects_database_urls() {
        let (patterns, _) = expect_detected(
            &LeakDetector::new(),
            "DATABASE_URL=postgres://user:secretpassword@localhost:5432/mydb",
            "Should detect database URL",
        );
        assert!(mentions(&patterns, "PostgreSQL"));
    }

    #[test]
    fn low_sensitivity_skips_generic() {
        // Generic secret patterns are only reported above a sensitivity of 0.5.
        let result = LeakDetector::with_sensitivity(0.3).scan("secret=mygenericvalue123456");
        assert!(matches!(result, LeakResult::Clean));
    }

    #[test]
    fn url_path_segments_not_flagged() {
        // The long mixed-alphanumeric path segment sits inside a URL, which is
        // stripped before entropy scanning.
        expect_clean(
            &LeakDetector::new(),
            "See https://example.org/documents/2024-report-a1b2c3d4e5f6g7h8i9j0.pdf for details",
            "URL path segments should not trigger high-entropy detection",
        );
    }

    #[test]
    fn url_with_long_path_not_redacted() {
        expect_clean(
            &LeakDetector::new(),
            "Reference: https://gov.example.com/publications/research/2024-annual-fiscal-policy-review-9a8b7c6d5e4f3g2h1i0j.html",
            "Long URL paths should not be redacted",
        );
    }

    #[test]
    fn tool_receipts_not_redacted_as_high_entropy() {
        expect_clean(
            &LeakDetector::new(),
            "The date is Fri Mar 27.\n\n[receipt: zc-receipt-1774608496-gzpEBuUIRYX1vd4fQl4oYkqhq4-GnoJDStmlYzvQiWA]",
            "Tool receipts (zc-receipt-...) should not be redacted",
        );
    }

    #[test]
    fn media_markers_not_redacted_as_high_entropy() {
        expect_clean(
            &LeakDetector::new(),
            "Here is the image: [IMAGE:/Users/matt/.zeroclaw/workspace/skills/image-gen/images/20260324_135911.png]",
            "Local media markers should not be redacted",
        );
    }

    #[test]
    fn detects_high_entropy_token_outside_url() {
        // A standalone high-entropy token (not inside a URL) must still be caught.
        let (patterns, redacted) = expect_detected(
            &LeakDetector::new(),
            "Found credential: aB3xK9mW2pQ7vL4nR8sT1yU6hD0jF5cG",
            "Should detect high-entropy token",
        );
        assert!(mentions(&patterns, "High-entropy"));
        assert!(redacted.contains("[REDACTED_HIGH_ENTROPY_TOKEN]"));
    }

    #[test]
    fn low_sensitivity_raises_entropy_threshold() {
        // The token cycles through only four distinct characters, i.e. about
        // 2 bits of entropy per character, so it must not be flagged as a
        // high-entropy token. Generic secret patterns are off at 0.3.
        expect_clean(
            &LeakDetector::with_sensitivity(0.3),
            "token found: ab12ab12ab12ab12ab12ab12ab12ab12",
            "Low-entropy repetitive tokens should not be flagged",
        );
    }

    #[test]
    fn extract_candidate_tokens_splits_correctly() {
        let tokens = extract_candidate_tokens("foo.bar:baz qux-quux key=val");
        // '.', ':', ' ' and '=' are delimiters; '-' is part of a token.
        for expected in ["foo", "bar", "baz", "qux-quux", "key", "val"] {
            assert!(tokens.contains(&expected));
        }
    }

    #[test]
    fn media_marker_image_path_not_redacted() {
        expect_clean(
            &LeakDetector::new(),
            "Here is your image: [IMAGE:/Users/matt/.zeroclaw/workspace/skills/image-gen/images/20260324_135911.png]",
            "Media marker image paths should not trigger high-entropy detection",
        );
    }

    #[test]
    fn media_marker_video_not_redacted() {
        expect_clean(
            &LeakDetector::new(),
            "Attached: [VIDEO:/path/to/long/video/file/name123456.mp4]",
            "Media marker video paths should not trigger high-entropy detection",
        );
    }

    #[test]
    fn actual_high_entropy_still_detected() {
        let (patterns, redacted) = expect_detected(
            &LeakDetector::new(),
            "Leaked credential: aB3xK9mW2pQ7vL4nR8sT1yU6hD0jF5cG",
            "Should still detect high-entropy tokens outside media markers",
        );
        assert!(mentions(&patterns, "High-entropy"));
        assert!(redacted.contains("[REDACTED_HIGH_ENTROPY_TOKEN]"));
    }

    #[test]
    fn shannon_entropy_empty_string() {
        assert_eq!(shannon_entropy(""), 0.0);
    }

    #[test]
    fn shannon_entropy_single_char() {
        // A single repeated character carries no information.
        assert_eq!(shannon_entropy("aaaa"), 0.0);
    }

    #[test]
    fn shannon_entropy_two_equal_chars() {
        // Two equally likely characters: exactly 1 bit per character.
        let entropy = shannon_entropy("abab");
        assert!((entropy - 1.0).abs() < 0.001);
    }
}