1use regex::Regex;
10use std::collections::HashMap;
11use std::sync::OnceLock;
12
/// Minimum length, in bytes, that a candidate token must reach before the
/// high-entropy check computes its Shannon entropy.
const ENTROPY_TOKEN_MIN_LEN: usize = 24;
15
/// Outcome of scanning a piece of content for leaked credentials.
///
/// Derives `PartialEq`/`Eq` so results can be compared directly (for example
/// with `assert_eq!`) instead of only via pattern matching.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum LeakResult {
    /// No detection pattern matched the scanned content.
    Clean,
    /// At least one detection pattern matched.
    Detected {
        /// Human-readable names of the patterns that matched, in detection order.
        patterns: Vec<String>,
        /// Copy of the scanned content with matched spans replaced by
        /// `[REDACTED_...]` placeholders.
        redacted: String,
    },
}
29
/// Scans text for credential-like content and produces a redacted copy.
///
/// Derives `PartialEq` so two detectors can be compared by configuration.
#[derive(Debug, Clone, PartialEq)]
pub struct LeakDetector {
    /// Detection sensitivity. The generic secret patterns only run when this
    /// is above `0.5`, and the high-entropy threshold is derived from it.
    sensitivity: f64,
}
36
37impl Default for LeakDetector {
38 fn default() -> Self {
39 Self::new()
40 }
41}
42
43impl LeakDetector {
44 pub fn new() -> Self {
46 Self { sensitivity: 0.7 }
47 }
48
49 pub fn with_sensitivity(sensitivity: f64) -> Self {
51 Self {
52 sensitivity: sensitivity.clamp(0.0, 1.0),
53 }
54 }
55
56 pub fn scan(&self, content: &str) -> LeakResult {
58 let mut patterns = Vec::new();
59 let mut redacted = content.to_string();
60
61 self.check_api_keys(content, &mut patterns, &mut redacted);
63 self.check_aws_credentials(content, &mut patterns, &mut redacted);
64 self.check_generic_secrets(content, &mut patterns, &mut redacted);
65 self.check_private_keys(content, &mut patterns, &mut redacted);
66 self.check_jwt_tokens(content, &mut patterns, &mut redacted);
67 self.check_database_urls(content, &mut patterns, &mut redacted);
68 self.check_high_entropy_tokens(content, &mut patterns, &mut redacted);
69
70 if patterns.is_empty() {
71 LeakResult::Clean
72 } else {
73 LeakResult::Detected { patterns, redacted }
74 }
75 }
76
77 fn check_api_keys(&self, content: &str, patterns: &mut Vec<String>, redacted: &mut String) {
79 static API_KEY_PATTERNS: OnceLock<Vec<(Regex, &'static str)>> = OnceLock::new();
80 let regexes = API_KEY_PATTERNS.get_or_init(|| {
81 vec![
82 (
84 Regex::new(r"sk_(live|test)_[a-zA-Z0-9]{24,}").unwrap(),
85 "Stripe secret key",
86 ),
87 (
88 Regex::new(r"pk_(live|test)_[a-zA-Z0-9]{24,}").unwrap(),
89 "Stripe publishable key",
90 ),
91 (
93 Regex::new(r"sk-[a-zA-Z0-9]{20,}T3BlbkFJ[a-zA-Z0-9]{20,}").unwrap(),
94 "OpenAI API key",
95 ),
96 (
97 Regex::new(r"sk-[a-zA-Z0-9]{48,}").unwrap(),
98 "OpenAI-style API key",
99 ),
100 (
102 Regex::new(r"sk-ant-[a-zA-Z0-9-_]{32,}").unwrap(),
103 "Anthropic API key",
104 ),
105 (Regex::new(r"gsk_[a-zA-Z0-9]{20,}").unwrap(), "Groq API key"),
107 (
109 Regex::new(r"AIza[a-zA-Z0-9_-]{35}").unwrap(),
110 "Google API key",
111 ),
112 (
114 Regex::new(r"gh[pousr]_[a-zA-Z0-9]{36,}").unwrap(),
115 "GitHub token",
116 ),
117 (
118 Regex::new(r"github_pat_[a-zA-Z0-9_]{22,}").unwrap(),
119 "GitHub PAT",
120 ),
121 (
123 Regex::new(r#"api[_-]?key[=:]\s*['"]*[a-zA-Z0-9_-]{20,}"#).unwrap(),
124 "Generic API key",
125 ),
126 ]
127 });
128
129 for (regex, name) in regexes {
130 if regex.is_match(content) {
131 patterns.push(String::from(*name));
132 *redacted = regex
133 .replace_all(redacted, "[REDACTED_API_KEY]")
134 .to_string();
135 }
136 }
137 }
138
139 fn check_aws_credentials(
141 &self,
142 content: &str,
143 patterns: &mut Vec<String>,
144 redacted: &mut String,
145 ) {
146 static AWS_PATTERNS: OnceLock<Vec<(Regex, &'static str)>> = OnceLock::new();
147 let regexes = AWS_PATTERNS.get_or_init(|| {
148 vec![
149 (
150 Regex::new(r"AKIA[A-Z0-9]{16}").unwrap(),
151 "AWS Access Key ID",
152 ),
153 (
154 Regex::new(
155 r#"aws[_-]?secret[_-]?access[_-]?key[=:]\s*['"]*[a-zA-Z0-9/+=]{40}"#,
156 )
157 .unwrap(),
158 "AWS Secret Access Key",
159 ),
160 ]
161 });
162
163 for (regex, name) in regexes {
164 if regex.is_match(content) {
165 patterns.push(String::from(*name));
166 *redacted = regex
167 .replace_all(redacted, "[REDACTED_AWS_CREDENTIAL]")
168 .to_string();
169 }
170 }
171 }
172
173 fn check_generic_secrets(
175 &self,
176 content: &str,
177 patterns: &mut Vec<String>,
178 redacted: &mut String,
179 ) {
180 static SECRET_PATTERNS: OnceLock<Vec<(Regex, &'static str)>> = OnceLock::new();
181 let regexes = SECRET_PATTERNS.get_or_init(|| {
182 vec![
183 (
184 Regex::new(r#"(?i)password[=:]\s*['"]*[^\s'"]{8,}"#).unwrap(),
185 "Password in config",
186 ),
187 (
188 Regex::new(r#"(?i)secret[=:]\s*['"]*[a-zA-Z0-9_-]{16,}"#).unwrap(),
189 "Secret value",
190 ),
191 (
192 Regex::new(r#"(?i)token[=:]\s*['"]*[a-zA-Z0-9_.-]{20,}"#).unwrap(),
193 "Token value",
194 ),
195 ]
196 });
197
198 for (regex, name) in regexes {
199 if regex.is_match(content) && self.sensitivity > 0.5 {
200 patterns.push(String::from(*name));
201 *redacted = regex.replace_all(redacted, "[REDACTED_SECRET]").to_string();
202 }
203 }
204 }
205
206 fn check_private_keys(&self, content: &str, patterns: &mut Vec<String>, redacted: &mut String) {
208 let key_patterns = [
210 (
211 "-----BEGIN RSA PRIVATE KEY-----",
212 "-----END RSA PRIVATE KEY-----",
213 "RSA private key",
214 ),
215 (
216 "-----BEGIN EC PRIVATE KEY-----",
217 "-----END EC PRIVATE KEY-----",
218 "EC private key",
219 ),
220 (
221 "-----BEGIN PRIVATE KEY-----",
222 "-----END PRIVATE KEY-----",
223 "Private key",
224 ),
225 (
226 "-----BEGIN OPENSSH PRIVATE KEY-----",
227 "-----END OPENSSH PRIVATE KEY-----",
228 "OpenSSH private key",
229 ),
230 ];
231
232 for (begin, end, name) in key_patterns {
233 if content.contains(begin) && content.contains(end) {
234 patterns.push(name.to_string());
235 if let Some(start_idx) = content.find(begin)
237 && let Some(end_idx) = content.find(end)
238 {
239 let key_block = &content[start_idx..end_idx + end.len()];
240 *redacted = redacted.replace(key_block, "[REDACTED_PRIVATE_KEY]");
241 }
242 }
243 }
244 }
245
246 fn check_jwt_tokens(&self, content: &str, patterns: &mut Vec<String>, redacted: &mut String) {
248 static JWT_PATTERN: OnceLock<Regex> = OnceLock::new();
249 let regex = JWT_PATTERN.get_or_init(|| {
250 Regex::new(r"eyJ[a-zA-Z0-9_-]*\.eyJ[a-zA-Z0-9_-]*\.[a-zA-Z0-9_-]*").unwrap()
252 });
253
254 if regex.is_match(content) {
255 patterns.push("JWT token".to_string());
256 *redacted = regex.replace_all(redacted, "[REDACTED_JWT]").to_string();
257 }
258 }
259
260 fn check_database_urls(
262 &self,
263 content: &str,
264 patterns: &mut Vec<String>,
265 redacted: &mut String,
266 ) {
267 static DB_PATTERNS: OnceLock<Vec<(Regex, &'static str)>> = OnceLock::new();
268 let regexes = DB_PATTERNS.get_or_init(|| {
269 vec![
270 (
271 Regex::new(r"postgres(ql)?://[^:]+:[^@]+@[^\s]+").unwrap(),
272 "PostgreSQL connection URL",
273 ),
274 (
275 Regex::new(r"mysql://[^:]+:[^@]+@[^\s]+").unwrap(),
276 "MySQL connection URL",
277 ),
278 (
279 Regex::new(r"mongodb(\+srv)?://[^:]+:[^@]+@[^\s]+").unwrap(),
280 "MongoDB connection URL",
281 ),
282 (
283 Regex::new(r"redis://[^:]+:[^@]+@[^\s]+").unwrap(),
284 "Redis connection URL",
285 ),
286 ]
287 });
288
289 for (regex, name) in regexes {
290 if regex.is_match(content) {
291 patterns.push(String::from(*name));
292 *redacted = regex
293 .replace_all(redacted, "[REDACTED_DATABASE_URL]")
294 .to_string();
295 }
296 }
297 }
298
299 fn check_high_entropy_tokens(
305 &self,
306 content: &str,
307 patterns: &mut Vec<String>,
308 redacted: &mut String,
309 ) {
310 let entropy_threshold = 3.5 + self.sensitivity * 1.25;
312
313 static URL_PATTERN: OnceLock<Regex> = OnceLock::new();
319 let url_re = URL_PATTERN.get_or_init(|| Regex::new(r"https?://\S+").unwrap());
320 static MEDIA_MARKER_PATTERN: OnceLock<Regex> = OnceLock::new();
321 let media_re = MEDIA_MARKER_PATTERN.get_or_init(|| {
322 Regex::new(r"\[(IMAGE|VIDEO|VOICE|AUDIO|DOCUMENT|FILE):[^\]]*\]").unwrap()
323 });
324 static RECEIPT_PATTERN: OnceLock<Regex> = OnceLock::new();
328 let receipt_re =
329 RECEIPT_PATTERN.get_or_init(|| Regex::new(r"zc-receipt-\d+-[A-Za-z0-9_-]+").unwrap());
330 let content_stripped = url_re.replace_all(content, "");
331 let content_without_urls = media_re.replace_all(&content_stripped, "");
332 let content_without_receipts = receipt_re.replace_all(&content_without_urls, "");
333
334 let tokens = extract_candidate_tokens(&content_without_receipts);
335
336 for token in tokens {
337 if token.len() >= ENTROPY_TOKEN_MIN_LEN {
338 let entropy = shannon_entropy(token);
339 if entropy >= entropy_threshold && has_mixed_alpha_digit(token) {
340 patterns.push("High-entropy token".to_string());
341 *redacted = redacted.replace(token, "[REDACTED_HIGH_ENTROPY_TOKEN]");
342 }
343 }
344 }
345 }
346}
347
/// Splits `content` into candidate credential tokens.
///
/// A token is a maximal run of ASCII letters, ASCII digits, `_`, `-`, `+`
/// and `/`; every other character acts as a separator. Empty pieces are
/// dropped and tokens are returned in order of appearance.
fn extract_candidate_tokens(content: &str) -> Vec<&str> {
    let is_token_char =
        |c: char| c.is_ascii_alphanumeric() || matches!(c, '_' | '-' | '+' | '/');

    let mut tokens = Vec::new();
    for piece in content.split(|c: char| !is_token_char(c)) {
        if !piece.is_empty() {
            tokens.push(piece);
        }
    }
    tokens
}
356
/// Computes the Shannon entropy of `s` in bits per byte.
///
/// Frequencies are counted over the UTF-8 bytes of `s`. An empty string has
/// an entropy of `0.0`.
fn shannon_entropy(s: &str) -> f64 {
    if s.is_empty() {
        return 0.0;
    }

    let total = s.len() as f64;
    let mut counts: HashMap<u8, usize> = HashMap::new();
    for byte in s.bytes() {
        *counts.entry(byte).or_default() += 1;
    }

    let mut entropy = 0.0;
    for &occurrences in counts.values() {
        let p = occurrences as f64 / total;
        entropy -= p * p.log2();
    }
    entropy
}
372
/// Returns `true` when `s` contains at least one ASCII letter and at least
/// one ASCII digit.
fn has_mixed_alpha_digit(s: &str) -> bool {
    let mut seen_letter = false;
    let mut seen_digit = false;
    for byte in s.bytes() {
        seen_letter |= byte.is_ascii_alphabetic();
        seen_digit |= byte.is_ascii_digit();
        if seen_letter && seen_digit {
            return true;
        }
    }
    false
}
379
#[cfg(test)]
mod tests {
    use super::*;

    /// Scans `content` with a default detector and returns
    /// `(patterns, redacted)`, panicking with `failure` when nothing is found.
    fn expect_detection(content: &str, failure: &str) -> (Vec<String>, String) {
        match LeakDetector::new().scan(content) {
            LeakResult::Detected { patterns, redacted } => (patterns, redacted),
            LeakResult::Clean => panic!("{failure}"),
        }
    }

    /// Returns `true` when `detector` reports `content` as clean.
    fn is_clean(detector: &LeakDetector, content: &str) -> bool {
        matches!(detector.scan(content), LeakResult::Clean)
    }

    /// Returns `true` when any reported pattern name contains `needle`.
    fn names_pattern(patterns: &[String], needle: &str) -> bool {
        patterns.iter().any(|p| p.contains(needle))
    }

    #[test]
    fn clean_content_passes() {
        assert!(is_clean(
            &LeakDetector::new(),
            "This is just some normal text"
        ));
    }

    #[test]
    fn detects_stripe_keys() {
        let (patterns, redacted) = expect_detection(
            "My Stripe key is sk_test_1234567890abcdefghijklmnop",
            "Should detect Stripe key",
        );
        assert!(names_pattern(&patterns, "Stripe"));
        assert!(redacted.contains("[REDACTED"));
    }

    #[test]
    fn detects_aws_credentials() {
        let (patterns, _) =
            expect_detection("AWS key: AKIAIOSFODNN7EXAMPLE", "Should detect AWS key");
        assert!(names_pattern(&patterns, "AWS"));
    }

    #[test]
    fn detects_groq_api_keys() {
        let (patterns, redacted) = expect_detection(
            "Groq key: gsk_abcdefghijklmnopqrstuvwxyz123456",
            "Should detect Groq API key",
        );
        assert!(names_pattern(&patterns, "Groq"));
        assert!(redacted.contains("[REDACTED"));
        assert!(!redacted.contains("gsk_abcdefghijklmnopqrstuvwxyz123456"));
    }

    #[test]
    fn detects_private_keys() {
        let content = r#"
-----BEGIN RSA PRIVATE KEY-----
MIIEowIBAAKCAQEA0ZPr5JeyVDonXsKhfq...
-----END RSA PRIVATE KEY-----
"#;
        let (patterns, redacted) = expect_detection(content, "Should detect private key");
        assert!(names_pattern(&patterns, "private key"));
        assert!(redacted.contains("[REDACTED_PRIVATE_KEY]"));
    }

    #[test]
    fn detects_jwt_tokens() {
        let (patterns, redacted) = expect_detection(
            "Bearer eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIn0.dozjgNryP4J3jVmNHl0w5N_XgL0n3I9PlFUP0THsR8U",
            "Should detect JWT",
        );
        assert!(names_pattern(&patterns, "JWT"));
        assert!(redacted.contains("[REDACTED_JWT]"));
    }

    #[test]
    fn detects_database_urls() {
        let (patterns, _) = expect_detection(
            "DATABASE_URL=postgres://user:secretpassword@localhost:5432/mydb",
            "Should detect database URL",
        );
        assert!(names_pattern(&patterns, "PostgreSQL"));
    }

    #[test]
    fn low_sensitivity_skips_generic() {
        let detector = LeakDetector::with_sensitivity(0.3);
        assert!(is_clean(&detector, "secret=mygenericvalue123456"));
    }

    #[test]
    fn url_path_segments_not_flagged() {
        let content =
            "See https://example.org/documents/2024-report-a1b2c3d4e5f6g7h8i9j0.pdf for details";
        assert!(
            is_clean(&LeakDetector::new(), content),
            "URL path segments should not trigger high-entropy detection"
        );
    }

    #[test]
    fn url_with_long_path_not_redacted() {
        let content = "Reference: https://gov.example.com/publications/research/2024-annual-fiscal-policy-review-9a8b7c6d5e4f3g2h1i0j.html";
        assert!(
            is_clean(&LeakDetector::new(), content),
            "Long URL paths should not be redacted"
        );
    }

    #[test]
    fn tool_receipts_not_redacted_as_high_entropy() {
        let content = "The date is Fri Mar 27.\n\n[receipt: zc-receipt-1774608496-gzpEBuUIRYX1vd4fQl4oYkqhq4-GnoJDStmlYzvQiWA]";
        assert!(
            is_clean(&LeakDetector::new(), content),
            "Tool receipts (zc-receipt-...) should not be redacted"
        );
    }

    #[test]
    fn media_markers_not_redacted_as_high_entropy() {
        let content = "Here is the image: [IMAGE:/Users/matt/.zeroclaw/workspace/skills/image-gen/images/20260324_135911.png]";
        assert!(
            is_clean(&LeakDetector::new(), content),
            "Local media markers should not be redacted"
        );
    }

    #[test]
    fn detects_high_entropy_token_outside_url() {
        let (patterns, redacted) = expect_detection(
            "Found credential: aB3xK9mW2pQ7vL4nR8sT1yU6hD0jF5cG",
            "Should detect high-entropy token",
        );
        assert!(names_pattern(&patterns, "High-entropy"));
        assert!(redacted.contains("[REDACTED_HIGH_ENTROPY_TOKEN]"));
    }

    #[test]
    fn low_sensitivity_raises_entropy_threshold() {
        let detector = LeakDetector::with_sensitivity(0.3);
        assert!(
            is_clean(&detector, "token found: ab12ab12ab12ab12ab12ab12ab12ab12"),
            "Low-entropy repetitive tokens should not be flagged"
        );
    }

    #[test]
    fn extract_candidate_tokens_splits_correctly() {
        let tokens = extract_candidate_tokens("foo.bar:baz qux-quux key=val");
        for expected in ["foo", "bar", "baz", "qux-quux", "key", "val"] {
            assert!(tokens.contains(&expected));
        }
    }

    #[test]
    fn media_marker_image_path_not_redacted() {
        let content = "Here is your image: [IMAGE:/Users/matt/.zeroclaw/workspace/skills/image-gen/images/20260324_135911.png]";
        assert!(
            is_clean(&LeakDetector::new(), content),
            "Media marker image paths should not trigger high-entropy detection"
        );
    }

    #[test]
    fn media_marker_video_not_redacted() {
        let content = "Attached: [VIDEO:/path/to/long/video/file/name123456.mp4]";
        assert!(
            is_clean(&LeakDetector::new(), content),
            "Media marker video paths should not trigger high-entropy detection"
        );
    }

    #[test]
    fn actual_high_entropy_still_detected() {
        let (patterns, redacted) = expect_detection(
            "Leaked credential: aB3xK9mW2pQ7vL4nR8sT1yU6hD0jF5cG",
            "Should still detect high-entropy tokens outside media markers",
        );
        assert!(names_pattern(&patterns, "High-entropy"));
        assert!(redacted.contains("[REDACTED_HIGH_ENTROPY_TOKEN]"));
    }

    #[test]
    fn shannon_entropy_empty_string() {
        assert_eq!(shannon_entropy(""), 0.0);
    }

    #[test]
    fn shannon_entropy_single_char() {
        assert_eq!(shannon_entropy("aaaa"), 0.0);
    }

    #[test]
    fn shannon_entropy_two_equal_chars() {
        let entropy = shannon_entropy("abab");
        assert!((entropy - 1.0).abs() < 0.001);
    }
}