Skip to main content

zeroclaw_tools/
pdf_read.rs

1use async_trait::async_trait;
2use serde_json::json;
3use std::sync::Arc;
4use zeroclaw_api::tool::{Tool, ToolResult};
5use zeroclaw_config::policy::SecurityPolicy;
6
/// Largest PDF, in bytes, that the tool will load (50 MiB).
const MAX_PDF_BYTES: u64 = 50 * (1 << 20);
/// Number of characters returned when the caller does not pass `max_chars`.
const DEFAULT_MAX_CHARS: usize = 50_000;
/// Upper bound applied to any caller-supplied `max_chars`.
const MAX_OUTPUT_CHARS: usize = 200_000;
13
14/// Extract plain text from a PDF file in the workspace.
15///
16/// PDF extraction requires the `rag-pdf` feature flag:
17///   cargo build --features rag-pdf
18///
19/// Without the feature the tool is still registered so the LLM receives a
20/// clear, actionable error rather than a missing-tool confusion.
21pub struct PdfReadTool {
22    security: Arc<SecurityPolicy>,
23}
24
25impl PdfReadTool {
26    pub fn new(security: Arc<SecurityPolicy>) -> Self {
27        Self { security }
28    }
29}
30
31#[async_trait]
32impl Tool for PdfReadTool {
33    fn name(&self) -> &str {
34        "pdf_read"
35    }
36
37    fn description(&self) -> &str {
38        "Extract plain text from a PDF file in the workspace. \
39         Returns all readable text. Image-only or encrypted PDFs return an empty result. \
40         Requires the 'rag-pdf' build feature."
41    }
42
43    fn parameters_schema(&self) -> serde_json::Value {
44        json!({
45            "type": "object",
46            "properties": {
47                "path": {
48                    "type": "string",
49                    "description": "Path to the PDF file. Relative paths resolve from workspace; outside paths require policy allowlist."
50                },
51                "max_chars": {
52                    "type": "integer",
53                    "description": "Maximum characters to return (default: 50000, max: 200000)",
54                    "minimum": 1,
55                    "maximum": 200_000
56                }
57            },
58            "required": ["path"]
59        })
60    }
61
62    async fn execute(&self, args: serde_json::Value) -> anyhow::Result<ToolResult> {
63        let path = args.get("path").and_then(|v| v.as_str()).ok_or_else(|| {
64            ::zeroclaw_log::record!(
65                WARN,
66                ::zeroclaw_log::Event::new(module_path!(), ::zeroclaw_log::Action::Reject)
67                    .with_outcome(::zeroclaw_log::EventOutcome::Failure)
68                    .with_attrs(::serde_json::json!({"param": "path"})),
69                "pdf_read: missing path parameter"
70            );
71            anyhow::Error::msg("Missing 'path' parameter")
72        })?;
73
74        let max_chars = args
75            .get("max_chars")
76            .and_then(|v| v.as_u64())
77            .map(|n| {
78                usize::try_from(n)
79                    .unwrap_or(MAX_OUTPUT_CHARS)
80                    .min(MAX_OUTPUT_CHARS)
81            })
82            .unwrap_or(DEFAULT_MAX_CHARS);
83
84        // Cross-cutting rate limiting and path-allowlist checks live in the
85        // RateLimitedTool + PathGuardedTool wrappers at registration time
86        // (see zeroclaw-runtime::tools::mod).  Successful reads consume one
87        // budget slot via the outer RateLimitedTool.
88        //
89        // Read-tool exception: post-`PathGuardedTool` canonicalize failures
90        // (probing nonexistent files) and post-canonicalization policy
91        // failures (`is_resolved_path_allowed`) also consume one budget slot,
92        // charged here, so that callers cannot probe path existence or
93        // resolved-path policy decisions for free.  The outer wrapper only
94        // records on `success: true`, so these explicit charges total
95        // exactly one slot per attempt — matching the pre-wrapper semantics.
96
97        let full_path = self.security.resolve_tool_path(path);
98
99        let resolved_path = match tokio::fs::canonicalize(&full_path).await {
100            Ok(p) => p,
101            Err(e) => {
102                let _ = self.security.record_action();
103                return Ok(ToolResult {
104                    success: false,
105                    output: String::new(),
106                    error: Some(format!("Failed to resolve file path: {e}")),
107                });
108            }
109        };
110
111        if !self.security.is_resolved_path_readable(&resolved_path) {
112            return Ok(ToolResult {
113                success: false,
114                output: String::new(),
115                error: Some(
116                    self.security
117                        .resolved_path_violation_message(&resolved_path),
118                ),
119            });
120        }
121
122        ::zeroclaw_log::record!(
123            DEBUG,
124            ::zeroclaw_log::Event::new(module_path!(), ::zeroclaw_log::Action::Note),
125            &format!("Reading PDF: {}", resolved_path.display())
126        );
127
128        match tokio::fs::metadata(&resolved_path).await {
129            Ok(meta) => {
130                if meta.len() > MAX_PDF_BYTES {
131                    return Ok(ToolResult {
132                        success: false,
133                        output: String::new(),
134                        error: Some(format!(
135                            "PDF too large: {} bytes (limit: {MAX_PDF_BYTES} bytes)",
136                            meta.len()
137                        )),
138                    });
139                }
140            }
141            Err(e) => {
142                return Ok(ToolResult {
143                    success: false,
144                    output: String::new(),
145                    error: Some(format!("Failed to read file metadata: {e}")),
146                });
147            }
148        }
149
150        let bytes = match tokio::fs::read(&resolved_path).await {
151            Ok(b) => b,
152            Err(e) => {
153                return Ok(ToolResult {
154                    success: false,
155                    output: String::new(),
156                    error: Some(format!("Failed to read PDF file: {e}")),
157                });
158            }
159        };
160
161        // pdf_extract is a blocking CPU-bound operation; keep it off the async executor.
162        #[cfg(feature = "rag-pdf")]
163        {
164            let text = match tokio::task::spawn_blocking(move || {
165                pdf_extract::extract_text_from_mem(&bytes)
166            })
167            .await
168            {
169                Ok(Ok(t)) => t,
170                Ok(Err(e)) => {
171                    return Ok(ToolResult {
172                        success: false,
173                        output: String::new(),
174                        error: Some(format!("PDF extraction failed: {e}")),
175                    });
176                }
177                Err(e) => {
178                    return Ok(ToolResult {
179                        success: false,
180                        output: String::new(),
181                        error: Some(format!("PDF extraction task panicked: {e}")),
182                    });
183                }
184            };
185
186            if text.trim().is_empty() {
187                return Ok(ToolResult {
188                    success: true,
189                    // Agent dispatchers currently forward `error` only when `success=false`.
190                    // Keep this as successful execution and expose the warning in `output`.
191                    output: "PDF contains no extractable text (may be image-only or encrypted)"
192                        .into(),
193                    error: None,
194                });
195            }
196
197            let output = if text.chars().count() > max_chars {
198                let mut truncated: String = text.chars().take(max_chars).collect();
199                use std::fmt::Write as _;
200                let _ = write!(truncated, "\n\n... [truncated at {max_chars} chars]");
201                truncated
202            } else {
203                text
204            };
205
206            return Ok(ToolResult {
207                success: true,
208                output,
209                error: None,
210            });
211        }
212
213        #[cfg(not(feature = "rag-pdf"))]
214        {
215            let _ = bytes;
216            let _ = max_chars;
217            Ok(ToolResult {
218                success: false,
219                output: String::new(),
220                error: Some(
221                    "PDF extraction is not enabled. \
222                     Rebuild with: cargo build --features rag-pdf"
223                        .into(),
224                ),
225            })
226        }
227    }
228}
229
#[cfg(test)]
mod tests {
    use super::*;
    use crate::wrappers::{PathGuardedTool, RateLimitedTool};
    use tempfile::TempDir;
    use zeroclaw_config::autonomy::AutonomyLevel;
    use zeroclaw_config::policy::SecurityPolicy;

    /// Supervised policy rooted at `workspace`, otherwise default.
    fn test_security(workspace: std::path::PathBuf) -> Arc<SecurityPolicy> {
        let policy = SecurityPolicy {
            autonomy: AutonomyLevel::Supervised,
            workspace_dir: workspace,
            ..SecurityPolicy::default()
        };
        Arc::new(policy)
    }

    /// Same as `test_security`, with an explicit hourly action budget.
    fn test_security_with_limit(
        workspace: std::path::PathBuf,
        max_actions: u32,
    ) -> Arc<SecurityPolicy> {
        let policy = SecurityPolicy {
            autonomy: AutonomyLevel::Supervised,
            workspace_dir: workspace,
            max_actions_per_hour: max_actions,
            ..SecurityPolicy::default()
        };
        Arc::new(policy)
    }

    /// Builds `PdfReadTool` inside the `PathGuardedTool` + `RateLimitedTool`
    /// stack, mirroring the registration in `zeroclaw-runtime::tools::mod`.
    /// Use it for tests that exercise path-allowlist or rate-limit behavior.
    fn wrapped_tool(workspace: std::path::PathBuf) -> Box<dyn Tool> {
        let security = test_security(workspace);
        let guarded = PathGuardedTool::new(PdfReadTool::new(security.clone()), security.clone());
        Box::new(RateLimitedTool::new(guarded, security))
    }

    /// Error message of `result`, or `""` when there is none.
    fn error_text(result: &ToolResult) -> &str {
        result.error.as_deref().unwrap_or("")
    }

    #[test]
    fn name_is_pdf_read() {
        let reader = PdfReadTool::new(test_security(std::env::temp_dir()));
        assert_eq!(reader.name(), "pdf_read");
    }

    #[test]
    fn description_not_empty() {
        let reader = PdfReadTool::new(test_security(std::env::temp_dir()));
        assert!(!reader.description().is_empty());
    }

    #[test]
    fn schema_has_path_required() {
        let reader = PdfReadTool::new(test_security(std::env::temp_dir()));
        let schema = reader.parameters_schema();
        let properties = &schema["properties"];
        assert!(properties["path"].is_object());
        assert!(properties["max_chars"].is_object());
        let required = schema["required"].as_array().unwrap();
        assert!(required.contains(&json!("path")));
    }

    #[test]
    fn spec_matches_metadata() {
        let reader = PdfReadTool::new(test_security(std::env::temp_dir()));
        let spec = reader.spec();
        assert_eq!(spec.name, "pdf_read");
        assert!(spec.parameters.is_object());
    }

    #[tokio::test]
    async fn missing_path_param_returns_error() {
        let reader = PdfReadTool::new(test_security(std::env::temp_dir()));
        let outcome = reader.execute(json!({})).await;
        assert!(outcome.is_err());
        assert!(outcome.unwrap_err().to_string().contains("path"));
    }

    #[tokio::test]
    async fn absolute_path_is_blocked() {
        let tool = wrapped_tool(std::env::temp_dir());

        // A system file that lies outside any temp-dir workspace.
        #[cfg(unix)]
        let target = "/etc/passwd";
        #[cfg(windows)]
        let target = {
            let sysroot = std::env::var("SystemRoot").unwrap_or_else(|_| r"C:\Windows".to_string());
            std::path::PathBuf::from(sysroot).join(r"System32\drivers\etc\hosts")
        };

        let result = tool.execute(json!({"path": target})).await.unwrap();
        assert!(!result.success);
        assert!(
            error_text(&result).contains("Path blocked"),
            "expected 'Path blocked' error, got: {:?}",
            result.error
        );
    }

    #[tokio::test]
    async fn path_traversal_is_blocked() {
        let tmp = TempDir::new().unwrap();
        let tool = wrapped_tool(tmp.path().to_path_buf());
        let request = json!({"path": "../../../etc/passwd"});
        let result = tool.execute(request).await.unwrap();
        assert!(!result.success);
        assert!(
            error_text(&result).contains("Path blocked"),
            "expected 'Path blocked' error, got: {:?}",
            result.error
        );
    }

    #[tokio::test]
    async fn nonexistent_file_returns_error() {
        let tmp = TempDir::new().unwrap();
        let reader = PdfReadTool::new(test_security(tmp.path().to_path_buf()));
        let request = json!({"path": "does_not_exist.pdf"});
        let result = reader.execute(request).await.unwrap();
        assert!(!result.success);
        assert!(error_text(&result).contains("Failed to resolve"));
    }

    #[cfg(unix)]
    #[tokio::test]
    async fn symlink_escape_is_blocked() {
        use std::os::unix::fs::symlink;

        // Layout: <root>/workspace/link.pdf -> <root>/outside/secret.pdf
        let root = TempDir::new().unwrap();
        let workspace = root.path().join("workspace");
        let outside = root.path().join("outside");
        tokio::fs::create_dir_all(&workspace).await.unwrap();
        tokio::fs::create_dir_all(&outside).await.unwrap();
        tokio::fs::write(outside.join("secret.pdf"), b"%PDF-1.4 secret")
            .await
            .unwrap();
        symlink(outside.join("secret.pdf"), workspace.join("link.pdf")).unwrap();

        let reader = PdfReadTool::new(test_security(workspace));
        let result = reader.execute(json!({"path": "link.pdf"})).await.unwrap();
        assert!(!result.success);
        assert!(error_text(&result).contains("escapes workspace"));
    }

    /// Extraction tests require the rag-pdf feature.
    #[cfg(feature = "rag-pdf")]
    mod extraction {
        use super::*;

        /// Minimal valid PDF with one text page ("Hello PDF").
        /// Generated offline and verified with pdf-extract 0.10.
        fn minimal_pdf_bytes() -> Vec<u8> {
            // Hand-crafted objects for a single page showing "Hello PDF".
            let body = b"%PDF-1.4\n\
                1 0 obj<</Type/Catalog/Pages 2 0 R>>endobj\n\
                2 0 obj<</Type/Pages/Kids[3 0 R]/Count 1>>endobj\n\
                3 0 obj<</Type/Page/MediaBox[0 0 612 792]/Parent 2 0 R\
                /Contents 4 0 R/Resources<</Font<</F1 5 0 R>>>>>>endobj\n\
                4 0 obj<</Length 44>>\nstream\n\
                BT /F1 12 Tf 72 720 Td (Hello PDF) Tj ET\n\
                endstream\nendobj\n\
                5 0 obj<</Type/Font/Subtype/Type1/BaseFont/Helvetica>>endobj\n";

            // The cross-reference table starts right after the objects.
            let xref_offset = body.len();

            let xref = format!(
                "xref\n0 6\n\
                 0000000000 65535 f \n\
                 0000000009 00000 n \n\
                 0000000058 00000 n \n\
                 0000000115 00000 n \n\
                 0000000274 00000 n \n\
                 0000000370 00000 n \n\
                 trailer<</Size 6/Root 1 0 R>>\n\
                 startxref\n{xref_offset}\n%%EOF\n"
            );

            [body.as_slice(), xref.as_bytes()].concat()
        }

        #[tokio::test]
        async fn extracts_text_from_valid_pdf() {
            let tmp = TempDir::new().unwrap();
            let pdf_path = tmp.path().join("test.pdf");
            tokio::fs::write(&pdf_path, minimal_pdf_bytes())
                .await
                .unwrap();

            let reader = PdfReadTool::new(test_security(tmp.path().to_path_buf()));
            let result = reader.execute(json!({"path": "test.pdf"})).await.unwrap();

            // Either successfully extracts text, or reports no extractable text
            // (acceptable: minimal hand-crafted PDFs may not parse perfectly).
            let reports_no_text = error_text(&result).contains("no extractable");
            assert!(result.success || reports_no_text);
        }

        #[tokio::test]
        async fn max_chars_truncates_output() {
            let tmp = TempDir::new().unwrap();
            // Reuse the minimal PDF fixture and request a tiny limit so the
            // truncation path is exercised if extraction yields any text.
            let pdf_path = tmp.path().join("trunc.pdf");
            tokio::fs::write(&pdf_path, minimal_pdf_bytes())
                .await
                .unwrap();

            let reader = PdfReadTool::new(test_security(tmp.path().to_path_buf()));
            let request = json!({"path": "trunc.pdf", "max_chars": 5});
            let result = reader.execute(request).await.unwrap();

            // If extraction succeeded the output must respect the char limit
            // (plus the truncation suffix).
            if result.success && !result.output.is_empty() {
                let produced = result.output.chars().count();
                assert!(
                    produced <= 5 + "[truncated".len() + 50,
                    "output longer than expected: {} chars",
                    produced
                );
            }
        }

        #[tokio::test]
        async fn image_only_pdf_returns_empty_text_warning() {
            // A well-formed PDF with no text streams will yield empty output.
            // We simulate this with an otherwise valid PDF that has an empty content stream.
            let tmp = TempDir::new().unwrap();
            let empty_content_pdf = b"%PDF-1.4\n\
                1 0 obj<</Type/Catalog/Pages 2 0 R>>endobj\n\
                2 0 obj<</Type/Pages/Kids[3 0 R]/Count 1>>endobj\n\
                3 0 obj<</Type/Page/MediaBox[0 0 612 792]/Parent 2 0 R\
                /Contents 4 0 R/Resources<<>>>>endobj\n\
                4 0 obj<</Length 0>>\nstream\n\nendstream\nendobj\n\
                xref\n0 5\n\
                0000000000 65535 f \n\
                0000000009 00000 n \n\
                0000000058 00000 n \n\
                0000000115 00000 n \n\
                0000000250 00000 n \n\
                trailer<</Size 5/Root 1 0 R>>\nstartxref\n300\n%%EOF\n";

            tokio::fs::write(tmp.path().join("empty.pdf"), empty_content_pdf)
                .await
                .unwrap();

            let reader = PdfReadTool::new(test_security(tmp.path().to_path_buf()));
            let result = reader.execute(json!({"path": "empty.pdf"})).await.unwrap();

            // Acceptable outcomes: empty text warning, or extraction error for
            // malformed hand-crafted PDF.
            let message = error_text(&result);
            let is_empty_warning = result.success && result.output.contains("no extractable text");
            let is_extraction_error = !result.success && message.contains("extraction");
            let is_resolve_error = !result.success && message.contains("Failed");
            assert!(
                is_empty_warning || is_extraction_error || is_resolve_error,
                "unexpected result: success={} error={:?}",
                result.success,
                result.error
            );
        }
    }

    #[cfg(not(feature = "rag-pdf"))]
    #[tokio::test]
    async fn without_feature_returns_clear_error() {
        let tmp = TempDir::new().unwrap();
        let pdf_path = tmp.path().join("doc.pdf");
        tokio::fs::write(&pdf_path, b"%PDF-1.4 fake").await.unwrap();

        let reader = PdfReadTool::new(test_security(tmp.path().to_path_buf()));
        let result = reader.execute(json!({"path": "doc.pdf"})).await.unwrap();
        assert!(!result.success);
        assert!(
            error_text(&result).contains("rag-pdf"),
            "expected feature hint in error, got: {:?}",
            result.error
        );
    }

    /// Anti-probing regression: a caller cannot probe PDF existence for free.
    /// Each failed canonicalize must consume one action-budget slot via the
    /// inner-tool charge, so repeated probes hit the rate limit.
    #[tokio::test]
    async fn probing_nonexistent_consumes_rate_limit_budget() {
        let tmp = TempDir::new().unwrap();
        let security = test_security_with_limit(tmp.path().to_path_buf(), 2);
        let reader = PdfReadTool::new(security.clone());

        // Two probes of files that do not exist, one budget slot each.
        for missing in ["a.pdf", "b.pdf"] {
            let probe = reader.execute(json!({"path": missing})).await.unwrap();
            assert!(!probe.success);
            assert!(error_text(&probe).contains("Failed to resolve"));
        }

        // Budget must now be exhausted.
        assert!(
            !security.record_action(),
            "budget must be exhausted after two failed probes"
        );
    }
}