1use async_trait::async_trait;
2use serde_json::json;
3use std::sync::Arc;
4use zeroclaw_api::tool::{Tool, ToolResult};
5use zeroclaw_config::policy::SecurityPolicy;
6
/// Largest file size, in bytes, that `execute` will read (50 MiB); bigger files are rejected.
const MAX_PDF_BYTES: u64 = 50 << 20;
/// Character budget applied when the caller does not supply `max_chars`.
const DEFAULT_MAX_CHARS: usize = 50_000;
/// Hard ceiling for a caller-supplied `max_chars`; larger requests are reduced to this value.
const MAX_OUTPUT_CHARS: usize = 200_000;
13
14pub struct PdfReadTool {
22 security: Arc<SecurityPolicy>,
23}
24
25impl PdfReadTool {
26 pub fn new(security: Arc<SecurityPolicy>) -> Self {
27 Self { security }
28 }
29}
30
31#[async_trait]
32impl Tool for PdfReadTool {
33 fn name(&self) -> &str {
34 "pdf_read"
35 }
36
37 fn description(&self) -> &str {
38 "Extract plain text from a PDF file in the workspace. \
39 Returns all readable text. Image-only or encrypted PDFs return an empty result. \
40 Requires the 'rag-pdf' build feature."
41 }
42
43 fn parameters_schema(&self) -> serde_json::Value {
44 json!({
45 "type": "object",
46 "properties": {
47 "path": {
48 "type": "string",
49 "description": "Path to the PDF file. Relative paths resolve from workspace; outside paths require policy allowlist."
50 },
51 "max_chars": {
52 "type": "integer",
53 "description": "Maximum characters to return (default: 50000, max: 200000)",
54 "minimum": 1,
55 "maximum": 200_000
56 }
57 },
58 "required": ["path"]
59 })
60 }
61
62 async fn execute(&self, args: serde_json::Value) -> anyhow::Result<ToolResult> {
63 let path = args.get("path").and_then(|v| v.as_str()).ok_or_else(|| {
64 ::zeroclaw_log::record!(
65 WARN,
66 ::zeroclaw_log::Event::new(module_path!(), ::zeroclaw_log::Action::Reject)
67 .with_outcome(::zeroclaw_log::EventOutcome::Failure)
68 .with_attrs(::serde_json::json!({"param": "path"})),
69 "pdf_read: missing path parameter"
70 );
71 anyhow::Error::msg("Missing 'path' parameter")
72 })?;
73
74 let max_chars = args
75 .get("max_chars")
76 .and_then(|v| v.as_u64())
77 .map(|n| {
78 usize::try_from(n)
79 .unwrap_or(MAX_OUTPUT_CHARS)
80 .min(MAX_OUTPUT_CHARS)
81 })
82 .unwrap_or(DEFAULT_MAX_CHARS);
83
84 let full_path = self.security.resolve_tool_path(path);
98
99 let resolved_path = match tokio::fs::canonicalize(&full_path).await {
100 Ok(p) => p,
101 Err(e) => {
102 let _ = self.security.record_action();
103 return Ok(ToolResult {
104 success: false,
105 output: String::new(),
106 error: Some(format!("Failed to resolve file path: {e}")),
107 });
108 }
109 };
110
111 if !self.security.is_resolved_path_readable(&resolved_path) {
112 return Ok(ToolResult {
113 success: false,
114 output: String::new(),
115 error: Some(
116 self.security
117 .resolved_path_violation_message(&resolved_path),
118 ),
119 });
120 }
121
122 ::zeroclaw_log::record!(
123 DEBUG,
124 ::zeroclaw_log::Event::new(module_path!(), ::zeroclaw_log::Action::Note),
125 &format!("Reading PDF: {}", resolved_path.display())
126 );
127
128 match tokio::fs::metadata(&resolved_path).await {
129 Ok(meta) => {
130 if meta.len() > MAX_PDF_BYTES {
131 return Ok(ToolResult {
132 success: false,
133 output: String::new(),
134 error: Some(format!(
135 "PDF too large: {} bytes (limit: {MAX_PDF_BYTES} bytes)",
136 meta.len()
137 )),
138 });
139 }
140 }
141 Err(e) => {
142 return Ok(ToolResult {
143 success: false,
144 output: String::new(),
145 error: Some(format!("Failed to read file metadata: {e}")),
146 });
147 }
148 }
149
150 let bytes = match tokio::fs::read(&resolved_path).await {
151 Ok(b) => b,
152 Err(e) => {
153 return Ok(ToolResult {
154 success: false,
155 output: String::new(),
156 error: Some(format!("Failed to read PDF file: {e}")),
157 });
158 }
159 };
160
161 #[cfg(feature = "rag-pdf")]
163 {
164 let text = match tokio::task::spawn_blocking(move || {
165 pdf_extract::extract_text_from_mem(&bytes)
166 })
167 .await
168 {
169 Ok(Ok(t)) => t,
170 Ok(Err(e)) => {
171 return Ok(ToolResult {
172 success: false,
173 output: String::new(),
174 error: Some(format!("PDF extraction failed: {e}")),
175 });
176 }
177 Err(e) => {
178 return Ok(ToolResult {
179 success: false,
180 output: String::new(),
181 error: Some(format!("PDF extraction task panicked: {e}")),
182 });
183 }
184 };
185
186 if text.trim().is_empty() {
187 return Ok(ToolResult {
188 success: true,
189 output: "PDF contains no extractable text (may be image-only or encrypted)"
192 .into(),
193 error: None,
194 });
195 }
196
197 let output = if text.chars().count() > max_chars {
198 let mut truncated: String = text.chars().take(max_chars).collect();
199 use std::fmt::Write as _;
200 let _ = write!(truncated, "\n\n... [truncated at {max_chars} chars]");
201 truncated
202 } else {
203 text
204 };
205
206 return Ok(ToolResult {
207 success: true,
208 output,
209 error: None,
210 });
211 }
212
213 #[cfg(not(feature = "rag-pdf"))]
214 {
215 let _ = bytes;
216 let _ = max_chars;
217 Ok(ToolResult {
218 success: false,
219 output: String::new(),
220 error: Some(
221 "PDF extraction is not enabled. \
222 Rebuild with: cargo build --features rag-pdf"
223 .into(),
224 ),
225 })
226 }
227 }
228}
229
#[cfg(test)]
mod tests {
    use super::*;
    use crate::wrappers::{PathGuardedTool, RateLimitedTool};
    use tempfile::TempDir;
    use zeroclaw_config::autonomy::AutonomyLevel;
    use zeroclaw_config::policy::SecurityPolicy;

    /// Supervised policy rooted at `workspace`, with every other setting left at its default.
    fn test_security(workspace: std::path::PathBuf) -> Arc<SecurityPolicy> {
        let policy = SecurityPolicy {
            autonomy: AutonomyLevel::Supervised,
            workspace_dir: workspace,
            ..SecurityPolicy::default()
        };
        Arc::new(policy)
    }

    /// Same as `test_security`, but with an explicit hourly action budget.
    fn test_security_with_limit(
        workspace: std::path::PathBuf,
        max_actions: u32,
    ) -> Arc<SecurityPolicy> {
        let policy = SecurityPolicy {
            autonomy: AutonomyLevel::Supervised,
            workspace_dir: workspace,
            max_actions_per_hour: max_actions,
            ..SecurityPolicy::default()
        };
        Arc::new(policy)
    }

    /// The PDF tool wrapped in `PathGuardedTool` and then `RateLimitedTool`.
    /// NOTE(review): presumably this mirrors how the tool is registered in
    /// production; confirm against the registration site.
    fn wrapped_tool(workspace: std::path::PathBuf) -> Box<dyn Tool> {
        let security = test_security(workspace);
        let reader = PdfReadTool::new(Arc::clone(&security));
        let guarded = PathGuardedTool::new(reader, Arc::clone(&security));
        Box::new(RateLimitedTool::new(guarded, security))
    }

    /// Error text of `result`, or the empty string when no error was set.
    fn error_text(result: &ToolResult) -> &str {
        result.error.as_deref().unwrap_or("")
    }

    #[test]
    fn name_is_pdf_read() {
        let reader = PdfReadTool::new(test_security(std::env::temp_dir()));
        assert_eq!(reader.name(), "pdf_read");
    }

    #[test]
    fn description_not_empty() {
        let reader = PdfReadTool::new(test_security(std::env::temp_dir()));
        assert!(!reader.description().is_empty());
    }

    #[test]
    fn schema_has_path_required() {
        let reader = PdfReadTool::new(test_security(std::env::temp_dir()));
        let schema = reader.parameters_schema();
        let properties = &schema["properties"];
        assert!(properties["path"].is_object());
        assert!(properties["max_chars"].is_object());
        let required = schema["required"].as_array().unwrap();
        assert!(required.contains(&json!("path")));
    }

    #[test]
    fn spec_matches_metadata() {
        let reader = PdfReadTool::new(test_security(std::env::temp_dir()));
        let spec = reader.spec();
        assert_eq!(spec.name, "pdf_read");
        assert!(spec.parameters.is_object());
    }

    #[tokio::test]
    async fn missing_path_param_returns_error() {
        let reader = PdfReadTool::new(test_security(std::env::temp_dir()));
        let outcome = reader.execute(json!({})).await;
        assert!(outcome.is_err());
        let message = outcome.unwrap_err().to_string();
        assert!(message.contains("path"));
    }

    #[tokio::test]
    async fn absolute_path_is_blocked() {
        let tool = wrapped_tool(std::env::temp_dir());

        // A well-known system file outside the workspace on each platform.
        #[cfg(unix)]
        let target = "/etc/passwd";
        #[cfg(windows)]
        let target = {
            let sysroot = std::env::var("SystemRoot").unwrap_or_else(|_| r"C:\Windows".to_string());
            std::path::PathBuf::from(sysroot).join(r"System32\drivers\etc\hosts")
        };

        let result = tool.execute(json!({"path": target})).await.unwrap();
        assert!(!result.success);
        assert!(
            error_text(&result).contains("Path blocked"),
            "expected 'Path blocked' error, got: {:?}",
            result.error
        );
    }

    #[tokio::test]
    async fn path_traversal_is_blocked() {
        let tmp = TempDir::new().unwrap();
        let tool = wrapped_tool(tmp.path().to_path_buf());
        let request = json!({"path": "../../../etc/passwd"});
        let result = tool.execute(request).await.unwrap();
        assert!(!result.success);
        assert!(
            error_text(&result).contains("Path blocked"),
            "expected 'Path blocked' error, got: {:?}",
            result.error
        );
    }

    #[tokio::test]
    async fn nonexistent_file_returns_error() {
        let tmp = TempDir::new().unwrap();
        let reader = PdfReadTool::new(test_security(tmp.path().to_path_buf()));
        let request = json!({"path": "does_not_exist.pdf"});
        let result = reader.execute(request).await.unwrap();
        assert!(!result.success);
        assert!(error_text(&result).contains("Failed to resolve"));
    }

    #[cfg(unix)]
    #[tokio::test]
    async fn symlink_escape_is_blocked() {
        use std::os::unix::fs::symlink;

        // Layout: <root>/workspace/link.pdf -> <root>/outside/secret.pdf
        let root = TempDir::new().unwrap();
        let workspace = root.path().join("workspace");
        let outside = root.path().join("outside");
        for dir in [&workspace, &outside] {
            tokio::fs::create_dir_all(dir).await.unwrap();
        }
        let secret = outside.join("secret.pdf");
        tokio::fs::write(&secret, b"%PDF-1.4 secret").await.unwrap();
        symlink(&secret, workspace.join("link.pdf")).unwrap();

        let reader = PdfReadTool::new(test_security(workspace));
        let result = reader.execute(json!({"path": "link.pdf"})).await.unwrap();
        assert!(!result.success);
        assert!(error_text(&result).contains("escapes workspace"));
    }

    #[cfg(feature = "rag-pdf")]
    mod extraction {
        use super::*;

        /// Hand-written single-page PDF whose content stream draws "Hello PDF".
        fn minimal_pdf_bytes() -> Vec<u8> {
            let body = b"%PDF-1.4\n\
                1 0 obj<</Type/Catalog/Pages 2 0 R>>endobj\n\
                2 0 obj<</Type/Pages/Kids[3 0 R]/Count 1>>endobj\n\
                3 0 obj<</Type/Page/MediaBox[0 0 612 792]/Parent 2 0 R\
                /Contents 4 0 R/Resources<</Font<</F1 5 0 R>>>>>>endobj\n\
                4 0 obj<</Length 44>>\nstream\n\
                BT /F1 12 Tf 72 720 Td (Hello PDF) Tj ET\n\
                endstream\nendobj\n\
                5 0 obj<</Type/Font/Subtype/Type1/BaseFont/Helvetica>>endobj\n";

            // The cross-reference table starts immediately after the body.
            let xref_offset = body.len();

            let xref = format!(
                "xref\n0 6\n\
                 0000000000 65535 f \n\
                 0000000009 00000 n \n\
                 0000000058 00000 n \n\
                 0000000115 00000 n \n\
                 0000000274 00000 n \n\
                 0000000370 00000 n \n\
                 trailer<</Size 6/Root 1 0 R>>\n\
                 startxref\n{xref_offset}\n%%EOF\n"
            );

            [&body[..], xref.as_bytes()].concat()
        }

        #[tokio::test]
        async fn extracts_text_from_valid_pdf() {
            let tmp = TempDir::new().unwrap();
            tokio::fs::write(tmp.path().join("test.pdf"), minimal_pdf_bytes())
                .await
                .unwrap();

            let reader = PdfReadTool::new(test_security(tmp.path().to_path_buf()));
            let result = reader.execute(json!({"path": "test.pdf"})).await.unwrap();

            // NOTE(review): `execute` returns the "no extractable text" notice in
            // `output` with `success: true`, so the error-side check below can
            // never match; this assertion is effectively `result.success`.
            assert!(result.success || error_text(&result).contains("no extractable"));
        }

        #[tokio::test]
        async fn max_chars_truncates_output() {
            let tmp = TempDir::new().unwrap();
            tokio::fs::write(tmp.path().join("trunc.pdf"), minimal_pdf_bytes())
                .await
                .unwrap();

            let reader = PdfReadTool::new(test_security(tmp.path().to_path_buf()));
            let request = json!({"path": "trunc.pdf", "max_chars": 5});
            let result = reader.execute(request).await.unwrap();

            // Only checked when extraction produced something: the output may
            // hold the 5 requested characters plus the truncation marker.
            let produced = result.output.chars().count();
            let ceiling = 5 + "[truncated".len() + 50;
            if result.success && !result.output.is_empty() {
                assert!(
                    produced <= ceiling,
                    "output longer than expected: {} chars",
                    produced
                );
            }
        }

        #[tokio::test]
        async fn image_only_pdf_returns_empty_text_warning() {
            let tmp = TempDir::new().unwrap();
            // Single page whose content stream is empty.
            let empty_content_pdf = b"%PDF-1.4\n\
                1 0 obj<</Type/Catalog/Pages 2 0 R>>endobj\n\
                2 0 obj<</Type/Pages/Kids[3 0 R]/Count 1>>endobj\n\
                3 0 obj<</Type/Page/MediaBox[0 0 612 792]/Parent 2 0 R\
                /Contents 4 0 R/Resources<<>>>>endobj\n\
                4 0 obj<</Length 0>>\nstream\n\nendstream\nendobj\n\
                xref\n0 5\n\
                0000000000 65535 f \n\
                0000000009 00000 n \n\
                0000000058 00000 n \n\
                0000000115 00000 n \n\
                0000000250 00000 n \n\
                trailer<</Size 5/Root 1 0 R>>\nstartxref\n300\n%%EOF\n";

            tokio::fs::write(tmp.path().join("empty.pdf"), empty_content_pdf)
                .await
                .unwrap();

            let reader = PdfReadTool::new(test_security(tmp.path().to_path_buf()));
            let result = reader.execute(json!({"path": "empty.pdf"})).await.unwrap();

            // Accepted outcomes: the empty-text notice on success, or a failure
            // whose message mentions "extraction" or "Failed".
            let accepted = if result.success {
                result.output.contains("no extractable text")
            } else {
                let message = error_text(&result);
                message.contains("extraction") || message.contains("Failed")
            };
            assert!(
                accepted,
                "unexpected result: success={} error={:?}",
                result.success,
                result.error
            );
        }
    }

    #[cfg(not(feature = "rag-pdf"))]
    #[tokio::test]
    async fn without_feature_returns_clear_error() {
        let tmp = TempDir::new().unwrap();
        tokio::fs::write(tmp.path().join("doc.pdf"), b"%PDF-1.4 fake")
            .await
            .unwrap();

        let reader = PdfReadTool::new(test_security(tmp.path().to_path_buf()));
        let result = reader.execute(json!({"path": "doc.pdf"})).await.unwrap();
        assert!(!result.success);
        assert!(
            error_text(&result).contains("rag-pdf"),
            "expected feature hint in error, got: {:?}",
            result.error
        );
    }

    #[tokio::test]
    async fn probing_nonexistent_consumes_rate_limit_budget() {
        let tmp = TempDir::new().unwrap();
        let security = test_security_with_limit(tmp.path().to_path_buf(), 2);
        let reader = PdfReadTool::new(Arc::clone(&security));

        // Two lookups of files that do not exist, against a budget of two actions.
        for missing in ["a.pdf", "b.pdf"] {
            let probe = reader.execute(json!({"path": missing})).await.unwrap();
            assert!(!probe.success);
            assert!(error_text(&probe).contains("Failed to resolve"));
        }

        assert!(
            !security.record_action(),
            "budget must be exhausted after two failed probes"
        );
    }
}