From ac334e9cf7580080172679b2b58deb4986c2ec78 Mon Sep 17 00:00:00 2001
From: tanyar09 <tatiana.romlit@gmail.com>
Date: Tue, 24 Feb 2026 11:28:24 -0500
Subject: [PATCH] Add PDF text extraction to ReadFileTool and clarify tool descriptions

- Added PDF text extraction using pdftotext from poppler-utils (`-layout` mode, 30-second timeout), with error messages for extraction failures and timeouts.
- Updated tool description to clarify usage and file type support.
- Improved error messages for binary and non-UTF-8 encoded files.
- Updated ExecTool description to tell the agent to read files with read_file first and to use exec only for further processing of content it has already read.
---
 nanobot/agent/tools/filesystem.py | 48 ++++++++++++++++++++++++++++++-
 nanobot/agent/tools/shell.py      |  6 +++-
 2 files changed, 52 insertions(+), 2 deletions(-)

diff --git a/nanobot/agent/tools/filesystem.py b/nanobot/agent/tools/filesystem.py
index 6b3254a..9519aaa 100644
--- a/nanobot/agent/tools/filesystem.py
+++ b/nanobot/agent/tools/filesystem.py
@@ -1,5 +1,7 @@
 """File system tools: read, write, edit."""
 
+import asyncio
+import subprocess
 from pathlib import Path
 from typing import Any
 
@@ -26,7 +28,14 @@ class ReadFileTool(Tool):
     
     @property
     def description(self) -> str:
-        return "Read the contents of a file at the given path."
+        return """Read the contents of a file at the given path. 
+        
+ALWAYS use this tool to read files - it supports:
+- Text files (plain text, code, markdown, etc.)
+- PDF files (automatically extracts text using pdftotext)
+- Binary files will return an error
+
+For reading files, use read_file FIRST. Only use exec for complex data processing AFTER reading the file content."""
     
     @property
     def parameters(self) -> dict[str, Any]:
@@ -49,8 +58,45 @@ class ReadFileTool(Tool):
             if not file_path.is_file():
                 return f"Error: Not a file: {path}"
             
+            # Check if file is a PDF and extract text if so
+            if file_path.suffix.lower() == '.pdf':
+                try:
+                    # Use -layout flag to preserve table structure (makes quantities, prices, etc. easier to see)
+                    process = await asyncio.create_subprocess_exec(
+                        'pdftotext', '-layout', str(file_path), '-',
+                        stdout=asyncio.subprocess.PIPE,
+                        stderr=asyncio.subprocess.PIPE,
+                    )
+                    stdout, stderr = await asyncio.wait_for(process.communicate(), timeout=30.0)
+                    if process.returncode == 0 and stdout:
+                        return stdout.decode('utf-8', errors='replace')
+                    # No usable output: report stderr as an error, or fall through to the UTF-8 text read below
+                    if stderr:
+                        error_msg = stderr.decode('utf-8', errors='replace')
+                        if 'pdftotext' not in error_msg.lower():
+                            return f"Error extracting PDF text: {error_msg}"
+                except FileNotFoundError:
+                    # pdftotext not available, try to read and detect PDF
+                    pass
+                except asyncio.TimeoutError:
+                    return "Error: PDF extraction timed out"
+                except Exception as e:
+                    return f"Error extracting PDF text: {str(e)}"
+            
+            # For non-PDF files or if PDF extraction failed, read as text
             content = file_path.read_text(encoding="utf-8")
             return content
+        except UnicodeDecodeError:
+            # If UTF-8 fails, try to detect if it's a PDF by reading first bytes
+            try:
+                file_path = _resolve_path(path, self._allowed_dir)
+                with open(file_path, 'rb') as f:
+                    header = f.read(4)
+                    if header == b'%PDF':
+                        return f"Error: PDF file detected but text extraction failed. Install 'poppler-utils' (pdftotext) to read PDF files."
+            except:
+                pass
+            return f"Error: File appears to be binary or not UTF-8 encoded. Cannot read as text."
         except PermissionError as e:
             return f"Error: {e}"
         except Exception as e:
diff --git a/nanobot/agent/tools/shell.py b/nanobot/agent/tools/shell.py
index 840d906..2bf00cd 100644
--- a/nanobot/agent/tools/shell.py
+++ b/nanobot/agent/tools/shell.py
@@ -43,7 +43,11 @@ class ExecTool(Tool):
     def description(self) -> str:
         return """Execute a shell command and return its output. Use with caution.
 
-IMPORTANT: For data analysis tasks (Excel, CSV, JSON files), ALWAYS use Python with pandas:
+IMPORTANT: 
+- For READING files (including PDFs, text files, etc.), ALWAYS use read_file FIRST. Do NOT use exec to read files.
+- Only use exec for complex data processing AFTER you have already read the file content using read_file.
+
+For data analysis tasks (Excel, CSV, JSON files), use Python with pandas:
 - Excel files: python3 -c "import pandas as pd; df = pd.read_excel('file.xlsx'); result = df['Column Name'].sum(); print(result)"
 - CSV files: python3 -c "import pandas as pd; df = pd.read_csv('file.csv'); result = df['Column Name'].sum(); print(result)"
 - NEVER use pandas/openpyxl as command-line tools (they don't exist)