From ac334e9cf7580080172679b2b58deb4986c2ec78 Mon Sep 17 00:00:00 2001 From: tanyar09 Date: Tue, 24 Feb 2026 11:28:24 -0500 Subject: [PATCH] Enhance ReadFileTool for improved file reading capabilities - Added support for PDF file extraction using pdftotext, with error handling for extraction failures. - Updated tool description to clarify usage and file type support. - Improved error messages for binary and non-UTF-8 encoded files. - Updated ExecTool description to emphasize the importance of using read_file for reading files before executing commands. --- nanobot/agent/tools/filesystem.py | 48 ++++++++++++++++++++++++++++++- nanobot/agent/tools/shell.py | 6 +++- 2 files changed, 52 insertions(+), 2 deletions(-) diff --git a/nanobot/agent/tools/filesystem.py b/nanobot/agent/tools/filesystem.py index 6b3254a..9519aaa 100644 --- a/nanobot/agent/tools/filesystem.py +++ b/nanobot/agent/tools/filesystem.py @@ -1,5 +1,7 @@ """File system tools: read, write, edit.""" +import asyncio +import subprocess from pathlib import Path from typing import Any @@ -26,7 +28,14 @@ class ReadFileTool(Tool): @property def description(self) -> str: - return "Read the contents of a file at the given path." + return """Read the contents of a file at the given path. + +ALWAYS use this tool to read files - it supports: +- Text files (plain text, code, markdown, etc.) +- PDF files (automatically extracts text using pdftotext) +- Binary files will return an error + +For reading files, use read_file FIRST. Only use exec for complex data processing AFTER reading the file content.""" @property def parameters(self) -> dict[str, Any]: @@ -49,8 +58,45 @@ class ReadFileTool(Tool): if not file_path.is_file(): return f"Error: Not a file: {path}" + # Check if file is a PDF and extract text if so + if file_path.suffix.lower() == '.pdf': + try: + # Use -layout flag to preserve table structure (makes quantities, prices, etc. easier to see) + process = await asyncio.create_subprocess_exec( + 'pdftotext', '-layout', str(file_path), '-', + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + stdout, stderr = await asyncio.wait_for(process.communicate(), timeout=30.0) + if process.returncode == 0 and stdout: + return stdout.decode('utf-8', errors='replace') + # Fall back to reading as binary and checking PDF header + if stderr: + error_msg = stderr.decode('utf-8', errors='replace') + if 'pdftotext' not in error_msg.lower(): + return f"Error extracting PDF text: {error_msg}" + except FileNotFoundError: + # pdftotext not available, try to read and detect PDF + pass + except asyncio.TimeoutError: + return "Error: PDF extraction timed out" + except Exception as e: + return f"Error extracting PDF text: {str(e)}" + + # For non-PDF files or if PDF extraction failed, read as text content = file_path.read_text(encoding="utf-8") return content + except UnicodeDecodeError: + # If UTF-8 fails, try to detect if it's a PDF by reading first bytes + try: + file_path = _resolve_path(path, self._allowed_dir) + with open(file_path, 'rb') as f: + header = f.read(4) + if header == b'%PDF': + return f"Error: PDF file detected but text extraction failed. Install 'poppler-utils' (pdftotext) to read PDF files." + except: + pass + return f"Error: File appears to be binary or not UTF-8 encoded. Cannot read as text." except PermissionError as e: return f"Error: {e}" except Exception as e: diff --git a/nanobot/agent/tools/shell.py b/nanobot/agent/tools/shell.py index 840d906..2bf00cd 100644 --- a/nanobot/agent/tools/shell.py +++ b/nanobot/agent/tools/shell.py @@ -43,7 +43,11 @@ class ExecTool(Tool): def description(self) -> str: return """Execute a shell command and return its output. Use with caution. -IMPORTANT: For data analysis tasks (Excel, CSV, JSON files), ALWAYS use Python with pandas: +IMPORTANT: +- For READING files (including PDFs, text files, etc.), ALWAYS use read_file FIRST. Do NOT use exec to read files. +- Only use exec for complex data processing AFTER you have already read the file content using read_file. + +For data analysis tasks (Excel, CSV, JSON files), use Python with pandas: - Excel files: python3 -c "import pandas as pd; df = pd.read_excel('file.xlsx'); result = df['Column Name'].sum(); print(result)" - CSV files: python3 -c "import pandas as pd; df = pd.read_csv('file.csv'); result = df['Column Name'].sum(); print(result)" - NEVER use pandas/openpyxl as command-line tools (they don't exist)