Enhance ReadFileTool for improved file reading capabilities
- Added support for PDF text extraction using pdftotext, with error handling for extraction failures.
- Updated the tool description to clarify usage and file type support.
- Improved error messages for binary and non-UTF-8 encoded files.
- Updated the ExecTool description to emphasize the importance of using read_file to read files before executing commands.
This commit is contained in:
parent
d3cb1d0050
commit
ac334e9cf7
@ -1,5 +1,7 @@
|
|||||||
"""File system tools: read, write, edit."""
|
"""File system tools: read, write, edit."""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import subprocess
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
@ -26,7 +28,14 @@ class ReadFileTool(Tool):
|
|||||||
|
|
||||||
@property
|
@property
|
||||||
def description(self) -> str:
|
def description(self) -> str:
|
||||||
return "Read the contents of a file at the given path."
|
return """Read the contents of a file at the given path.
|
||||||
|
|
||||||
|
ALWAYS use this tool to read files - it supports:
|
||||||
|
- Text files (plain text, code, markdown, etc.)
|
||||||
|
- PDF files (automatically extracts text using pdftotext)
|
||||||
|
- Binary files will return an error
|
||||||
|
|
||||||
|
For reading files, use read_file FIRST. Only use exec for complex data processing AFTER reading the file content."""
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def parameters(self) -> dict[str, Any]:
|
def parameters(self) -> dict[str, Any]:
|
||||||
@ -49,8 +58,45 @@ class ReadFileTool(Tool):
|
|||||||
if not file_path.is_file():
|
if not file_path.is_file():
|
||||||
return f"Error: Not a file: {path}"
|
return f"Error: Not a file: {path}"
|
||||||
|
|
||||||
|
# Check if file is a PDF and extract text if so
|
||||||
|
if file_path.suffix.lower() == '.pdf':
|
||||||
|
try:
|
||||||
|
# Use -layout flag to preserve table structure (makes quantities, prices, etc. easier to see)
|
||||||
|
process = await asyncio.create_subprocess_exec(
|
||||||
|
'pdftotext', '-layout', str(file_path), '-',
|
||||||
|
stdout=asyncio.subprocess.PIPE,
|
||||||
|
stderr=asyncio.subprocess.PIPE,
|
||||||
|
)
|
||||||
|
stdout, stderr = await asyncio.wait_for(process.communicate(), timeout=30.0)
|
||||||
|
if process.returncode == 0 and stdout:
|
||||||
|
return stdout.decode('utf-8', errors='replace')
|
||||||
|
# Fall back to reading as binary and checking PDF header
|
||||||
|
if stderr:
|
||||||
|
error_msg = stderr.decode('utf-8', errors='replace')
|
||||||
|
if 'pdftotext' not in error_msg.lower():
|
||||||
|
return f"Error extracting PDF text: {error_msg}"
|
||||||
|
except FileNotFoundError:
|
||||||
|
# pdftotext not available, try to read and detect PDF
|
||||||
|
pass
|
||||||
|
except asyncio.TimeoutError:
|
||||||
|
return "Error: PDF extraction timed out"
|
||||||
|
except Exception as e:
|
||||||
|
return f"Error extracting PDF text: {str(e)}"
|
||||||
|
|
||||||
|
# For non-PDF files or if PDF extraction failed, read as text
|
||||||
content = file_path.read_text(encoding="utf-8")
|
content = file_path.read_text(encoding="utf-8")
|
||||||
return content
|
return content
|
||||||
|
except UnicodeDecodeError:
|
||||||
|
# If UTF-8 fails, try to detect if it's a PDF by reading first bytes
|
||||||
|
try:
|
||||||
|
file_path = _resolve_path(path, self._allowed_dir)
|
||||||
|
with open(file_path, 'rb') as f:
|
||||||
|
header = f.read(4)
|
||||||
|
if header == b'%PDF':
|
||||||
|
return f"Error: PDF file detected but text extraction failed. Install 'poppler-utils' (pdftotext) to read PDF files."
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
return f"Error: File appears to be binary or not UTF-8 encoded. Cannot read as text."
|
||||||
except PermissionError as e:
|
except PermissionError as e:
|
||||||
return f"Error: {e}"
|
return f"Error: {e}"
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|||||||
@ -43,7 +43,11 @@ class ExecTool(Tool):
|
|||||||
def description(self) -> str:
|
def description(self) -> str:
|
||||||
return """Execute a shell command and return its output. Use with caution.
|
return """Execute a shell command and return its output. Use with caution.
|
||||||
|
|
||||||
IMPORTANT: For data analysis tasks (Excel, CSV, JSON files), ALWAYS use Python with pandas:
|
IMPORTANT:
|
||||||
|
- For READING files (including PDFs, text files, etc.), ALWAYS use read_file FIRST. Do NOT use exec to read files.
|
||||||
|
- Only use exec for complex data processing AFTER you have already read the file content using read_file.
|
||||||
|
|
||||||
|
For data analysis tasks (Excel, CSV, JSON files), use Python with pandas:
|
||||||
- Excel files: python3 -c "import pandas as pd; df = pd.read_excel('file.xlsx'); result = df['Column Name'].sum(); print(result)"
|
- Excel files: python3 -c "import pandas as pd; df = pd.read_excel('file.xlsx'); result = df['Column Name'].sum(); print(result)"
|
||||||
- CSV files: python3 -c "import pandas as pd; df = pd.read_csv('file.csv'); result = df['Column Name'].sum(); print(result)"
|
- CSV files: python3 -c "import pandas as pd; df = pd.read_csv('file.csv'); result = df['Column Name'].sum(); print(result)"
|
||||||
- NEVER use pandas/openpyxl as command-line tools (they don't exist)
|
- NEVER use pandas/openpyxl as command-line tools (they don't exist)
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user