eohi/lit review/pdf_to_txt.py
2026-01-22 17:55:35 -05:00

129 lines
3.3 KiB
Python

#!/home/ladmin/miniconda3/envs/nlp/bin/python
"""
PDF to Text Converter

Converts PDF files to plain text files.

Usage:
    python pdf_to_txt.py <input.pdf>                # Creates input.txt
    python pdf_to_txt.py <input.pdf> <output.txt>   # Custom output name
    python pdf_to_txt.py --all                      # Convert all PDFs in current directory

Requirements:
    pip install pypdf
"""
import sys
import os
from pathlib import Path
try:
from pypdf import PdfReader
except ImportError:
print("Error: pypdf library not found.")
print("Please install it with: pip install pypdf")
sys.exit(1)
def pdf_to_text(pdf_path, output_path=None):
    """
    Convert a PDF file to a text file.

    Every page that yields text is written under a "--- Page N ---" header;
    pages for which no text is extracted are left out of the output.

    Args:
        pdf_path: Path to the PDF file
        output_path: Path to the output text file (optional). When omitted,
            the input path with its suffix replaced by ".txt" is used.

    Returns:
        True if successful, False otherwise. Failures are reported on
        stdout instead of being raised.
    """
    try:
        # Convert to Path objects
        pdf_path = Path(pdf_path)

        if not pdf_path.exists():
            print(f"Error: File not found: {pdf_path}")
            return False
        if pdf_path.is_dir():
            # A directory passes exists() but can never be parsed as a PDF.
            print(f"Error: Not a file: {pdf_path}")
            return False

        # Determine output path
        if output_path is None:
            output_path = pdf_path.with_suffix('.txt')
        else:
            output_path = Path(output_path)

        # Refuse to overwrite the source document. This happens when the
        # caller passes the same path twice, or when the input already ends
        # in ".txt" so the default output name resolves to the input itself.
        if output_path.exists() and output_path.samefile(pdf_path):
            print(f"Error: Output would overwrite the input file: {pdf_path}")
            return False

        print(f"Converting: {pdf_path.name}")

        # Read the PDF
        reader = PdfReader(str(pdf_path))

        # Extract text from all pages (page numbers are 1-based in the output)
        text_content = []
        for i, page in enumerate(reader.pages, 1):
            text = page.extract_text()
            if text:
                text_content.append(f"--- Page {i} ---\n{text}\n")

        if not text_content:
            # The output file is still written (empty), but tell the user why.
            print(f"Warning: No extractable text found in {pdf_path.name}")

        # Write to text file
        full_text = "\n".join(text_content)
        output_path.write_text(full_text, encoding='utf-8')

        print(f"✓ Created: {output_path.name} ({len(reader.pages)} pages, {len(full_text):,} characters)")
        return True

    except Exception as e:
        # Deliberately broad: this is the per-file boundary for batch runs,
        # so one unreadable PDF must not abort the others.
        # pdf_path is still the caller's raw argument if Path() itself failed,
        # so do not assume it has a .name attribute.
        name = getattr(pdf_path, "name", pdf_path)
        print(f"✗ Error processing {name}: {e}")
        return False
def convert_all_pdfs():
    """Convert all PDF files in the current directory to text files.

    Only regular files directly inside the current working directory are
    considered (no recursion). The ".pdf" extension is matched
    case-insensitively and files are processed in sorted order. Each file is
    handed to pdf_to_text(); a truthy result counts as a success, anything
    else as a failure. A summary line is printed at the end.

    Returns:
        None
    """
    current_dir = Path.cwd()

    # glob("*.pdf") is case-sensitive on POSIX (it misses "REPORT.PDF") and
    # also matches directories that happen to end in ".pdf", so filter the
    # directory listing explicitly. Sorting keeps the run order stable.
    pdf_files = sorted(
        entry
        for entry in current_dir.iterdir()
        if entry.is_file() and entry.name.lower().endswith(".pdf")
    )

    if not pdf_files:
        print("No PDF files found in the current directory.")
        return

    print(f"Found {len(pdf_files)} PDF file(s) to convert.\n")

    successful = 0
    failed = 0
    for pdf_file in pdf_files:
        if pdf_to_text(pdf_file):
            successful += 1
        else:
            failed += 1

    print(f"\n{'='*60}")
    print(f"Conversion complete: {successful} successful, {failed} failed")
def main():
    """Command-line entry point.

    Reads sys.argv and dispatches:
      * no arguments            -> print the module usage text, exit status 1
      * -h / --help             -> print the module usage text, exit status 0
      * --all                   -> convert every PDF in the current directory
      * <input.pdf>             -> convert one file to <input>.txt
      * <input.pdf> <output>    -> convert one file to the given output path
      * more than two arguments -> print an error and the usage text, exit 1

    For a single-file conversion the process exits with status 1 when
    pdf_to_text() reports failure, so shell callers can detect the error.
    """
    args = sys.argv[1:]

    if not args:
        print(__doc__)
        sys.exit(1)

    # Asking for help is not an error, and must not be treated as a filename.
    if args[0] in ("-h", "--help"):
        print(__doc__)
        sys.exit(0)

    # Convert all PDFs in directory
    if args[0] == "--all":
        convert_all_pdfs()
        return

    if len(args) > 2:
        print("Error: Too many arguments")
        print(__doc__)
        sys.exit(1)

    # args is now (pdf_path,) or (pdf_path, output_path).
    # Propagate a failed conversion as a non-zero exit status.
    if not pdf_to_text(*args):
        sys.exit(1)
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()