eohi/lit review/pdf_to_txt.py

#!/home/ladmin/miniconda3/envs/nlp/bin/python
"""
PDF to Text Converter
Converts PDF files to plain text files.

Usage:
    python pdf_to_txt.py <input.pdf>                    # Creates input.txt
    python pdf_to_txt.py <input.pdf> <output.txt>       # Custom output name
    python pdf_to_txt.py --all                           # Convert all PDFs in current directory

Requirements:
    pip install pypdf
"""

import sys
import os
from pathlib import Path

try:
    from pypdf import PdfReader
except ImportError:
    print("Error: pypdf library not found.")
    print("Please install it with: pip install pypdf")
    sys.exit(1)


def pdf_to_text(pdf_path, output_path=None):
    """
    Convert a PDF file to a text file.

    Text is extracted page by page. Every page for which extraction yields
    a non-empty string is written under a "--- Page N ---" header; pages
    yielding nothing are omitted from the output.

    Args:
        pdf_path: Path to the PDF file
        output_path: Path to the output text file (optional). Defaults to
            the PDF path with its suffix replaced by ".txt".

    Returns:
        True if successful, False otherwise. Failures are reported on
        stdout instead of being raised.
    """
    # Keep a printable label that stays valid even if the Path() conversion
    # below fails, so the error handler can never raise on its own.
    label = str(pdf_path)

    try:
        # Convert to Path objects
        pdf_path = Path(pdf_path)
        label = pdf_path.name

        if not pdf_path.exists():
            print(f"Error: File not found: {pdf_path}")
            return False

        # Determine output path
        if output_path is None:
            output_path = pdf_path.with_suffix('.txt')
        else:
            output_path = Path(output_path)

        print(f"Converting: {pdf_path.name}")

        # Read the PDF
        reader = PdfReader(str(pdf_path))

        # Extract text from all pages (page numbers are 1-based in the output)
        text_content = []
        for i, page in enumerate(reader.pages, 1):
            text = page.extract_text()
            if text:
                text_content.append(f"--- Page {i} ---\n{text}\n")

        # Write to text file
        full_text = "\n".join(text_content)
        output_path.write_text(full_text, encoding='utf-8')

        print(f"✓ Created: {output_path.name} ({len(reader.pages)} pages, {len(full_text):,} characters)")
        return True

    except Exception as e:
        # Top-level boundary for the CLI: report the problem and signal
        # failure through the return value so batch conversion can continue.
        print(f"✗ Error processing {label}: {e}")
        return False


def convert_all_pdfs():
    """Convert all PDF files in the current directory to text files.

    Considers regular files whose extension is ".pdf" in any letter case,
    processes them in sorted order, and prints a summary of how many
    conversions succeeded and failed. Returns None.
    """
    current_dir = Path.cwd()

    # Compare the suffix case-insensitively (a "*.pdf" glob misses "X.PDF" on
    # case-sensitive filesystems), skip directories that merely end in ".pdf",
    # and sort so the processing order is reproducible.
    pdf_files = sorted(
        entry
        for entry in current_dir.iterdir()
        if entry.is_file() and entry.suffix.lower() == ".pdf"
    )

    if not pdf_files:
        print("No PDF files found in the current directory.")
        return

    print(f"Found {len(pdf_files)} PDF file(s) to convert.\n")

    successful = 0
    failed = 0

    for pdf_file in pdf_files:
        if pdf_to_text(pdf_file):
            successful += 1
        else:
            failed += 1

    print(f"\n{'='*60}")
    print(f"Conversion complete: {successful} successful, {failed} failed")


def main():
    """Parse command-line arguments and run the requested conversion.

    Accepted forms (arguments after the script name):
        --all                 convert every PDF in the current directory
        <input.pdf>           convert one PDF, output next to the input
        <input.pdf> <output>  convert one PDF to the given output path

    Exits with status 1 when no arguments or too many arguments are given,
    and when a single-file conversion reports failure, so that calling
    shells and scripts can detect the error.
    """
    args = sys.argv[1:]

    if not args:
        print(__doc__)
        sys.exit(1)

    # Convert all PDFs in directory
    if args[0] == "--all":
        convert_all_pdfs()
        return

    if len(args) > 2:
        print("Error: Too many arguments")
        print(__doc__)
        sys.exit(1)

    # Convert a single PDF; args is (pdf_path,) or (pdf_path, output_path).
    if not pdf_to_text(*args):
        sys.exit(1)


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()