#!/home/ladmin/miniconda3/envs/nlp/bin/python
"""
PDF to Text Converter

Converts PDF files to plain text files.

Usage:
    python pdf_to_txt.py <input.pdf>               # Creates input.txt
    python pdf_to_txt.py <input.pdf> <output.txt>  # Custom output name
    python pdf_to_txt.py --all                     # Convert all PDFs in current directory

Requirements:
    pip install pypdf
"""

import sys
import os
from pathlib import Path

try:
    from pypdf import PdfReader
except ImportError:
    # Do not terminate the interpreter at import time: that would also kill
    # any program that merely imports this module. The missing dependency is
    # reported by main() (exit status 1) and by pdf_to_text() (returns False).
    PdfReader = None


def _report_missing_pypdf():
    """Print the installation hint shown when pypdf cannot be imported."""
    print("Error: pypdf library not found.")
    print("Please install it with: pip install pypdf")


def pdf_to_text(pdf_path, output_path=None):
    """
    Convert a PDF file to a text file.

    The text of every page that yields any text is written to the output
    file, each page preceded by a "--- Page N ---" header. Pages for which
    no text is extracted are omitted from the output.

    Args:
        pdf_path: Path to the PDF file (str or path-like).
        output_path: Path to the output text file (optional). Defaults to
            the PDF path with its suffix replaced by ".txt".

    Returns:
        True if successful, False otherwise. Errors are reported on stdout
        rather than raised.
    """
    # Convert outside the main try block so that the error handler below can
    # always rely on pdf_path being a Path (it uses pdf_path.name).
    try:
        pdf_path = Path(pdf_path)
    except TypeError:
        print(f"✗ Error: invalid PDF path: {pdf_path!r}")
        return False

    try:
        if not pdf_path.exists():
            print(f"Error: File not found: {pdf_path}")
            return False

        if PdfReader is None:
            _report_missing_pypdf()
            return False

        # Determine output path
        if output_path is None:
            output_path = pdf_path.with_suffix('.txt')
        else:
            output_path = Path(output_path)

        # Guard against destroying the source document, e.g. when the same
        # path is passed twice or the input file already ends in ".txt".
        if output_path.resolve() == pdf_path.resolve():
            print(f"Error: Output path would overwrite the input file: {pdf_path}")
            return False

        print(f"Converting: {pdf_path.name}")

        # Read the PDF
        reader = PdfReader(str(pdf_path))
        page_count = len(reader.pages)

        # Extract text from all pages, skipping pages that yield no text
        text_content = []
        for page_number, page in enumerate(reader.pages, 1):
            text = page.extract_text()
            if text:
                text_content.append(f"--- Page {page_number} ---\n{text}\n")

        # Write to text file
        full_text = "\n".join(text_content)
        output_path.write_text(full_text, encoding='utf-8')

        print(f"✓ Created: {output_path.name} ({page_count} pages, {len(full_text):,} characters)")
        return True

    except Exception as e:
        # Best-effort boundary: one unreadable PDF must not abort a batch run.
        print(f"✗ Error processing {pdf_path.name}: {str(e)}")
        return False


def convert_all_pdfs():
    """
    Convert all PDF files in the current directory to text files.

    Returns:
        True if no conversion failed (including the case where there were
        no PDF files to convert), False if at least one file failed.
    """
    current_dir = Path.cwd()
    # Sort so that the processing order (and the log output) is reproducible.
    pdf_files = sorted(current_dir.glob("*.pdf"))

    if not pdf_files:
        print("No PDF files found in the current directory.")
        return True

    print(f"Found {len(pdf_files)} PDF file(s) to convert.\n")

    successful = 0
    failed = 0

    for pdf_file in pdf_files:
        if pdf_to_text(pdf_file):
            successful += 1
        else:
            failed += 1

    print(f"\n{'='*60}")
    print(f"Conversion complete: {successful} successful, {failed} failed")
    return failed == 0


def main():
    """
    Command-line entry point.

    Dispatches on sys.argv: "--all" converts every PDF in the current
    directory, one argument converts a single PDF, two arguments convert a
    single PDF to the given output file. Exits with status 1 on usage
    errors, when pypdf is missing, or when any conversion fails; returns
    normally on success.
    """
    # Checked first so a missing dependency is reported before anything else,
    # as it was when the check ran at import time.
    if PdfReader is None:
        _report_missing_pypdf()
        sys.exit(1)

    args = sys.argv[1:]

    if not args:
        print(__doc__)
        sys.exit(1)

    if args[0] == "--all":
        # Convert all PDFs in directory
        ok = convert_all_pdfs()
    elif len(args) == 1:
        # Convert single PDF
        ok = pdf_to_text(args[0])
    elif len(args) == 2:
        # Convert single PDF with custom output name
        ok = pdf_to_text(args[0], args[1])
    else:
        print("Error: Too many arguments")
        print(__doc__)
        sys.exit(1)

    # Propagate failure to the shell so scripts and CI can detect it.
    if not ok:
        sys.exit(1)


if __name__ == "__main__":
    main()