#!/home/ladmin/miniconda3/envs/nlp/bin/python
"""
PDF to Text Converter
Converts PDF files to plain text files.

Usage:
    python pdf_to_txt.py <input.pdf>                # Creates input.txt
    python pdf_to_txt.py <input.pdf> <output.txt>   # Custom output name
    python pdf_to_txt.py --all                      # Convert all PDFs in current directory

Requirements:
    pip install pypdf
"""

import sys
import os
from pathlib import Path

# pypdf is the only third-party dependency. If it is missing, print install
# instructions and exit with status 1 instead of showing a raw traceback.
try:
    from pypdf import PdfReader
except ImportError:
    print("Error: pypdf library not found.")
    print("Please install it with: pip install pypdf")
    sys.exit(1)


def pdf_to_text(pdf_path, output_path=None):
    """
    Convert a PDF file to a text file.

    Text is extracted page by page. Every page that yields text is written
    under a ``--- Page N ---`` header; pages with no extractable text are
    skipped. Progress and error messages are printed to stdout, and
    failures are reported through the return value rather than raised.

    Args:
        pdf_path: Path to the PDF file (str or Path).
        output_path: Path to the output text file (optional). When omitted,
            the output is ``pdf_path`` with its suffix replaced by ``.txt``.

    Returns:
        True if successful, False otherwise (missing input, output path
        identical to the input, or any error while reading/writing).
    """
    # Convert before entering the try block: the error handler below reads
    # pdf_path.name and therefore needs pdf_path to already be a Path.
    pdf_path = Path(pdf_path)

    try:
        if not pdf_path.exists():
            print(f"Error: File not found: {pdf_path}")
            return False

        # Determine output path
        if output_path is None:
            output_path = pdf_path.with_suffix('.txt')
        else:
            output_path = Path(output_path)

        # Never write the extracted text over the source PDF. This can
        # happen when the caller passes the same path twice, or when the
        # input file already carries a ".txt" suffix.
        if output_path.resolve() == pdf_path.resolve():
            print(f"Error: Output path is the same as the input file: {pdf_path}")
            return False

        print(f"Converting: {pdf_path.name}")

        # Read the PDF
        reader = PdfReader(str(pdf_path))

        # Extract text from all pages; page numbers are 1-based in the output.
        text_content = []
        for page_number, page in enumerate(reader.pages, 1):
            text = page.extract_text()
            if text:
                text_content.append(f"--- Page {page_number} ---\n{text}\n")

        # Write to text file
        full_text = "\n".join(text_content)
        output_path.write_text(full_text, encoding='utf-8')

        print(f"✓ Created: {output_path.name} ({len(reader.pages)} pages, {len(full_text):,} characters)")
        return True

    except Exception as e:
        # Deliberately broad: this is the per-file boundary for batch
        # conversion, so one unreadable PDF or I/O error must be reported
        # and skipped rather than abort the whole run.
        print(f"✗ Error processing {pdf_path.name}: {e}")
        return False


def convert_all_pdfs():
    """Convert all PDF files in the current directory to text files.

    Only regular files are considered (a directory named ``x.pdf`` is
    ignored), the ``.pdf`` extension is matched case-insensitively, and
    files are processed in sorted order so runs are reproducible. A summary
    line with the success/failure counts is printed at the end.

    Returns:
        The number of files that failed to convert; 0 when every file was
        converted or when there was nothing to convert.
    """
    current_dir = Path.cwd()
    pdf_files = sorted(
        entry
        for entry in current_dir.iterdir()
        if entry.is_file() and entry.name.lower().endswith(".pdf")
    )

    if not pdf_files:
        print("No PDF files found in the current directory.")
        return 0

    print(f"Found {len(pdf_files)} PDF file(s) to convert.\n")

    successful = 0
    failed = 0

    # pdf_to_text reports success as a boolean and prints its own messages.
    for pdf_file in pdf_files:
        if pdf_to_text(pdf_file):
            successful += 1
        else:
            failed += 1

    print(f"\n{'='*60}")
    print(f"Conversion complete: {successful} successful, {failed} failed")
    return failed


def main():
    """Command-line entry point.

    Supported invocations (arguments are read from ``sys.argv``):
        <input.pdf>               convert one file, output next to the input
        <input.pdf> <output.txt>  convert one file to the given output path
        --all                     convert every PDF in the current directory
        -h / --help               print the usage text

    Exits with status 1 when called without arguments, with too many
    arguments, or when a conversion reports failure; ``--help`` exits 0.
    """
    args = sys.argv[1:]

    if not args:
        print(__doc__)
        sys.exit(1)

    # Without this, "--help" would be treated as a PDF file name.
    if args[0] in ("-h", "--help"):
        print(__doc__)
        sys.exit(0)

    # Convert all PDFs in directory
    if args[0] == "--all":
        # A truthy return value is treated as "some conversions failed";
        # a None/0 result leaves the exit status at 0.
        if convert_all_pdfs():
            sys.exit(1)
        return

    if len(args) > 2:
        print("Error: Too many arguments")
        print(__doc__)
        sys.exit(1)

    # args is either [pdf_path] or [pdf_path, output_path]; pdf_to_text
    # returns False on failure, which must surface in the exit status so
    # shell scripts and CI jobs can detect it.
    if not pdf_to_text(*args):
        sys.exit(1)


# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()