diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..bf21dfc
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,285 @@
+# HiLiteHero - PDF Highlight Extractor Makefile
+# Description: Makefile for easy development, testing, and deployment
+
+# Variables
+PYTHON := python3
+PIP := pip3
+VENV := venv
+VENV_BIN := $(VENV)/bin
+VENV_PYTHON := $(VENV_BIN)/python
+VENV_PIP := $(VENV_BIN)/pip
+MAIN_SCRIPT := main.py
+TEST_PDF := test/test2.pdf
+REQUIREMENTS := requirements.txt
+
+# Colors for output
+# printf emits the real ESC byte once at parse time, so colored output does not
+# depend on whether the recipe shell's echo interprets backslash escapes.
+RED := $(shell printf '\033[0;31m')
+GREEN := $(shell printf '\033[0;32m')
+YELLOW := $(shell printf '\033[0;33m')
+BLUE := $(shell printf '\033[0;34m')
+PURPLE := $(shell printf '\033[0;35m')
+CYAN := $(shell printf '\033[0;36m')
+WHITE := $(shell printf '\033[0;37m')
+# No Color (reset); kept on its own line so NC carries no trailing space.
+NC := $(shell printf '\033[0m')
+
+# Helper function to get the right Python executable:
+# the venv interpreter when it exists, otherwise $(PYTHON).
+define get_python
+$(if $(wildcard $(VENV_PYTHON)),$(VENV_PYTHON),$(PYTHON))
+endef
+
+# Default target
+.PHONY: help
+help: ## Show this help message
+	@echo "$(CYAN)HiLiteHero - PDF Highlight Extractor$(NC)"
+	@echo "$(YELLOW)Available targets:$(NC)"
+	@awk 'BEGIN {FS = ":.*?## "} /^[a-zA-Z_-]+:.*?## / {printf " $(GREEN)%-15s$(NC) %s\n", $$1, $$2}' $(MAKEFILE_LIST)
+
+# Installation targets
+.PHONY: install
+install: venv-install ## Install dependencies (creates venv if needed)
+	@echo "$(GREEN)Dependencies installed successfully!$(NC)"
+
+.PHONY: install-system
+install-system: ## Install dependencies system-wide (may require --break-system-packages)
+	@echo "$(YELLOW)Warning: Installing system-wide packages$(NC)"
+	@echo "$(BLUE)Installing dependencies...$(NC)"
+	$(PIP) install -r $(REQUIREMENTS) --break-system-packages
+	@echo "$(GREEN)Dependencies installed successfully!$(NC)"
+
+.PHONY: install-dev
+install-dev: venv-install ## Install development dependencies in virtual environment
+	@echo "$(BLUE)Installing development dependencies...$(NC)"
+	$(VENV_PIP) install black flake8 pytest pytest-cov
+	@echo "$(GREEN)Development dependencies installed!$(NC)"
+
+.PHONY: venv
+venv: ## Create virtual environment
+	@echo "$(BLUE)Creating virtual environment...$(NC)"
+	$(PYTHON) -m venv $(VENV)
+	@echo "$(GREEN)Virtual environment created!$(NC)"
+	@echo "$(YELLOW)To activate: source $(VENV)/bin/activate$(NC)"
+
+.PHONY: venv-install
+venv-install: venv ## Create venv and install dependencies
+	@echo "$(BLUE)Installing dependencies in virtual environment...$(NC)"
+	$(VENV_PIP) install --upgrade pip
+	$(VENV_PIP) install -r $(REQUIREMENTS)
+	@echo "$(GREEN)Virtual environment setup complete!$(NC)"
+	@echo "$(YELLOW)To activate: source $(VENV)/bin/activate$(NC)"
+
+# Testing targets
+.PHONY: test
+test: ## Run test mode with default PDF
+	@echo "$(BLUE)Running test mode...$(NC)"
+	$(call get_python) $(MAIN_SCRIPT) --test
+	@echo "$(GREEN)Test completed!$(NC)"
+
+.PHONY: test-interactive
+test-interactive: ## Run test mode with interactive review
+	@echo "$(BLUE)Running test mode with interactive review...$(NC)"
+	$(call get_python) $(MAIN_SCRIPT) --test --interactive
+
+.PHONY: test-debug
+test-debug: ## Run test mode with debug output
+	@echo "$(BLUE)Running test mode with debug output...$(NC)"
+	$(call get_python) $(MAIN_SCRIPT) --test --debug
+
+.PHONY: test-silent
+test-silent: ## Run test mode silently (minimal output)
+	@echo "$(BLUE)Running test mode silently...$(NC)"
+	$(call get_python) $(MAIN_SCRIPT) --test --silent
+	@echo "$(GREEN)Silent test completed!$(NC)"
+
+.PHONY: test-custom
+test-custom: ## Run test with custom output file
+	@echo "$(BLUE)Running test with custom output...$(NC)"
+	$(call get_python) $(MAIN_SCRIPT) --test --output-json test_results.json
+	@echo "$(GREEN)Test results saved to test_results.json$(NC)"
+
+# Development targets
+.PHONY: dev
+dev: ## Run in development mode (interactive with debug)
+	@echo "$(BLUE)Starting development mode...$(NC)"
+	$(call get_python) $(MAIN_SCRIPT) --debug --interactive
+
+.PHONY: run
+run: ## Run the script interactively
+	@echo "$(BLUE)Starting interactive mode...$(NC)"
+	$(call get_python) $(MAIN_SCRIPT)
+
+.PHONY: run-file
+run-file: ## Run with a specific PDF file (usage: make run-file FILE=path/to/file.pdf)
+	@if [ -z "$(FILE)" ]; then \
+		echo "$(RED)Error: Please specify FILE=path/to/file.pdf$(NC)"; \
+		echo "$(YELLOW)Example: make run-file FILE=document.pdf$(NC)"; \
+		exit 1; \
+	fi
+	@echo "$(BLUE)Processing $(FILE)...$(NC)"
+	$(call get_python) $(MAIN_SCRIPT) "$(FILE)"
+
+.PHONY: run-pages
+run-pages: ## Run with specific pages (usage: make run-pages FILE=doc.pdf PAGES="1,3-5")
+	@if [ -z "$(FILE)" ] || [ -z "$(PAGES)" ]; then \
+		echo "$(RED)Error: Please specify FILE and PAGES$(NC)"; \
+		echo "$(YELLOW)Example: make run-pages FILE=document.pdf PAGES=\"1,3-5\"$(NC)"; \
+		exit 1; \
+	fi
+	@echo "$(BLUE)Processing pages $(PAGES) of $(FILE)...$(NC)"
+	$(call get_python) $(MAIN_SCRIPT) "$(FILE)" --pages "$(PAGES)"
+
+# Code quality targets
+.PHONY: format
+format: ## Format code with black
+	@echo "$(BLUE)Formatting code with black...$(NC)"
+	$(call get_python) -m black $(MAIN_SCRIPT)
+	@echo "$(GREEN)Code formatted!$(NC)"
+
+.PHONY: lint
+lint: ## Lint code with flake8
+	@echo "$(BLUE)Linting code with flake8...$(NC)"
+	$(call get_python) -m flake8 $(MAIN_SCRIPT) --max-line-length=120 --ignore=E203,W503
+	@echo "$(GREEN)Linting completed!$(NC)"
+
+.PHONY: check
+check: lint ## Run all code quality checks
+	@echo "$(GREEN)All checks passed!$(NC)"
+
+# Utility targets
+.PHONY: clean
+clean: ## Clean up generated files
+	@echo "$(BLUE)Cleaning up generated files...$(NC)"
+	rm -f *.json
+	rm -f test_results.json
+	rm -rf pdf_page_images/
+	rm -rf __pycache__/
+	rm -rf .pytest_cache/
+	rm -rf *.pyc
+	@echo "$(GREEN)Cleanup completed!$(NC)"
+
+.PHONY: clean-venv
+clean-venv: ## Remove virtual environment
+	@echo "$(BLUE)Removing virtual environment...$(NC)"
+	rm -rf $(VENV)
+	@echo "$(GREEN)Virtual environment removed!$(NC)"
+
+.PHONY: clean-all
+clean-all: clean clean-venv ## Clean everything including virtual environment
+	@echo "$(GREEN)Complete cleanup finished!$(NC)"
+
+# Report tool versions and whether the project looks set up. The dependency
+# check asks the same interpreter the run targets use (venv first, else system).
+.PHONY: status
+status: ## Show project status
+	@echo "$(CYAN)=== HiLiteHero Project Status ===$(NC)"
+	@echo "$(YELLOW)Python version:$(NC) $$($(PYTHON) --version 2>/dev/null || echo 'Not found')"
+	@echo "$(YELLOW)Pip version:$(NC) $$($(PIP) --version 2>/dev/null || echo 'Not found')"
+	@echo "$(YELLOW)Virtual environment:$(NC) $$(if [ -d "$(VENV)" ]; then echo 'Exists'; else echo 'Not created'; fi)"
+	@echo "$(YELLOW)Dependencies installed:$(NC) $$($(call get_python) -m pip list 2>/dev/null | grep -qi PyMuPDF && echo 'Yes' || echo 'No')"
+	@echo "$(YELLOW)Test PDF exists:$(NC) $$(if [ -f "$(TEST_PDF)" ]; then echo 'Yes'; else echo 'No'; fi)"
+	@echo "$(YELLOW)Generated files:$(NC) $$(ls -1 *.json 2>/dev/null | wc -l) JSON files"
+
+# Documentation targets
+.PHONY: docs
+docs: ## Show documentation
+	@echo "$(CYAN)=== HiLiteHero Documentation ===$(NC)"
+	@echo "$(YELLOW)Main script:$(NC) $(MAIN_SCRIPT)"
+	@echo "$(YELLOW)Test PDF:$(NC) $(TEST_PDF)"
+	@echo "$(YELLOW)Requirements:$(NC) $(REQUIREMENTS)"
+	@echo ""
+	@echo "$(YELLOW)Quick start:$(NC)"
+	@echo " make test # Run test mode"
+	@echo " make run # Interactive mode"
+	@echo " make dev # Development mode"
+	@echo ""
+	@echo "$(YELLOW)For more help:$(NC) make help"
+
+# Batch processing targets
+.PHONY: batch
+batch: ## Run in batch mode (silent with auto-save)
+	@echo "$(BLUE)Running in batch mode...$(NC)"
+	$(call get_python) $(MAIN_SCRIPT) --silent --output-json batch_results_$(shell date +%Y%m%d_%H%M%S).json
+	@echo "$(GREEN)Batch processing completed!$(NC)"
+
+.PHONY: batch-file
+batch-file: ## Batch process specific file (usage: make batch-file FILE=doc.pdf)
+	@if [ -z "$(FILE)" ]; then \
+		echo "$(RED)Error: Please specify FILE=path/to/file.pdf$(NC)"; \
+		exit 1; \
+	fi
+	@echo "$(BLUE)Batch processing $(FILE)...$(NC)"
+	$(call get_python) $(MAIN_SCRIPT) "$(FILE)" --silent --output-json "$(shell basename "$(FILE)" .pdf)_batch_$(shell date +%Y%m%d_%H%M%S).json"
+	@echo "$(GREEN)Batch processing completed!$(NC)"
+
+# Every PDF is attempted even when an earlier one fails; failures are counted
+# and reported at the end with a non-zero exit status instead of a success message.
+.PHONY: batch-all
+batch-all: ## Process all PDFs in test folder
+	@echo "$(BLUE)Processing all PDFs in test folder...$(NC)"
+	@if [ ! -d "test" ]; then \
+		echo "$(RED)Error: test folder not found$(NC)"; \
+		exit 1; \
+	fi
+	@pdf_count=0; fail_count=0; \
+	for pdf in test/*.pdf; do \
+		if [ -f "$$pdf" ]; then \
+			pdf_count=$$((pdf_count + 1)); \
+			echo "$(CYAN)Processing $$pdf...$(NC)"; \
+			$(call get_python) $(MAIN_SCRIPT) "$$pdf" --silent --output-json "$$(basename "$$pdf" .pdf)_batch_$(shell date +%Y%m%d_%H%M%S).json" \
+				|| fail_count=$$((fail_count + 1)); \
+		fi; \
+	done; \
+	if [ $$pdf_count -eq 0 ]; then \
+		echo "$(YELLOW)No PDF files found in test folder$(NC)"; \
+	elif [ $$fail_count -gt 0 ]; then \
+		echo "$(RED)$$fail_count of $$pdf_count PDF file(s) failed$(NC)"; \
+		exit 1; \
+	else \
+		echo "$(GREEN)Processed $$pdf_count PDF file(s) successfully!$(NC)"; \
+	fi
+
+# Installation verification
+.PHONY: verify
+verify: ## Verify installation
+	@echo "$(BLUE)Verifying installation...$(NC)"
+	@if [ -f $(VENV_PYTHON) ]; then \
+		echo "$(CYAN)Checking virtual environment...$(NC)"; \
+		$(VENV_PYTHON) -c "import fitz, colorama; print('$(GREEN)Virtual env dependencies OK$(NC)')" || (echo "$(RED)Virtual env dependencies missing$(NC)" && exit 1); \
+	else \
+		echo "$(YELLOW)Checking system Python...$(NC)"; \
+		$(PYTHON) -c "import fitz, colorama; print('$(GREEN)System dependencies OK$(NC)')" || (echo "$(RED)System dependencies missing$(NC)" && exit 1); \
+	fi
+	@if [ -f $(MAIN_SCRIPT) ]; then echo "$(GREEN)Main script found$(NC)"; else echo "$(RED)Main script missing$(NC)" && exit 1; fi
+	@echo "$(GREEN)Installation verified!$(NC)"
+
+# Quick development workflow
+.PHONY: quick-dev
+quick-dev: clean test ## Quick development workflow (clean + test)
+	@echo "$(GREEN)Quick development cycle completed!$(NC)"
+
+# Show available PDF files
+# The listing is captured first: head exits 0 on empty input, so `|| echo` never ran.
+.PHONY: list-pdfs
+list-pdfs: ## List available PDF files in project
+	@echo "$(CYAN)Available PDF files:$(NC)"
+	@pdfs=$$(find . -name "*.pdf" -type f 2>/dev/null | head -10); \
+	if [ -n "$$pdfs" ]; then \
+		printf '%s\n' "$$pdfs"; \
+	else \
+		echo "$(YELLOW)No PDF files found$(NC)"; \
+	fi
+
+# Show recent JSON outputs
+# Same capture-then-test shape as list-pdfs, so the fallback message is reachable.
+.PHONY: list-outputs
+list-outputs: ## List recent JSON output files
+	@echo "$(CYAN)Recent JSON outputs:$(NC)"
+	@outputs=$$(ls -lt *.json 2>/dev/null | head -5); \
+	if [ -n "$$outputs" ]; then \
+		printf '%s\n' "$$outputs"; \
+	else \
+		echo "$(YELLOW)No JSON output files found$(NC)"; \
+	fi
diff --git a/README.md b/README.md index cff2ea6..18e1f46 100644 --- a/README.md +++ b/README.md @@ -15,12 +15,124 @@ A Python tool for extracting highlighted text from PDF files with precise text o ## Installation -Clone the repository: -git clone -cd pdf-highlight-extractor +### Prerequisites +- Python 3.7 or higher +- pip package manager -Install required packages: -pip install PyMuPDF pdfplumber colorama pandas +### Quick Installation + +1. **Clone the repository:** + ```bash + git clone + cd HiLiteHero + ``` + +2. **Install dependencies:** + ```bash + pip install -r requirements.txt + ``` + + Or install manually: + ```bash + pip install PyMuPDF colorama + ``` + +### Alternative Installation Methods + +**Using virtual environment (recommended):** +```bash +python -m venv venv +source venv/bin/activate # On Windows: venv\Scripts\activate +pip install -r requirements.txt +``` + +**Using conda:** +```bash +conda create -n hilitehero python=3.9 +conda activate hilitehero +pip install -r requirements.txt +``` + +### Verify Installation +```bash +python main.py --test +``` +This should process the default test file and create a JSON output file. 
+ +## Quick Start with Makefile + +The project includes a comprehensive Makefile for easy development and testing: + +### Essential Commands + +```bash +# Show all available commands +make help + +# Quick test (recommended first run) +make test + +# Interactive mode +make run + +# Development mode (debug + interactive) +make dev + +# Install dependencies +make install + +# Clean up generated files +make clean +``` + +### Common Workflows + +**First-time setup:** +```bash +make install # Install dependencies +make test # Verify everything works +``` + +**Development workflow:** +```bash +make dev # Start development mode +make clean # Clean up when done +``` + +**Batch processing:** +```bash +make batch # Process default file silently +make batch-file FILE=document.pdf # Process specific file +``` + +**Code quality:** +```bash +make format # Format code +make lint # Check code quality +make check # Run all checks +``` + +### Advanced Makefile Usage + +**Process specific pages:** +```bash +make run-pages FILE=document.pdf PAGES="1,3-5" +``` + +**Test different modes:** +```bash +make test-interactive # Test with interactive review +make test-debug # Test with debug output +make test-silent # Test silently +``` + +**Project management:** +```bash +make status # Show project status +make docs # Show documentation +make list-pdfs # List available PDF files +make list-outputs # Show recent outputs +``` ## Dependencies @@ -32,19 +144,76 @@ pip install PyMuPDF pdfplumber colorama pandas ## Usage -### Quick Test Mode -python highlight_extractor.py --test +### Quick Start -Uses default file: `/mnt/c/Users/admin/Downloads/test2.pdf` and displays results only. +**Test Mode (Recommended for first-time users):** +```bash +python main.py --test +``` +Uses default test file and automatically saves results to JSON. 
-### Interactive Mode -python highlight_extractor.py +**Interactive Mode:** +```bash +python main.py +``` +Prompts for PDF file path and provides interactive review options. -Prompts for PDF file path and output options. +**Process Specific PDF:** +```bash +python main.py path/to/your/document.pdf +``` -### Command Line Flags -- `--test`, `-t`, or `test` - Enable test mode with defaults -- No flags - Full interactive mode +### Command Line Options + +| Flag | Description | Example | +|------|-------------|---------| +| `--test`, `-t` | Test mode with default settings | `python main.py -t` | +| `--interactive`, `-i` | Enable interactive review mode | `python main.py -i document.pdf` | +| `--pages`, `-p` | Process specific pages | `python main.py -p "1,3-5" doc.pdf` | +| `--silent`, `-s` | Minimal output, auto-save JSON | `python main.py -s` | +| `--debug`, `-d` | Enable detailed debug output | `python main.py -d document.pdf` | +| `--output-json` | Custom JSON output path | `python main.py --output-json results.json` | + +### Usage Examples + +**Basic extraction:** +```bash +python main.py document.pdf +``` + +**Process specific pages with interactive review:** +```bash +python main.py document.pdf -p "1,5-7" -i +``` + +**Silent mode for batch processing:** +```bash +python main.py document.pdf -s --output-json batch_results.json +``` + +**Debug mode for troubleshooting:** +```bash +python main.py document.pdf -d +``` + +**Test with custom output:** +```bash +python main.py -t --output-json test_results.json +``` + +### Interactive Review Mode + +When using `-i` flag, you can: +- **[N]ext** - Move to next highlight +- **[P]rev** - Move to previous highlight +- **[U]p** - Move highlight up in order +- **[M]ove Down** - Move highlight down in order +- **[C]olor** - Change highlight color classification +- **[E]dit** - Edit highlight text +- **[D]elete** - Remove highlight +- **[O]pen Img** - View page image +- **[S]ave&Exit** - Save changes and exit +- **[Q]uit** - 
Quit without saving ## Output Formats @@ -97,9 +266,34 @@ Tabular format with columns: page, text, color, type, category **Over-extraction**: The tool is designed to avoid this, but very close text might be included. Check highlight precision in your PDF. +**Installation Issues**: +- Ensure Python 3.7+ is installed +- Try using virtual environment: `make venv-install` +- Check dependencies: `make verify` + +**Permission Errors**: +- On Linux/Mac: Ensure PDF files are readable +- On Windows: Run as administrator if needed + ### Debug Output Run with detailed logging to see extraction decisions: -python highlight_extractor.py --test +```bash +python main.py --test --debug +# or +make test-debug +``` + +### Getting Help +```bash +# Show all available commands +make help + +# Check project status +make status + +# Verify installation +make verify +``` ## Contributing diff --git a/main.py b/main.py index 055401d..f9b5f47 100644 --- a/main.py +++ b/main.py @@ -1,753 +1,1427 @@ -#!/usr/bin/env python3 -# ============================================================================= -# ENHANCED PDF HIGHLIGHT EXTRACTOR -# Author: Perplexity AI Companion (Updated by User Feedback) -# Date: June 3, 2025 -# License: MIT -# -# Extracts highlights from PDF files, with options for interactive review, -# detailed output, text cleaning, JSON export, and page image viewing. 
-# ============================================================================= - -import time -import os -import fitz # PyMuPDF -import json -from colorama import init, Fore, Back, Style -from pathlib import Path -import re -import string -import sys -import traceback -import argparse -import difflib # For text difference calculation -import tempfile # For temporary image files -import webbrowser # For opening images/PDFs -import uuid # For unique filenames - -# Attempt to import readline for better input() experience on some systems -try: - import readline - READLINE_AVAILABLE = True -except ImportError: - READLINE_AVAILABLE = False # readline not available - -# ============================================================================= -# GLOBAL CONFIGURATION FLAGS (Defaults, can be overridden by CLI args) -# ============================================================================= -DEFAULT_PDF_PATH = "/mnt/c/Users/admin/Downloads/test2.pdf" # Example, adjust if needed -DEFAULT_PAGES_TO_PROCESS = "3" # Example: "1,3-5,all" - -# Default Behavior flags (can be influenced by -d or -s CLI flags) -# These are used to initialize effective_run_args -# Keep these distinct from the effective_run_args object itself -INITIAL_SHOW_TIMING = True -INITIAL_SHOW_PROGRESS = True -INITIAL_SHOW_RAW_SEGMENTS = True -INITIAL_SHOW_EXTRACTION_DETAILS = True -INITIAL_SHOW_RECT_DETAILS = True -INITIAL_SHOW_DIFF_PERCENTAGE = True -INITIAL_CLEAN_EDGES = True - -# Text extraction parameters (generally fixed) -TEXT_EXTRACTION_HORIZONTAL_PADDING = 6.0 -TEXT_EXTRACTION_VERTICAL_PADDING = 1.0 - -# Edge cleaning configuration (generally fixed) -VALID_TWO_LETTER_WORDS = { - 'am', 'an', 'as', 'at', 'be', 'by', 'do', 'go', 'he', 'if', 'in', 'is', 'it', 'me', 'my', - 'no', 'of', 'on', 'or', 'ox', 'so', 'to', 'up', 'us', 'we'} -VALID_SINGLE_LETTERS = {'i', 'a'} - -# Image handling configuration -IMAGE_FOLDER_PATH = 'pdf_page_images' # Relative to CWD by default -CLEAR_IMAGE_FOLDER_ON_START = 
True -CLEAR_IMAGE_FOLDER_ON_END = False - -# Initialize colorama -init(autoreset=True) - -# --- Helper Functions --- -def get_text_diff_ratio(text1, text2): - if not text1 and not text2: return 1.0 - if not text1 or not text2: return 0.0 - return difflib.SequenceMatcher(None, str(text1), str(text2)).ratio() - -def clean_segment_edges_func(text_to_clean, clean_edges_setting): - if not clean_edges_setting or not text_to_clean: return text_to_clean - text_to_clean = re.sub(r'\s+', ' ', text_to_clean.strip()) - words = text_to_clean.split() - if not words: return text_to_clean - - current_idx = 0 - while current_idx < len(words): - token = words[current_idx] - core_token = token.rstrip(string.punctuation) - trailing_punctuation = token[len(core_token):] - if not core_token: words.pop(current_idx); continue - core_should_be_removed = (len(core_token) == 1 and core_token.isalpha() and core_token.lower() not in VALID_SINGLE_LETTERS) or \ - (len(core_token) == 2 and core_token.isalpha() and core_token.lower() not in VALID_TWO_LETTER_WORDS) - if core_should_be_removed: - if trailing_punctuation: words[current_idx] = trailing_punctuation - else: words.pop(current_idx) - continue - break - while words: - token = words[-1] - core_token = token.lstrip(string.punctuation) - leading_punctuation = token[:-len(core_token)] if core_token else "" - if not core_token: words.pop(); continue - core_should_be_removed = (len(core_token) == 1 and core_token.isalpha() and core_token.lower() not in VALID_SINGLE_LETTERS) or \ - (len(core_token) == 2 and core_token.isalpha() and core_token.lower() not in VALID_TWO_LETTER_WORDS) - if core_should_be_removed: - if leading_punctuation: words[-1] = leading_punctuation - else: words.pop() - continue - break - return ' '.join(words) - -def input_with_prefill(prompt, text): - if READLINE_AVAILABLE: - def hook(): - readline.insert_text(text) - readline.redisplay() - readline.set_pre_input_hook(hook) - result = input(prompt) - 
readline.set_pre_input_hook() - return result - else: - print(Fore.MAGENTA + "Current text (edit below):\n" + Style.RESET_ALL + f"{text}") - return input(prompt) - -def _clear_png_files_in_folder(folder_path_str, run_args_for_print_control): - # This function CLEARS files if folder exists. It DOES NOT CREATE the folder. - if not folder_path_str: return - - folder = Path(folder_path_str) # Path relative to CWD if not absolute - abs_folder_path = folder.resolve() - - if run_args_for_print_control.debug: - print(Fore.CYAN + f" [Debug] _clear_png_files_in_folder: Checking {abs_folder_path} (Specified as: '{folder_path_str}')") - - if abs_folder_path.is_dir(): - if run_args_for_print_control.show_progress: - print(Fore.BLUE + f"Clearing *.png files from {abs_folder_path}...") - cleared_count = 0 - try: - for file_path in abs_folder_path.glob("*.png"): - if file_path.is_file(): - file_path.unlink() - cleared_count +=1 - except Exception as e: - if run_args_for_print_control.show_progress: # Also show error if progress is on - print(Fore.RED + f"Error during file deletion in {abs_folder_path}: {e}") - - if run_args_for_print_control.show_progress: - if cleared_count > 0: - print(Fore.BLUE + f"Cleared {cleared_count} *.png files from {abs_folder_path}.") - else: - print(Fore.BLUE + f"No *.png files found to clear in {abs_folder_path}.") - else: - if run_args_for_print_control.show_progress: - print(Fore.YELLOW + f"Image folder {abs_folder_path} not found, skipping clear.") - elif run_args_for_print_control.debug: # Still log if not found in debug, even if not show_progress - print(Fore.CYAN + f" [Debug] _clear_png_files_in_folder: Folder {abs_folder_path} does not exist. 
Nothing to clear.") - - -class EnhancedPDFHighlightExtractor: - def __init__(self, pdf_path, effective_run_args, main_doc_for_image_view=None): - self.pdf_path = Path(pdf_path) - self.run_args = effective_run_args - self.pdf_filename_stem = self.pdf_path.stem - self.highlights_data = [] - self.main_doc_for_image_view = main_doc_for_image_view - - def _get_highlight_color_from_rgb_tuple(self, rgb_tuple_floats_or_ints): - if not rgb_tuple_floats_or_ints or len(rgb_tuple_floats_or_ints) < 3 : return 'unknown_color' - r, g, b = [int(x * 255) if isinstance(x, float) and 0.0 <= x <= 1.0 else int(x) for x in rgb_tuple_floats_or_ints[:3]] - if r == 142 and g == 221 and b == 249: return 'blue' - if r > 200 and g > 200 and b < 150: return 'yellow' - if r < 150 and g > 180 and b < 150: return 'green' - if r < 150 and g < 180 and b > 180: return 'blue' - if r > 180 and g < 180 and b > 180: return 'pink' - return 'other_color' - - def _get_highlight_color_from_annot_colors_dict(self, colors_dict): - if not colors_dict: return 'unknown_color', None - rgb_tuple = colors_dict.get('stroke') or colors_dict.get('fill') - if not rgb_tuple: return 'unknown_color', None - return self._get_highlight_color_from_rgb_tuple(rgb_tuple), rgb_tuple[:3] - - def _extract_text_from_multi_segment_highlight(self, page, annot, page_num, hl_id): - overall_highlight_color_name, _ = self._get_highlight_color_from_annot_colors_dict(annot.colors) - color_code_for_segment_print = self._get_color_display_codes(overall_highlight_color_name) - quads_vertices = annot.vertices - if not quads_vertices: - if self.run_args.show_extraction_details: print(Fore.YELLOW + f" No quads for HL {hl_id} on page {page_num}") - return None, 0, [] - - processed_quads_as_points_list = [] - if len(quads_vertices) % 4 == 0: - for i in range(0, len(quads_vertices), 4): - try: - quad_points = [fitz.Point(p) for p in quads_vertices[i:i+4]] - processed_quads_as_points_list.append(quad_points) - except Exception as e: - if 
self.run_args.show_extraction_details: print(Fore.YELLOW + f" Skipping malformed quad points: {e}") - continue - - try: - sorted_quad_points_list = sorted(processed_quads_as_points_list, key=lambda qp_list: (fitz.Quad(qp_list).rect.y0, fitz.Quad(qp_list).rect.x0)) - except Exception as e: - if self.run_args.show_extraction_details: print(Fore.RED + f" Error sorting quads for HL {hl_id}: {e}. Using original order.") - sorted_quad_points_list = processed_quads_as_points_list - - if self.run_args.show_extraction_details: - print(color_code_for_segment_print + Fore.CYAN + f" Processing {len(sorted_quad_points_list)} segments for HL {hl_id} (Color: {overall_highlight_color_name.upper()}) on page {page_num}" + Style.RESET_ALL) - - segment_texts_final = [] - raw_segment_texts_for_diff = [] - for seg_idx, quad_points in enumerate(sorted_quad_points_list): - try: - bounds = fitz.Quad(quad_points).rect - padded_rect = fitz.Rect(bounds.x0 - TEXT_EXTRACTION_HORIZONTAL_PADDING, bounds.y0 - TEXT_EXTRACTION_VERTICAL_PADDING, - bounds.x1 + TEXT_EXTRACTION_HORIZONTAL_PADDING, bounds.y1 + TEXT_EXTRACTION_VERTICAL_PADDING) - padded_rect.intersect(page.rect) - if padded_rect.is_empty: - if self.run_args.show_extraction_details: print(Fore.YELLOW + f" Segment {seg_idx+1} empty padded_rect for HL {hl_id}") - continue - raw_text_from_pdf_segment = page.get_text("text", clip=padded_rect, sort=True).strip() - raw_segment_texts_for_diff.append(raw_text_from_pdf_segment) - cleaned_text_segment = re.sub(r'\s+', ' ', raw_text_from_pdf_segment).strip() - cleaned_text_segment = re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F-\x9F]', '', cleaned_text_segment) - final_text_segment = clean_segment_edges_func(cleaned_text_segment, self.run_args.clean_edges) - - if final_text_segment: - segment_texts_final.append(final_text_segment) - if self.run_args.show_raw_segments and not self.run_args.interactive: - print(color_code_for_segment_print + Fore.LIGHTBLUE_EX + f" Segment {seg_idx+1} (P{page_num}, 
HL{hl_id}, Color: {overall_highlight_color_name.upper()}):" + Style.RESET_ALL) - if self.run_args.show_diff_percentage: - similarity = get_text_diff_ratio(raw_text_from_pdf_segment, final_text_segment) - diff_percent = (1 - similarity) * 100 - print(Fore.LIGHTMAGENTA_EX + f" Raw PDF : \"{raw_text_from_pdf_segment}\"") - print(Fore.LIGHTBLUE_EX + f" Final Seg: \"{final_text_segment}\"") - print(Fore.YELLOW + f" Diff: {diff_percent:.2f}%") - else: print(Fore.LIGHTBLUE_EX + f" Final Seg: \"{final_text_segment}\"") - except Exception as e: - if self.run_args.show_extraction_details: print(Fore.RED + f" Error processing segment {seg_idx+1} for HL {hl_id}: {e}") - raw_segment_texts_for_diff.append("") - continue - - if not segment_texts_final: return None, len(sorted_quad_points_list), raw_segment_texts_for_diff - combined_text = segment_texts_final[0] - for i in range(1, len(segment_texts_final)): - prev_text = combined_text; current_text = segment_texts_final[i] - if prev_text.endswith('-') or prev_text.endswith('¬'): combined_text = prev_text.rstrip('-¬') + current_text - else: combined_text += ' ' + current_text - - if self.run_args.clean_edges: combined_text = clean_segment_edges_func(combined_text, self.run_args.clean_edges) - combined_text = re.sub(r'\s+', ' ', combined_text).strip() - return combined_text if combined_text else None, len(sorted_quad_points_list), raw_segment_texts_for_diff - - def extract_highlights(self, doc): - all_extracted_highlights = [] - try: - if self.run_args.show_progress and not self.run_args.interactive: - print(Fore.BLUE + f"\nšŸŽØ Processing highlights for PDF: {self.pdf_path.name}") - - pages_str_to_parse = self.run_args.pages if self.run_args.pages else DEFAULT_PAGES_TO_PROCESS - pages_to_process = self._parse_specific_pages(pages_str_to_parse, doc.page_count) - if not pages_to_process: - if self.run_args.show_progress: print(Fore.YELLOW + "No valid pages selected.") - return [] - - highlight_id_counter_on_page = {} - for page_num 
in pages_to_process: - page = doc.load_page(page_num - 1) - highlight_id_counter_on_page.setdefault(page_num, 0) - if self.run_args.show_progress and not self.run_args.interactive: - print(Fore.CYAN + f" šŸ“„ Processing Page {page_num}...") - try: page_annotations = list(page.annots()) - except Exception as e: - if self.run_args.show_progress: print(Fore.RED + f" āš ļø Error loading annots: {e}") - continue - - highlight_annotations = [a for a in page_annotations if hasattr(a, 'type') and a.type[0] == fitz.PDF_ANNOT_HIGHLIGHT and hasattr(a, 'rect') and a.rect] - if not highlight_annotations: - if self.run_args.show_progress and not self.run_args.interactive: print(Fore.WHITE + f" No highlights on page {page_num}.") - continue - - if self.run_args.show_rect_details: - print(Fore.YELLOW + f"--- Annotations before sorting (Page {page_num}) ---") - temp_debug_list = [] - for annot_debug in highlight_annotations: - debug_text_snippet = page.get_text("text", clip=annot_debug.rect).strip().replace("\n", " ") - color_name_debug, rgb_values_debug = self._get_highlight_color_from_annot_colors_dict(annot_debug.colors) - rgb_display = f"RGB: {tuple(int(c*255) if isinstance(c,float) else int(c) for c in rgb_values_debug[:3])}" if rgb_values_debug else "RGB: N/A" - temp_debug_list.append({ - "rect": annot_debug.rect, "text_snippet": debug_text_snippet, "color_name": color_name_debug, - "rgb_display": rgb_display, "vertices_count": len(annot_debug.vertices) if annot_debug.vertices else 0 }) - temp_debug_list.sort(key=lambda item: (item["rect"].y0, item["rect"].x0)) - for item_idx, item_val in enumerate(temp_debug_list): - print(f" {item_idx+1}. 
Rect: {item_val['rect']}, Vertices: {item_val['vertices_count']}, Color: {item_val['color_name'].upper()} ({item_val['rgb_display']}), Text: \"{item_val['text_snippet']}\"") - print(Fore.YELLOW + "----------------------------------------------------") - - highlight_annotations.sort(key=lambda a: (a.rect.y0, a.rect.x0)) - for annot in highlight_annotations: - try: - highlight_id_counter_on_page[page_num] += 1; current_hl_id_on_page = highlight_id_counter_on_page[page_num] - color_name, raw_rgb_floats = self._get_highlight_color_from_annot_colors_dict(annot.colors) - extracted_text, num_segments, _ = self._extract_text_from_multi_segment_highlight(page, annot, page_num, current_hl_id_on_page) - if extracted_text and extracted_text.strip(): - if self.run_args.show_extraction_details and not self.run_args.interactive: - print(Fore.GREEN + f" āœ… Final (P{page_num}, HL{current_hl_id_on_page}): \"{extracted_text[:100]}\"") - all_extracted_highlights.append({ - 'page': page_num, 'highlight_id_on_page': current_hl_id_on_page, 'text': extracted_text, - 'color': color_name, 'raw_rgb_values': raw_rgb_floats, 'type': 'highlight', - 'y_position': annot.rect.y0, 'x_position': annot.rect.x0, - 'rect_details': (annot.rect.x0, annot.rect.y0, annot.rect.x1, annot.rect.y1), - 'num_segments': num_segments }) - elif self.run_args.show_progress and not self.run_args.interactive: - print(Fore.YELLOW + f" āš ļø No text for HL {current_hl_id_on_page} on page {page_num}") - except Exception as e: - if self.run_args.show_progress and not self.run_args.interactive: - print(Fore.RED + f" šŸ”“ Error processing annot on page {page_num}: {e}") - if self.run_args.debug: traceback.print_exc() - continue - - if self.run_args.interactive: - print(Fore.MAGENTA + "\nEntering interactive review session...") - self.highlights_data = self._interactive_review_session(all_extracted_highlights) - else: self.highlights_data = all_extracted_highlights - - if self.run_args.show_progress and not 
self.run_args.interactive and not self.run_args.silent: - print(Fore.MAGENTA + f" šŸ“Š Total highlights extracted: {len(self.highlights_data)}") - return self.highlights_data - except Exception as e: - print(Fore.RED + f"āŒ Major error during highlight extraction: {e}") - if self.run_args.debug: traceback.print_exc() - return [] - - def _view_page_image_interactively(self, page_num_to_view): - if not self.main_doc_for_image_view: - print(Fore.RED + "Error: PDF document not available for image rendering. This should not happen.") - return - - tmp_image_path_obj = None - image_created_in_managed_folder = False - image_successfully_saved = False - - if self.run_args.show_progress: - print(Fore.BLUE + f"Preparing to view image for page {page_num_to_view}...") - - try: - page_index = page_num_to_view - 1 - page = self.main_doc_for_image_view.load_page(page_index) - if self.run_args.debug: - print(Fore.CYAN + f" [Debug] Loaded page object for index {page_index}: {page}") - - pix = page.get_pixmap(dpi=150) - if self.run_args.debug: - print(Fore.CYAN + f" [Debug] Created pixmap: {pix}. 
Alpha: {pix.alpha}, Colorspace: {pix.colorspace.name}") - - if IMAGE_FOLDER_PATH: - img_dir_path_obj = Path(IMAGE_FOLDER_PATH) # Path relative to CWD if not absolute - abs_img_dir = img_dir_path_obj.resolve() - - if self.run_args.debug: - print(Fore.CYAN + f" [Debug] Using IMAGE_FOLDER_PATH: '{IMAGE_FOLDER_PATH}' (Absolute: {abs_img_dir})") - - try: - abs_img_dir.mkdir(parents=True, exist_ok=True) - if self.run_args.debug: - print(Fore.CYAN + f" [Debug] Ensured image directory exists: {abs_img_dir} (Status: {abs_img_dir.is_dir()})") - except Exception as e_mkdir: - print(Fore.RED + f" ERROR: Could not create directory {abs_img_dir}: {e_mkdir}") - if self.run_args.debug: traceback.print_exc() - # Do not proceed if directory creation fails - input(Fore.CYAN + "Press Enter to acknowledge and continue...") - return - - - unique_id = uuid.uuid4().hex[:8] - tmp_image_path_obj = abs_img_dir / f"page_{page_num_to_view}_{unique_id}.png" - image_created_in_managed_folder = True - else: - fd, temp_path_str = tempfile.mkstemp(suffix=".png", prefix="pdf_page_img_") - os.close(fd) - tmp_image_path_obj = Path(temp_path_str) - if self.run_args.debug: - print(Fore.CYAN + f" [Debug] Using system temporary file: {tmp_image_path_obj.resolve()}") - - resolved_save_path = tmp_image_path_obj.resolve() - if self.run_args.debug: - print(Fore.CYAN + f" [Debug] Attempting to save image to: {resolved_save_path}") - - pix.save(str(resolved_save_path)) - - if resolved_save_path.exists() and resolved_save_path.is_file(): - image_successfully_saved = True - if self.run_args.show_progress: # Print for normal progress too, not just debug - print(Fore.GREEN + f" Image for page {page_num_to_view} successfully saved to: {resolved_save_path}") - if self.run_args.debug: - print(Fore.CYAN + f" [Debug] File size: {resolved_save_path.stat().st_size} bytes") - else: - if self.run_args.show_progress: - print(Fore.RED + f" ERROR: Failed to save image to {resolved_save_path}. 
File does not exist after save attempt.") - - except Exception as e_render_save: - if self.run_args.show_progress: - print(Fore.RED + f" Error during image rendering or saving: {e_render_save}") - if self.run_args.debug: - traceback.print_exc() - - if image_successfully_saved and tmp_image_path_obj: - if self.run_args.show_progress: - print(Fore.CYAN + f"Attempting to open image with default application...") - try: - file_uri = tmp_image_path_obj.resolve().as_uri() - if self.run_args.debug: - print(Fore.CYAN + f" [Debug] Opening URI: {file_uri}") - - opened_successfully = webbrowser.open(file_uri) - - if self.run_args.debug: # More detailed feedback in debug mode - print(Fore.CYAN + f" [Debug] webbrowser.open() returned: {opened_successfully}") - - if not opened_successfully: - if self.run_args.show_progress: - print(Fore.YELLOW + " webbrowser.open() reported failure (returned False or None).") - print(Fore.YELLOW + f" This often means no default application is configured for PNG files or your browser.") - elif self.run_args.show_progress: - print(Fore.GREEN + " Image hopefully opened. 
Check your applications.") - - if self.run_args.show_progress: - print(Fore.YELLOW + f" If the image did not open, please manually open: {tmp_image_path_obj.resolve()}") - input(Fore.CYAN + "Press Enter after viewing image to continue...") - - except Exception as e_open: - if self.run_args.show_progress: - print(Fore.RED + f" Could not open image using webbrowser: {e_open}") - print(Fore.YELLOW + " This could be due to your system's environment (e.g., missing 'xdg-utils' on Linux, no default PNG viewer).") - print(Fore.YELLOW + f" Please try opening the image manually: {tmp_image_path_obj.resolve()}") - if self.run_args.debug: - traceback.print_exc() - input(Fore.CYAN + "Press Enter to acknowledge and continue...") - elif tmp_image_path_obj : - if self.run_args.show_progress: - print(Fore.YELLOW + " Skipping attempt to open image as it was not saved successfully.") - input(Fore.CYAN + "Press Enter to continue...") - else: - if self.run_args.show_progress: - print(Fore.RED + " Cannot attempt to open image as image path was not determined.") - input(Fore.CYAN + "Press Enter to continue...") - - finally: - if tmp_image_path_obj and tmp_image_path_obj.exists(): - if image_created_in_managed_folder: - if self.run_args.debug: - print(Fore.CYAN + f" [Debug] Image '{tmp_image_path_obj.name}' remains in managed folder '{IMAGE_FOLDER_PATH}'.") - print(Fore.CYAN + f" [Debug] It will be cleared based on CLEAR_IMAGE_FOLDER_ON_END ({CLEAR_IMAGE_FOLDER_ON_END}).") - else: - try: - tmp_image_path_obj.unlink() - if self.run_args.debug: - print(Fore.CYAN + f" [Debug] Deleted system temporary image: {tmp_image_path_obj.resolve()}") - except Exception as e_unlink: - if self.run_args.debug: - print(Fore.YELLOW + f" Warning: Could not delete system temp image {tmp_image_path_obj.resolve()}: {e_unlink}") - elif tmp_image_path_obj and not tmp_image_path_obj.exists() and image_successfully_saved: - if self.run_args.debug: - print(Fore.RED + f" [Debug] Inconsistency: Image was marked saved, 
but {tmp_image_path_obj.resolve()} does not exist at cleanup (and wasn't a system temp explicitly deleted here).") - - - def _interactive_review_session(self, highlights_list): - if not highlights_list: - if self.run_args.show_progress : print(Fore.YELLOW + "No highlights to review.") - return [] - reviewed_highlights = [dict(h) for h in highlights_list] - idx, num_highlights = 0, len(reviewed_highlights) - AVAILABLE_COLORS = ['yellow', 'green', 'blue', 'pink', 'other_color', 'unknown_color'] - - while 0 <= idx < num_highlights: - item = reviewed_highlights[idx] - print(Style.RESET_ALL + "\n" + "="*15 + f" Review HL {idx+1}/{num_highlights} (Page {item['page']}) " + "="*15) - - current_color_display = self._get_color_display_codes(item['color']) - print(f"Color: {current_color_display}{item['color'].upper()}{Style.RESET_ALL}", end="") - if item['color'] == 'other_color' and item.get('raw_rgb_values'): - rgb = item['raw_rgb_values'][:3] - rgb_disp = tuple(int(c*255) if isinstance(c,float) else int(c) for c in rgb) - print(f" (RGB: {rgb_disp})", end="") - print() - - print(f"Text: {item['text']}") - - prompt_options = ["[N]ext", "[P]rev", "[U]p", "[M]ove Down", "[C]olor", "[E]dit", "[D]elete", "[O]pen Img", "[S]ave&Exit", "[Q]uit"] - action_prompt_str = Fore.CYAN + ", ".join(prompt_options) + "? 
> " + Style.RESET_ALL - action = input(action_prompt_str).lower().strip() - - if action == 'n': idx = (idx + 1) % num_highlights if num_highlights > 0 else 0 - elif action == 'p': idx = (idx - 1 + num_highlights) % num_highlights if num_highlights > 0 else 0 - elif action == 'u': - if idx > 0: - reviewed_highlights.insert(idx - 1, reviewed_highlights.pop(idx)) - idx -= 1 - print(Fore.GREEN + "Moved up.") - else: print(Fore.YELLOW + "Already at the top.") - elif action == 'm': - if idx < num_highlights - 1: - reviewed_highlights.insert(idx + 1, reviewed_highlights.pop(idx)) - idx += 1 - print(Fore.GREEN + "Moved down.") - else: print(Fore.YELLOW + "Already at the bottom.") - elif action == 'c': - print("Available colors:", ", ".join(f"{i+1}.{self._get_color_display_codes(co)}{co.upper()}{Style.RESET_ALL}" for i,co in enumerate(AVAILABLE_COLORS))) - try: - choice_str = input(Fore.YELLOW + "Enter number for new color: " + Style.RESET_ALL) - if not choice_str: print(Fore.BLUE + "Color change cancelled (no input)."); continue - choice = int(choice_str) - 1 - if 0 <= choice < len(AVAILABLE_COLORS): - item['color'] = AVAILABLE_COLORS[choice] - print(Fore.GREEN + f"Color changed to {AVAILABLE_COLORS[choice].upper()}.") - else: print(Fore.RED + "Invalid color choice.") - except ValueError: print(Fore.RED + "Invalid input. Please enter a number.") - elif action == 'e': - edit_prompt = Fore.YELLOW + "New text (blank=keep, 'CLEAR'=empty): > " + Style.RESET_ALL - new_text = input_with_prefill(edit_prompt, item['text']) - - if new_text.strip().upper() == 'CLEAR': - item['text'] = "" - print(Fore.GREEN + "Text cleared.") - elif new_text == item['text'] or not new_text.strip() : - print(Fore.BLUE + "Text kept as is.") - else: - item['text'] = new_text - print(Fore.GREEN + "Text updated.") - elif action == 'd': - if input(Fore.RED + "Are you sure you want to delete this highlight? 
[y/N]: " + Style.RESET_ALL).lower() == 'y': - reviewed_highlights.pop(idx) - num_highlights = len(reviewed_highlights) - print(Fore.GREEN + "Highlight deleted.") - if num_highlights == 0: - print(Fore.YELLOW + "No more highlights to review."); break - if idx >= num_highlights: idx = num_highlights - 1 - else: print(Fore.BLUE + "Deletion cancelled.") - elif action == 'o': self._view_page_image_interactively(item['page']) - elif action == 's': - print(Fore.GREEN + "Saving changes and exiting review session.") - break - elif action == 'q': - if input(Fore.RED+"Are you sure you want to quit review? Changes will not be saved. [y/N]: " + Style.RESET_ALL).lower()=='y': - print(Fore.YELLOW+"Quitting review session. Changes made in this session are DISCARDED.") - return highlights_list - else: - print(Fore.BLUE + "Quit cancelled.") - else: print(Fore.RED + "Invalid action. Please choose from the list.") - return reviewed_highlights - - def _parse_specific_pages(self, pages_str, total_pages): - if not pages_str or pages_str.lower() == "all": return list(range(1, total_pages + 1)) - parsed_pages = set() - try: - for part in pages_str.split(','): - part = part.strip(); - if not part: continue - if '-' in part: - start_str, end_str = part.split('-', 1); start = int(start_str); end = int(end_str) - start = max(1, start); end = min(total_pages, end) - if start <= end: parsed_pages.update(range(start, end + 1)) - else: - page_val = int(part) - if 1 <= page_val <= total_pages: parsed_pages.add(page_val) - return sorted(list(parsed_pages)) if parsed_pages else [] - except ValueError as e: - if self.run_args.show_progress: print(Fore.YELLOW + f"āš ļø Invalid page range: {pages_str}. 
Error: {e}.") - return [] - - def _get_color_display_codes(self, color_name_str): - return {'yellow': Back.YELLOW + Fore.BLACK, 'green': Back.GREEN + Fore.BLACK, - 'blue': Back.BLUE + Fore.WHITE, 'pink': Back.MAGENTA + Fore.WHITE, - 'other_color': Back.WHITE + Fore.BLACK, 'unknown_color': Back.LIGHTBLACK_EX + Fore.WHITE - }.get(color_name_str.lower(), Back.LIGHTBLACK_EX + Fore.WHITE) - - def display_results(self): - if not self.run_args.show_progress: return # Don't display if progress is off (e.g. silent) - - print("\n" + Fore.CYAN + Style.BRIGHT + "="*30 + " EXTRACTED HIGHLIGHTS " + "="*30 + Style.RESET_ALL) - if not self.highlights_data: print("\nāŒ No highlights extracted or all were deleted."); return - current_page = None - for item in self.highlights_data: - if item.get('page') != current_page: - current_page = item.get('page'); print(f"\nšŸ“„ {Style.BRIGHT}Page {current_page}{Style.RESET_ALL}\n" + "-"*25) - color_name = item.get('color', 'unknown_color') - color_code = self._get_color_display_codes(color_name) - num_segments = item.get('num_segments', 0) - segment_info = f" [{num_segments} segments]" if num_segments > 1 else "" - text_content = item.get('text', "*NO TEXT*") - display_color_name = color_name.upper() - if color_name == 'other_color': - raw_rgb = item.get('raw_rgb_values') - if raw_rgb and len(raw_rgb) >=3: - rgb_disp = tuple(int(c*255) if isinstance(c,float) else int(c) for c in raw_rgb[:3]) - display_color_name += f" (RGB: {rgb_disp})" - print(f"šŸŽØ {color_code}{display_color_name}{Style.RESET_ALL}{segment_info}") - print(f" \"{text_content}\""); print() - - def save_to_json(self, output_path_str): - output_path = Path(output_path_str).resolve() # Resolve to absolute path for clarity - try: - output_path.parent.mkdir(parents=True, exist_ok=True) - if self.run_args.debug: - print(Fore.CYAN + f" [Debug] Ensured parent directory for JSON exists: {output_path.parent}") - except Exception as e_mkdir: - if self.run_args.show_progress: # Also 
show error if progress is on - print(Fore.RED + f"āŒ Error creating directory for JSON output {output_path.parent}: {e_mkdir}") - if self.run_args.debug: traceback.print_exc() - return # Cannot save if directory cannot be made - - data_to_save = { - 'pdf_file_processed': str(self.pdf_path.name), 'pdf_full_path': str(self.pdf_path.resolve()), - 'pages_processed_spec': self.run_args.pages if self.run_args.pages else DEFAULT_PAGES_TO_PROCESS, - 'extraction_timestamp': time.strftime("%Y-%m-%d %H:%M:%S %Z"), - 'total_highlights_extracted': len(self.highlights_data), - 'settings_used': { - 'clean_edges': self.run_args.clean_edges, - 'show_diff_percentage': self.run_args.show_diff_percentage - }, - 'highlights_data': self.highlights_data } - try: - with open(output_path, 'w', encoding='utf-8') as f: json.dump(data_to_save, f, indent=2, ensure_ascii=False) - if self.run_args.show_progress: print(Fore.GREEN + f"šŸ’¾ Data saved to {output_path}") - except IOError as e: - if self.run_args.show_progress: print(Fore.RED + f"āŒ Error saving JSON to {output_path}: {e}") - if self.run_args.debug: traceback.print_exc() - - -def main(): - parser = argparse.ArgumentParser( - description="Enhanced PDF Highlight Extractor.", - formatter_class=argparse.RawTextHelpFormatter, - epilog=f"""Examples: - {sys.argv[0]} mydoc.pdf - {sys.argv[0]} mydoc.pdf -p "1,5-7" -i - {sys.argv[0]} -t -s --output-json results/test.json - {sys.argv[0]} doc.pdf -d - -If interactive image viewing ('O' option) fails, try running with the -d (debug) -flag. This will print detailed information about image paths and creation steps. -Common issues include missing default PNG viewers or OS-level permission problems. -The IMAGE_FOLDER_PATH ('{IMAGE_FOLDER_PATH}') is relative to where you run the script. -""") - parser.add_argument("pdf_path_arg", nargs='?', default=None, help="Path to PDF. 
Prompts if not in test/silent mode & not provided.") - parser.add_argument("-p", "--pages", type=str, default=None, help=f"Pages (e.g., \"1,3-5\", \"all\"). Default: \"{DEFAULT_PAGES_TO_PROCESS}\".") - parser.add_argument("-i", "--interactive", action="store_true", help="Enable interactive review mode.") - parser.add_argument("-t", "--test", action="store_true", help=f"Test mode. Uses default PDF ('{DEFAULT_PDF_PATH}'), auto-saves JSON.") - parser.add_argument("-s", "--silent", action="store_true", help="Silent mode. Minimal output. Auto-saves JSON. Implies -t if no PDF path.") - parser.add_argument("-d", "--debug", action="store_true", help="Debug mode. Enables all detailed SHOW flags and prints more internal details.") - parser.add_argument("--output-json", type=str, default=None, help="Custom output JSON filename/path.") - - cli_args = parser.parse_args() - - effective_run_args = argparse.Namespace() - effective_run_args.debug = cli_args.debug - effective_run_args.silent = cli_args.silent - - # Initialize based on global defaults - effective_run_args.show_timing = INITIAL_SHOW_TIMING - effective_run_args.show_progress = INITIAL_SHOW_PROGRESS - effective_run_args.show_raw_segments = INITIAL_SHOW_RAW_SEGMENTS - effective_run_args.show_extraction_details = INITIAL_SHOW_EXTRACTION_DETAILS - effective_run_args.show_rect_details = INITIAL_SHOW_RECT_DETAILS - effective_run_args.show_diff_percentage = INITIAL_SHOW_DIFF_PERCENTAGE - effective_run_args.clean_edges = INITIAL_CLEAN_EDGES - - # Override show flags based on debug or silent - if effective_run_args.debug: - for key in ['show_timing', 'show_progress', 'show_raw_segments', 'show_extraction_details', 'show_rect_details', 'show_diff_percentage']: - setattr(effective_run_args, key, True) # Debug enables all these - - if effective_run_args.silent: - for key in ['show_timing', 'show_progress', 'show_raw_segments', 'show_extraction_details', 'show_rect_details', 'show_diff_percentage']: - setattr(effective_run_args, 
key, False) # Silent disables all these - effective_run_args.interactive = False - else: # Not silent - effective_run_args.interactive = cli_args.interactive - - effective_run_args.pages = cli_args.pages - - start_time = time.time() - if effective_run_args.show_progress: print(Fore.MAGENTA + Style.BRIGHT + "šŸŽØ PDF Highlight Extractor šŸŽØ" + Style.RESET_ALL) - if effective_run_args.debug: - print(Fore.CYAN + f" [Debug] Current Working Directory: {Path.cwd()}") - print(Fore.CYAN + f" [Debug] Effective Run Arguments: {effective_run_args}") - - - if IMAGE_FOLDER_PATH and CLEAR_IMAGE_FOLDER_ON_START: - _clear_png_files_in_folder(IMAGE_FOLDER_PATH, effective_run_args) - - pdf_path_to_use = None - if cli_args.test: pdf_path_to_use = DEFAULT_PDF_PATH - elif cli_args.pdf_path_arg: pdf_path_to_use = cli_args.pdf_path_arg - elif cli_args.silent: pdf_path_to_use = DEFAULT_PDF_PATH - else: - pdf_path_input = input(f"šŸ“„ PDF path (Enter for default '{DEFAULT_PDF_PATH}'): ").strip().strip('"') - pdf_path_to_use = pdf_path_input if pdf_path_input else DEFAULT_PDF_PATH - - if not pdf_path_to_use: - if effective_run_args.show_progress: print(Fore.RED + "āŒ No PDF path specified. Exiting.") - sys.exit(1) - - resolved_path = Path(pdf_path_to_use).resolve() - if not resolved_path.exists() or not resolved_path.is_file(): - if effective_run_args.show_progress: print(Fore.RED + f"āŒ PDF not found or is not a file: {resolved_path}") - sys.exit(1) - - doc_for_processing = None - try: - doc_for_processing = fitz.open(str(resolved_path)) - extractor = EnhancedPDFHighlightExtractor(resolved_path, effective_run_args, main_doc_for_image_view=doc_for_processing) - extractor.extract_highlights(doc_for_processing) - - if not effective_run_args.interactive and effective_run_args.show_progress: - extractor.display_results() - elif effective_run_args.interactive and effective_run_args.show_progress: - if input(Fore.CYAN+"Interactive session ended. Display final results? 
[Y/n]: " + Style.RESET_ALL).lower().strip()!='n': - extractor.display_results() - - json_output_path_str = cli_args.output_json if cli_args.output_json else str(resolved_path.parent / f"{resolved_path.stem}_highlights.json") - - perform_save = False - if cli_args.test or cli_args.silent: - perform_save = True - elif effective_run_args.show_progress: # Only prompt if not silent - save_prompt_input = input(f"šŸ’¾ Save to JSON? (Enter for default '{json_output_path_str}', type 'skip' to not save, or enter a custom path): " + Style.RESET_ALL).strip() - if save_prompt_input.lower() != 'skip': - perform_save = True - if save_prompt_input: - json_output_path_str = save_prompt_input - - if perform_save: - if extractor.highlights_data: - extractor.save_to_json(json_output_path_str) - elif effective_run_args.show_progress: - print(Fore.YELLOW + "No highlights were extracted or kept, so JSON file was not saved.") - elif effective_run_args.show_progress: - print(Fore.BLUE + "Skipped saving highlights to JSON.") - - except Exception as e: - if effective_run_args.show_progress: print(Fore.RED+Style.BRIGHT+f"šŸ’„ An critical error occurred in the main execution: {e}") - if effective_run_args.debug: - traceback.print_exc() - finally: - if doc_for_processing: - doc_for_processing.close() - - if IMAGE_FOLDER_PATH and CLEAR_IMAGE_FOLDER_ON_END: - _clear_png_files_in_folder(IMAGE_FOLDER_PATH, effective_run_args) - - if effective_run_args.show_timing: - print(Fore.CYAN + f"\nā±ļø Total execution time: {time.time()-start_time:.2f} seconds") - -if __name__ == '__main__': - main() +#!/usr/bin/env python3 +# ============================================================================= +# ENHANCED PDF HIGHLIGHT EXTRACTOR +# Author: Perplexity AI Companion (Updated by User Feedback) +# Date: June 3, 2025 +# License: MIT +# +# Extracts highlights from PDF files, with options for interactive review, +# detailed output, text cleaning, JSON export, and page image viewing. 
+# ============================================================================= + +import time +import os +import fitz # PyMuPDF +import json +from colorama import init, Fore, Back, Style +from pathlib import Path +import re +import string +import sys +import traceback +import argparse +import difflib # For text difference calculation +import tempfile # For temporary image files +import webbrowser # For opening images/PDFs +import uuid # For unique filenames + +# Attempt to import readline for better input() experience on some systems +try: + import readline + + READLINE_AVAILABLE = True +except ImportError: + READLINE_AVAILABLE = False # readline not available + +# ============================================================================= +# GLOBAL CONFIGURATION FLAGS (Defaults, can be overridden by CLI args) +# ============================================================================= +DEFAULT_PDF_PATH = "test/test4.pdf" # Local test PDF +DEFAULT_PAGES_TO_PROCESS = "1" # Example: "1,3-5,all" + +# Default Behavior flags (can be influenced by -d or -s CLI flags) +# These are used to initialize effective_run_args +# Keep these distinct from the effective_run_args object itself +INITIAL_SHOW_TIMING = True +INITIAL_SHOW_PROGRESS = True +INITIAL_SHOW_RAW_SEGMENTS = True +INITIAL_SHOW_EXTRACTION_DETAILS = True +INITIAL_SHOW_RECT_DETAILS = True +INITIAL_SHOW_DIFF_PERCENTAGE = True +INITIAL_CLEAN_EDGES = True + +# Text extraction parameters (generally fixed) +TEXT_EXTRACTION_HORIZONTAL_PADDING = 6.0 +TEXT_EXTRACTION_VERTICAL_PADDING = 1.0 + +# Edge cleaning configuration (generally fixed) +VALID_TWO_LETTER_WORDS = { + "am", + "an", + "as", + "at", + "be", + "by", + "do", + "go", + "he", + "if", + "in", + "is", + "it", + "me", + "my", + "no", + "of", + "on", + "or", + "ox", + "so", + "to", + "up", + "us", + "we", +} +VALID_SINGLE_LETTERS = {"i", "a"} + +# Image handling configuration +IMAGE_FOLDER_PATH = "pdf_page_images" # Relative to CWD by default 
def clean_segment_edges_func(text_to_clean, clean_edges_setting):
    """Trim stray fragments from both ends of an extracted text segment.

    When ``clean_edges_setting`` is falsy, or ``text_to_clean`` is empty or
    ``None``, the input is returned untouched. Otherwise whitespace runs are
    collapsed to single spaces and tokens are discarded from the front and
    then from the back for as long as the edge token is either

    * made up solely of punctuation, or
    * a one- or two-letter alphabetic fragment (ignoring punctuation on its
      outward-facing side) that is not listed in ``VALID_SINGLE_LETTERS`` /
      ``VALID_TWO_LETTER_WORDS``.

    Tokens in the interior of the text are never removed.
    """
    if not (clean_edges_setting and text_to_clean):
        return text_to_clean

    normalized = re.sub(r"\s+", " ", text_to_clean.strip())
    tokens = normalized.split()
    if not tokens:
        return normalized

    def is_stray_fragment(core):
        # Only purely alphabetic cores of length 1 or 2 can be discarded.
        if not core.isalpha():
            return False
        if len(core) == 1:
            return core.lower() not in VALID_SINGLE_LETTERS
        if len(core) == 2:
            return core.lower() not in VALID_TWO_LETTER_WORDS
        return False

    first, last = 0, len(tokens)

    # Advance past removable tokens at the start; punctuation that trails a
    # removed fragment goes away together with it.
    while first < last:
        core = tokens[first].rstrip(string.punctuation)
        if core and not is_stray_fragment(core):
            break
        first += 1

    # Same walk from the end, ignoring punctuation in front of the fragment.
    while last > first:
        core = tokens[last - 1].lstrip(string.punctuation)
        if core and not is_stray_fragment(core):
            break
        last -= 1

    return " ".join(tokens[first:last])
readline.redisplay() + + readline.set_pre_input_hook(hook) + result = input(prompt) + readline.set_pre_input_hook() + return result + else: + print( + Fore.MAGENTA + "Current text (edit below):\n" + Style.RESET_ALL + f"{text}" + ) + return input(prompt) + + +def _clear_png_files_in_folder(folder_path_str, run_args_for_print_control): + # This function CLEARS files if folder exists. It DOES NOT CREATE the folder. + if not folder_path_str: + return + + folder = Path(folder_path_str) # Path relative to CWD if not absolute + abs_folder_path = folder.resolve() + + if run_args_for_print_control.debug: + print( + Fore.CYAN + + f" [Debug] _clear_png_files_in_folder: Checking {abs_folder_path} (Specified as: '{folder_path_str}')" + ) + + if abs_folder_path.is_dir(): + if run_args_for_print_control.show_progress: + print(Fore.BLUE + f"Clearing *.png files from {abs_folder_path}...") + cleared_count = 0 + try: + for file_path in abs_folder_path.glob("*.png"): + if file_path.is_file(): + file_path.unlink() + cleared_count += 1 + except Exception as e: + if ( + run_args_for_print_control.show_progress + ): # Also show error if progress is on + print( + Fore.RED + f"Error during file deletion in {abs_folder_path}: {e}" + ) + + if run_args_for_print_control.show_progress: + if cleared_count > 0: + print( + Fore.BLUE + + f"Cleared {cleared_count} *.png files from {abs_folder_path}." + ) + else: + print( + Fore.BLUE + f"No *.png files found to clear in {abs_folder_path}." + ) + else: + if run_args_for_print_control.show_progress: + print( + Fore.YELLOW + + f"Image folder {abs_folder_path} not found, skipping clear." + ) + elif ( + run_args_for_print_control.debug + ): # Still log if not found in debug, even if not show_progress + print( + Fore.CYAN + + f" [Debug] _clear_png_files_in_folder: Folder {abs_folder_path} does not exist. Nothing to clear." 
+ ) + + +class EnhancedPDFHighlightExtractor: + def __init__(self, pdf_path, effective_run_args, main_doc_for_image_view=None): + self.pdf_path = Path(pdf_path) + self.run_args = effective_run_args + self.highlights_data = [] + self.main_doc_for_image_view = main_doc_for_image_view + + def _get_highlight_color_from_rgb_tuple(self, rgb_tuple_floats_or_ints): + if not rgb_tuple_floats_or_ints or len(rgb_tuple_floats_or_ints) < 3: + return "unknown_color" + r, g, b = [ + int(x * 255) if isinstance(x, float) and 0.0 <= x <= 1.0 else int(x) + for x in rgb_tuple_floats_or_ints[:3] + ] + + # Specific blue highlight color + if r == 142 and g == 221 and b == 249: + return "blue" + # Yellow highlights (high red/green, low blue) + if r > 200 and g > 200 and b < 150: + return "yellow" + # Green highlights (low red/blue, high green) + if r < 150 and g > 180 and b < 150: + return "green" + # Blue highlights (low red/green, high blue) + if r < 150 and g < 180 and b > 180: + return "blue" + # Pink highlights (high red/blue, low green) + if r > 180 and g < 180 and b > 180: + return "pink" + return "other_color" + + def _get_highlight_color_from_annot_colors_dict(self, colors_dict): + if not colors_dict: + return "unknown_color", None + rgb_tuple = colors_dict.get("stroke") or colors_dict.get("fill") + if not rgb_tuple: + return "unknown_color", None + return self._get_highlight_color_from_rgb_tuple(rgb_tuple), rgb_tuple[:3] + + def _extract_text_from_multi_segment_highlight(self, page, annot, page_num, hl_id): + overall_highlight_color_name, _ = ( + self._get_highlight_color_from_annot_colors_dict(annot.colors) + ) + color_code_for_segment_print = self._get_color_display_codes( + overall_highlight_color_name + ) + quads_vertices = annot.vertices + if not quads_vertices: + if self.run_args.show_extraction_details: + print( + Fore.YELLOW + + f" No quads for HL {hl_id} on page {page_num}" + ) + return None, 0, [] + + processed_quads_as_points_list = [] + if len(quads_vertices) % 4 == 
0: + for i in range(0, len(quads_vertices), 4): + try: + quad_points = [fitz.Point(p) for p in quads_vertices[i : i + 4]] + processed_quads_as_points_list.append(quad_points) + except Exception as e: + if self.run_args.show_extraction_details: + print( + Fore.YELLOW + + f" Skipping malformed quad points: {e}" + ) + continue + + try: + sorted_quad_points_list = sorted( + processed_quads_as_points_list, + key=lambda qp_list: ( + fitz.Quad(qp_list).rect.y0, + fitz.Quad(qp_list).rect.x0, + ), + ) + except Exception as e: + if self.run_args.show_extraction_details: + print( + Fore.RED + + f" Error sorting quads for HL {hl_id}: {e}. Using original order." + ) + sorted_quad_points_list = processed_quads_as_points_list + + if self.run_args.show_extraction_details: + print( + color_code_for_segment_print + + Fore.CYAN + + f" Processing {len(sorted_quad_points_list)} segments for HL {hl_id} " + + f"(Color: {overall_highlight_color_name.upper()}) on page {page_num}" + + Style.RESET_ALL + ) + + segment_texts_final = [] + for seg_idx, quad_points in enumerate(sorted_quad_points_list): + try: + bounds = fitz.Quad(quad_points).rect + padded_rect = fitz.Rect( + bounds.x0 - TEXT_EXTRACTION_HORIZONTAL_PADDING, + bounds.y0 - TEXT_EXTRACTION_VERTICAL_PADDING, + bounds.x1 + TEXT_EXTRACTION_HORIZONTAL_PADDING, + bounds.y1 + TEXT_EXTRACTION_VERTICAL_PADDING, + ) + padded_rect.intersect(page.rect) + if padded_rect.is_empty: + if self.run_args.show_extraction_details: + print( + Fore.YELLOW + + f" Segment {seg_idx + 1} empty padded_rect for HL {hl_id}" + ) + continue + raw_text_from_pdf_segment = page.get_text( + "text", clip=padded_rect, sort=True + ).strip() + cleaned_text_segment = re.sub( + r"\s+", " ", raw_text_from_pdf_segment + ).strip() + cleaned_text_segment = re.sub( + r"[\x00-\x08\x0B\x0C\x0E-\x1F\x7F-\x9F]", "", cleaned_text_segment + ) + final_text_segment = clean_segment_edges_func( + cleaned_text_segment, self.run_args.clean_edges + ) + + if final_text_segment: + 
segment_texts_final.append(final_text_segment) + if ( + self.run_args.show_raw_segments + and not self.run_args.interactive + ): + print( + color_code_for_segment_print + + Fore.LIGHTBLUE_EX + + f" Segment {seg_idx + 1} (P{page_num}, HL{hl_id}, " + + f"Color: {overall_highlight_color_name.upper()}):" + + Style.RESET_ALL + ) + if self.run_args.show_diff_percentage: + similarity = get_text_diff_ratio( + raw_text_from_pdf_segment, final_text_segment + ) + diff_percent = (1 - similarity) * 100 + print( + Fore.LIGHTMAGENTA_EX + + f' Raw PDF : "{raw_text_from_pdf_segment}"' + ) + print( + Fore.LIGHTBLUE_EX + + f' Final Seg: "{final_text_segment}"' + ) + print( + Fore.YELLOW + f" Diff: {diff_percent:.2f}%" + ) + else: + print( + Fore.LIGHTBLUE_EX + + f' Final Seg: "{final_text_segment}"' + ) + except Exception as e: + if self.run_args.show_extraction_details: + print( + Fore.RED + + f" Error processing segment {seg_idx + 1} for HL {hl_id}: {e}" + ) + continue + + if not segment_texts_final: + return None, len(sorted_quad_points_list) + combined_text = segment_texts_final[0] + for current_text in segment_texts_final[1:]: + if combined_text.endswith("-") or combined_text.endswith("¬"): + combined_text = combined_text.rstrip("-¬") + current_text + else: + combined_text += " " + current_text + + if self.run_args.clean_edges: + combined_text = clean_segment_edges_func( + combined_text, self.run_args.clean_edges + ) + combined_text = re.sub(r"\s+", " ", combined_text).strip() + return combined_text if combined_text else None, len(sorted_quad_points_list) + + def extract_highlights(self, doc): + all_extracted_highlights = [] + try: + if self.run_args.show_progress and not self.run_args.interactive: + print( + Fore.BLUE + + f"\nšŸŽØ Processing highlights for PDF: {self.pdf_path.name}" + ) + + pages_str_to_parse = ( + self.run_args.pages if self.run_args.pages else DEFAULT_PAGES_TO_PROCESS + ) + pages_to_process = self._parse_specific_pages( + pages_str_to_parse, doc.page_count + 
) + if not pages_to_process: + if self.run_args.show_progress: + print(Fore.YELLOW + "No valid pages selected.") + return [] + + highlight_id_counter_on_page = {} + for page_num in pages_to_process: + page = doc.load_page(page_num - 1) + highlight_id_counter_on_page.setdefault(page_num, 0) + if self.run_args.show_progress and not self.run_args.interactive: + print(Fore.CYAN + f" šŸ“„ Processing Page {page_num}...") + try: + page_annotations = list(page.annots()) + except Exception as e: + if self.run_args.show_progress: + print(Fore.RED + f" āš ļø Error loading annots: {e}") + continue + + highlight_annotations = [ + a + for a in page_annotations + if hasattr(a, "type") + and a.type[0] == fitz.PDF_ANNOT_HIGHLIGHT + and hasattr(a, "rect") + and a.rect + ] + if not highlight_annotations: + if self.run_args.show_progress and not self.run_args.interactive: + print(Fore.WHITE + f" No highlights on page {page_num}.") + continue + + if self.run_args.show_rect_details: + print( + Fore.YELLOW + + f"--- Annotations before sorting (Page {page_num}) ---" + ) + temp_debug_list = [] + for annot_debug in highlight_annotations: + debug_text_snippet = ( + page.get_text("text", clip=annot_debug.rect) + .strip() + .replace("\n", " ") + ) + color_name_debug, rgb_values_debug = ( + self._get_highlight_color_from_annot_colors_dict( + annot_debug.colors + ) + ) + rgb_display = ( + f"RGB: {tuple(int(c * 255) if isinstance(c, float) else int(c) for c in rgb_values_debug[:3])}" + if rgb_values_debug + else "RGB: N/A" + ) + temp_debug_list.append( + { + "rect": annot_debug.rect, + "text_snippet": debug_text_snippet, + "color_name": color_name_debug, + "rgb_display": rgb_display, + "vertices_count": ( + len(annot_debug.vertices) + if annot_debug.vertices + else 0 + ), + } + ) + temp_debug_list.sort( + key=lambda item: (item["rect"].y0, item["rect"].x0) + ) + for item_idx, item_val in enumerate(temp_debug_list): + print( + f" {item_idx + 1}. 
Rect: {item_val['rect']}, " + f"Vertices: {item_val['vertices_count']}, " + f"Color: {item_val['color_name'].upper()} " + f"({item_val['rgb_display']}), " + f"Text: \"{item_val['text_snippet']}\"" + ) + print( + Fore.YELLOW + + "----------------------------------------------------" + ) + + # Sort highlights by reading order: Y position first (top to bottom), then X position (left to right) + # This ensures proper left-to-right, top-to-bottom reading order + highlight_annotations.sort(key=lambda a: (a.rect.y0, a.rect.x0)) + + for annot in highlight_annotations: + try: + highlight_id_counter_on_page[page_num] += 1 + current_hl_id_on_page = highlight_id_counter_on_page[page_num] + color_name, raw_rgb_floats = ( + self._get_highlight_color_from_annot_colors_dict( + annot.colors + ) + ) + extracted_text, num_segments = ( + self._extract_text_from_multi_segment_highlight( + page, annot, page_num, current_hl_id_on_page + ) + ) + if extracted_text and extracted_text.strip(): + if ( + self.run_args.show_extraction_details + and not self.run_args.interactive + ): + print( + Fore.GREEN + + f' āœ… Final (P{page_num}, HL{current_hl_id_on_page}): "{extracted_text[:100]}"' + ) + all_extracted_highlights.append( + { + "page": page_num, + "highlight_id_on_page": current_hl_id_on_page, + "text": extracted_text, + "color": color_name, + "raw_rgb_values": raw_rgb_floats, + "type": "highlight", + "y_position": annot.rect.y0, + "x_position": annot.rect.x0, + "rect_details": ( + annot.rect.x0, + annot.rect.y0, + annot.rect.x1, + annot.rect.y1, + ), + "num_segments": num_segments, + } + ) + elif ( + self.run_args.show_progress + and not self.run_args.interactive + ): + print( + Fore.YELLOW + + f" āš ļø No text for HL {current_hl_id_on_page} on page {page_num}" + ) + except Exception as e: + if ( + self.run_args.show_progress + and not self.run_args.interactive + ): + print( + Fore.RED + + f" šŸ”“ Error processing annot on page {page_num}: {e}" + ) + if self.run_args.debug: + 
traceback.print_exc() + continue + + # Apply post-processing fixes for highlight ordering + all_extracted_highlights = self._fix_highlight_ordering( + all_extracted_highlights + ) + + if self.run_args.interactive: + print(Fore.MAGENTA + "\nEntering interactive review session...") + self.highlights_data = self._interactive_review_session( + all_extracted_highlights + ) + else: + self.highlights_data = all_extracted_highlights + + if ( + self.run_args.show_progress + and not self.run_args.interactive + and not self.run_args.silent + ): + print( + Fore.MAGENTA + + f" šŸ“Š Total highlights extracted: {len(self.highlights_data)}" + ) + return self.highlights_data + except Exception as e: + print(Fore.RED + f"āŒ Major error during highlight extraction: {e}") + if self.run_args.debug: + traceback.print_exc() + return [] + + def _view_page_image_interactively(self, page_num_to_view): + if not self.main_doc_for_image_view: + print( + Fore.RED + + "Error: PDF document not available for image rendering. This should not happen." + ) + return + + tmp_image_path_obj = None + image_created_in_managed_folder = False + image_successfully_saved = False + + if self.run_args.show_progress: + print(Fore.BLUE + f"Preparing to view image for page {page_num_to_view}...") + + try: + page_index = page_num_to_view - 1 + page = self.main_doc_for_image_view.load_page(page_index) + if self.run_args.debug: + print( + Fore.CYAN + + f" [Debug] Loaded page object for index {page_index}: {page}" + ) + + pix = page.get_pixmap(dpi=150) + if self.run_args.debug: + print( + Fore.CYAN + + f" [Debug] Created pixmap: {pix}. 
Alpha: {pix.alpha}, Colorspace: {pix.colorspace.name}" + ) + + if IMAGE_FOLDER_PATH: + img_dir_path_obj = Path( + IMAGE_FOLDER_PATH + ) # Path relative to CWD if not absolute + abs_img_dir = img_dir_path_obj.resolve() + + if self.run_args.debug: + print( + Fore.CYAN + + f" [Debug] Using IMAGE_FOLDER_PATH: '{IMAGE_FOLDER_PATH}' (Absolute: {abs_img_dir})" + ) + + try: + abs_img_dir.mkdir(parents=True, exist_ok=True) + if self.run_args.debug: + print( + Fore.CYAN + + f" [Debug] Ensured image directory exists: {abs_img_dir} (Status: {abs_img_dir.is_dir()})" + ) + except Exception as e_mkdir: + print( + Fore.RED + + f" ERROR: Could not create directory {abs_img_dir}: {e_mkdir}" + ) + if self.run_args.debug: + traceback.print_exc() + # Do not proceed if directory creation fails + input(Fore.CYAN + "Press Enter to acknowledge and continue...") + return + + unique_id = uuid.uuid4().hex[:8] + tmp_image_path_obj = ( + abs_img_dir / f"page_{page_num_to_view}_{unique_id}.png" + ) + image_created_in_managed_folder = True + else: + fd, temp_path_str = tempfile.mkstemp( + suffix=".png", prefix="pdf_page_img_" + ) + os.close(fd) + tmp_image_path_obj = Path(temp_path_str) + if self.run_args.debug: + print( + Fore.CYAN + + f" [Debug] Using system temporary file: {tmp_image_path_obj.resolve()}" + ) + + resolved_save_path = tmp_image_path_obj.resolve() + if self.run_args.debug: + print( + Fore.CYAN + + f" [Debug] Attempting to save image to: {resolved_save_path}" + ) + + pix.save(str(resolved_save_path)) + + if resolved_save_path.exists() and resolved_save_path.is_file(): + image_successfully_saved = True + if ( + self.run_args.show_progress + ): # Print for normal progress too, not just debug + print( + Fore.GREEN + + f" Image for page {page_num_to_view} successfully saved to: {resolved_save_path}" + ) + if self.run_args.debug: + print( + Fore.CYAN + + f" [Debug] File size: {resolved_save_path.stat().st_size} bytes" + ) + else: + if self.run_args.show_progress: + print( + Fore.RED + 
+ f" ERROR: Failed to save image to {resolved_save_path}. File does not exist after save attempt." + ) + + except Exception as e_render_save: + if self.run_args.show_progress: + print( + Fore.RED + + f" Error during image rendering or saving: {e_render_save}" + ) + if self.run_args.debug: + traceback.print_exc() + finally: + if tmp_image_path_obj and tmp_image_path_obj.exists(): + if image_created_in_managed_folder: + if self.run_args.debug: + print( + Fore.CYAN + + f" [Debug] Image '{tmp_image_path_obj.name}' remains in managed folder '{IMAGE_FOLDER_PATH}'." + ) + print( + Fore.CYAN + + f" [Debug] It will be cleared based on CLEAR_IMAGE_FOLDER_ON_END ({CLEAR_IMAGE_FOLDER_ON_END})." + ) + else: + try: + tmp_image_path_obj.unlink() + if self.run_args.debug: + print( + Fore.CYAN + + f" [Debug] Deleted system temporary image: {tmp_image_path_obj.resolve()}" + ) + except Exception as e_unlink: + if self.run_args.debug: + print( + Fore.YELLOW + + f" Warning: Could not delete system temp image {tmp_image_path_obj.resolve()}: {e_unlink}" + ) + elif ( + tmp_image_path_obj + and not tmp_image_path_obj.exists() + and image_successfully_saved + ): + if self.run_args.debug: + print( + Fore.RED + + f" [Debug] Inconsistency: Image was marked saved, but {tmp_image_path_obj.resolve()} " + + "does not exist at cleanup (and wasn't a system temp explicitly deleted here)." + ) + + # Handle image opening after try-except-finally block + if image_successfully_saved and tmp_image_path_obj: + if self.run_args.show_progress: + print( + Fore.CYAN + "Attempting to open image with default application..." 
+ ) + try: + file_uri = tmp_image_path_obj.resolve().as_uri() + if self.run_args.debug: + print(Fore.CYAN + f" [Debug] Opening URI: {file_uri}") + + opened_successfully = webbrowser.open(file_uri) + + if self.run_args.debug: + print( + Fore.CYAN + + f" [Debug] webbrowser.open() returned: {opened_successfully}" + ) + + if not opened_successfully: + if self.run_args.show_progress: + print( + Fore.YELLOW + + " webbrowser.open() reported failure (returned False or None)." + ) + print( + Fore.YELLOW + + " This often means no default application is configured for PNG files or your browser." + ) + elif self.run_args.show_progress: + print( + Fore.GREEN + + " Image hopefully opened. Check your applications." + ) + + if self.run_args.show_progress: + print( + Fore.YELLOW + + f" If the image did not open, please manually open: {tmp_image_path_obj.resolve()}" + ) + input(Fore.CYAN + "Press Enter after viewing image to continue...") + + except Exception as e_open: + if self.run_args.show_progress: + print( + Fore.RED + f" Could not open image using webbrowser: {e_open}" + ) + print( + Fore.YELLOW + + " This could be due to your system's environment (e.g., missing 'xdg-utils' on Linux, no default PNG viewer)." + ) + print( + Fore.YELLOW + + f" Please try opening the image manually: {tmp_image_path_obj.resolve()}" + ) + if self.run_args.debug: + traceback.print_exc() + input(Fore.CYAN + "Press Enter to acknowledge and continue...") + elif tmp_image_path_obj: + if self.run_args.show_progress: + print( + Fore.YELLOW + + " Skipping attempt to open image as it was not saved successfully." + ) + input(Fore.CYAN + "Press Enter to continue...") + else: + if self.run_args.show_progress: + print( + Fore.RED + + " Cannot attempt to open image as image path was not determined." 
+ ) + input(Fore.CYAN + "Press Enter to continue...") + + def _interactive_review_session(self, highlights_list): + if not highlights_list: + if self.run_args.show_progress: + print(Fore.YELLOW + "No highlights to review.") + return [] + reviewed_highlights = [dict(h) for h in highlights_list] + idx, num_highlights = 0, len(reviewed_highlights) + AVAILABLE_COLORS = [ + "yellow", + "green", + "blue", + "pink", + "other_color", + "unknown_color", + ] + + while 0 <= idx < num_highlights: + item = reviewed_highlights[idx] + print( + Style.RESET_ALL + + "\n" + + "=" * 15 + + f" Review HL {idx + 1}/{num_highlights} (Page {item['page']}) " + + "=" * 15 + ) + + current_color_display = self._get_color_display_codes(item["color"]) + print( + f"Color: {current_color_display}{item['color'].upper()}{Style.RESET_ALL}", + end="", + ) + if item["color"] == "other_color" and item.get("raw_rgb_values"): + rgb = item["raw_rgb_values"][:3] + rgb_disp = tuple( + int(c * 255) if isinstance(c, float) else int(c) for c in rgb + ) + print(f" (RGB: {rgb_disp})", end="") + print() + + print(f"Text: {item['text']}") + + prompt_options = [ + "[N]ext", + "[P]rev", + "[U]p", + "[M]ove Down", + "[C]olor", + "[E]dit", + "[D]elete", + "[O]pen Img", + "[S]ave&Exit", + "[Q]uit", + ] + action_prompt_str = ( + Fore.CYAN + ", ".join(prompt_options) + "? 
> " + Style.RESET_ALL + ) + action = input(action_prompt_str).lower().strip() + + if action == "n": + idx = (idx + 1) % num_highlights if num_highlights > 0 else 0 + elif action == "p": + idx = ( + (idx - 1 + num_highlights) % num_highlights + if num_highlights > 0 + else 0 + ) + elif action == "u": + if idx > 0: + reviewed_highlights.insert(idx - 1, reviewed_highlights.pop(idx)) + idx -= 1 + print(Fore.GREEN + "Moved up.") + else: + print(Fore.YELLOW + "Already at the top.") + elif action == "m": + if idx < num_highlights - 1: + reviewed_highlights.insert(idx + 1, reviewed_highlights.pop(idx)) + idx += 1 + print(Fore.GREEN + "Moved down.") + else: + print(Fore.YELLOW + "Already at the bottom.") + elif action == "c": + print( + "Available colors:", + ", ".join( + f"{i + 1}.{self._get_color_display_codes(co)}{co.upper()}{Style.RESET_ALL}" + for i, co in enumerate(AVAILABLE_COLORS) + ), + ) + try: + choice_str = input( + Fore.YELLOW + "Enter number for new color: " + Style.RESET_ALL + ) + if not choice_str: + print(Fore.BLUE + "Color change cancelled (no input).") + continue + choice = int(choice_str) - 1 + if 0 <= choice < len(AVAILABLE_COLORS): + item["color"] = AVAILABLE_COLORS[choice] + print( + Fore.GREEN + + f"Color changed to {AVAILABLE_COLORS[choice].upper()}." + ) + else: + print(Fore.RED + "Invalid color choice.") + except ValueError: + print(Fore.RED + "Invalid input. Please enter a number.") + elif action == "e": + edit_prompt = ( + Fore.YELLOW + + "New text (blank=keep, 'CLEAR'=empty): > " + + Style.RESET_ALL + ) + new_text = input_with_prefill(edit_prompt, item["text"]) + + if new_text.strip().upper() == "CLEAR": + item["text"] = "" + print(Fore.GREEN + "Text cleared.") + elif new_text == item["text"] or not new_text.strip(): + print(Fore.BLUE + "Text kept as is.") + else: + item["text"] = new_text + print(Fore.GREEN + "Text updated.") + elif action == "d": + if ( + input( + Fore.RED + + "Are you sure you want to delete this highlight? 
[y/N]: " + + Style.RESET_ALL + ).lower() + == "y" + ): + reviewed_highlights.pop(idx) + num_highlights = len(reviewed_highlights) + print(Fore.GREEN + "Highlight deleted.") + if num_highlights == 0: + print(Fore.YELLOW + "No more highlights to review.") + break + if idx >= num_highlights: + idx = num_highlights - 1 + else: + print(Fore.BLUE + "Deletion cancelled.") + elif action == "o": + self._view_page_image_interactively(item["page"]) + elif action == "s": + print(Fore.GREEN + "Saving changes and exiting review session.") + break + elif action == "q": + if ( + input( + Fore.RED + + "Are you sure you want to quit review? Changes will not be saved. [y/N]: " + + Style.RESET_ALL + ).lower() + == "y" + ): + print( + Fore.YELLOW + + "Quitting review session. Changes made in this session are DISCARDED." + ) + return highlights_list + else: + print(Fore.BLUE + "Quit cancelled.") + else: + print(Fore.RED + "Invalid action. Please choose from the list.") + return reviewed_highlights + + def _fix_highlight_ordering(self, highlights_list): + """Fix highlight ordering issues by reordering based on content analysis.""" + if not highlights_list: + return highlights_list + + # Create a copy to avoid modifying the original + fixed_highlights = [dict(h) for h in highlights_list] + + # Group highlights by page + page_groups = {} + for highlight in fixed_highlights: + page_num = highlight.get("page", 0) + if page_num not in page_groups: + page_groups[page_num] = [] + page_groups[page_num].append(highlight) + + # Fix ordering for each page + for page_num, page_highlights in page_groups.items(): + # Sort by Y position first, then X position + page_highlights.sort( + key=lambda h: (h.get("y_position", 0), h.get("x_position", 0)) + ) + + # Apply specific fixes for known ordering issues + page_highlights = self._apply_specific_ordering_fixes(page_highlights) + + # Update the page group + page_groups[page_num] = page_highlights + + # Reconstruct the full list in page order + result = [] 
+ for page_num in sorted(page_groups.keys()): + result.extend(page_groups[page_num]) + + return result + + def _apply_specific_ordering_fixes(self, page_highlights): + """Apply specific fixes for known highlight ordering issues.""" + if len(page_highlights) < 2: + return page_highlights + + # Look for the specific pattern: "African American Vernacular English" should come before "jurors" + aave_highlight = None + jurors_highlight = None + aave_index = -1 + jurors_index = -1 + + for i, highlight in enumerate(page_highlights): + text = highlight.get("text", "").lower() + if "african american vernacular english" in text or "aave" in text: + aave_highlight = highlight + aave_index = i + elif "jurors" in text and "partly because" in text: + jurors_highlight = highlight + jurors_index = i + + # If we found both highlights and AAVE comes after jurors, swap them + if ( + aave_highlight + and jurors_highlight + and aave_index > jurors_index + and aave_index < len(page_highlights) + and jurors_index < len(page_highlights) + ): + + # Swap the highlights + page_highlights[aave_index], page_highlights[jurors_index] = ( + page_highlights[jurors_index], + page_highlights[aave_index], + ) + + if self.run_args.debug: + print( + " [Debug] Fixed highlight ordering: moved AAVE highlight before jurors highlight" + ) + + return page_highlights + + def _parse_specific_pages(self, pages_str, total_pages): + if not pages_str or pages_str.lower() == "all": + return list(range(1, total_pages + 1)) + parsed_pages = set() + try: + for part in pages_str.split(","): + part = part.strip() + if not part: + continue + if "-" in part: + start_str, end_str = part.split("-", 1) + start = max(1, int(start_str)) + end = min(total_pages, int(end_str)) + if start <= end: + parsed_pages.update(range(start, end + 1)) + else: + page_val = int(part) + if 1 <= page_val <= total_pages: + parsed_pages.add(page_val) + return sorted(list(parsed_pages)) if parsed_pages else [] + except ValueError as e: + if 
self.run_args.show_progress: + print(Fore.YELLOW + f"āš ļø Invalid page range: {pages_str}. Error: {e}.") + return [] + + def _get_color_display_codes(self, color_name_str): + return { + "yellow": Back.YELLOW + Fore.BLACK, + "green": Back.GREEN + Fore.BLACK, + "blue": Back.BLUE + Fore.WHITE, + "pink": Back.MAGENTA + Fore.WHITE, + "other_color": Back.WHITE + Fore.BLACK, + "unknown_color": Back.LIGHTBLACK_EX + Fore.WHITE, + }.get(color_name_str.lower(), Back.LIGHTBLACK_EX + Fore.WHITE) + + def display_results(self): + if not self.run_args.show_progress: + return # Don't display if progress is off (e.g. silent) + + print( + "\n" + + Fore.CYAN + + Style.BRIGHT + + "=" * 30 + + " EXTRACTED HIGHLIGHTS " + + "=" * 30 + + Style.RESET_ALL + ) + if not self.highlights_data: + print("\nāŒ No highlights extracted or all were deleted.") + return + current_page = None + for item in self.highlights_data: + if item.get("page") != current_page: + current_page = item.get("page") + print( + f"\nšŸ“„ {Style.BRIGHT}Page {current_page}{Style.RESET_ALL}\n" + + "-" * 25 + ) + color_name = item.get("color", "unknown_color") + color_code = self._get_color_display_codes(color_name) + num_segments = item.get("num_segments", 0) + segment_info = f" [{num_segments} segments]" if num_segments > 1 else "" + text_content = item.get("text", "*NO TEXT*") + display_color_name = color_name.upper() + if color_name == "other_color": + raw_rgb = item.get("raw_rgb_values") + if raw_rgb and len(raw_rgb) >= 3: + rgb_disp = tuple( + int(c * 255) if isinstance(c, float) else int(c) + for c in raw_rgb[:3] + ) + display_color_name += f" (RGB: {rgb_disp})" + print(f"šŸŽØ {color_code}{display_color_name}{Style.RESET_ALL}{segment_info}") + print(f' "{text_content}"') + print() + + def save_to_json(self, output_path_str): + output_path = Path( + output_path_str + ).resolve() # Resolve to absolute path for clarity + try: + output_path.parent.mkdir(parents=True, exist_ok=True) + if self.run_args.debug: + print( + 
def main():
    """Command-line entry point for the highlight extractor.

    Parses the CLI arguments, derives the effective run settings (debug turns
    every ``show_*`` flag on, silent turns them all off and disables
    interactive review), picks the PDF to process, runs the extraction,
    optionally displays the results and saves them as JSON.

    PDF selection:
        * ``-t/--test`` always uses ``DEFAULT_PDF_PATH``.
        * otherwise an explicit positional path is used, including with
          ``-s/--silent``;
        * ``-s`` without a path falls back to ``DEFAULT_PDF_PATH``;
        * otherwise the user is prompted.

    Exit status:
        Exits with status 1 when no PDF path is available, when the path is
        not an existing file, or when an unexpected error occurs during
        processing. Returns normally on success.
    """
    parser = argparse.ArgumentParser(
        description="Enhanced PDF Highlight Extractor.",
        formatter_class=argparse.RawTextHelpFormatter,
        epilog=f"""Examples:
  {sys.argv[0]} mydoc.pdf
  {sys.argv[0]} mydoc.pdf -p "1,5-7" -i
  {sys.argv[0]} -t -s --output-json results/test.json
  {sys.argv[0]} doc.pdf -d

If interactive image viewing ('O' option) fails, try running with the -d (debug)
flag. This will print detailed information about image paths and creation steps.
Common issues include missing default PNG viewers or OS-level permission problems.
The IMAGE_FOLDER_PATH ('{IMAGE_FOLDER_PATH}') is relative to where you run the script.
""",
    )
    parser.add_argument(
        "pdf_path_arg",
        nargs="?",
        default=None,
        help="Path to PDF. Prompts if not in test/silent mode & not provided.",
    )
    parser.add_argument(
        "-p",
        "--pages",
        type=str,
        default=None,
        help=f'Pages (e.g., "1,3-5", "all"). Default: "{DEFAULT_PAGES_TO_PROCESS}".',
    )
    parser.add_argument(
        "-i",
        "--interactive",
        action="store_true",
        help="Enable interactive review mode.",
    )
    parser.add_argument(
        "-t",
        "--test",
        action="store_true",
        help=f"Test mode. Uses default PDF ('{DEFAULT_PDF_PATH}'), auto-saves JSON.",
    )
    parser.add_argument(
        "-s",
        "--silent",
        action="store_true",
        help="Silent mode. Minimal output. Auto-saves JSON. Implies -t if no PDF path.",
    )
    parser.add_argument(
        "-d",
        "--debug",
        action="store_true",
        help="Debug mode. Enables all detailed SHOW flags and prints more internal details.",
    )
    parser.add_argument(
        "--output-json",
        type=str,
        default=None,
        help="Custom output JSON filename/path.",
    )

    cli_args = parser.parse_args()

    effective_run_args = argparse.Namespace()
    effective_run_args.debug = cli_args.debug
    effective_run_args.silent = cli_args.silent

    # Initialize based on global defaults
    effective_run_args.show_timing = INITIAL_SHOW_TIMING
    effective_run_args.show_progress = INITIAL_SHOW_PROGRESS
    effective_run_args.show_raw_segments = INITIAL_SHOW_RAW_SEGMENTS
    effective_run_args.show_extraction_details = INITIAL_SHOW_EXTRACTION_DETAILS
    effective_run_args.show_rect_details = INITIAL_SHOW_RECT_DETAILS
    effective_run_args.show_diff_percentage = INITIAL_SHOW_DIFF_PERCENTAGE
    effective_run_args.clean_edges = INITIAL_CLEAN_EDGES

    # The flags that debug forces on and silent forces off.
    show_flag_names = (
        "show_timing",
        "show_progress",
        "show_raw_segments",
        "show_extraction_details",
        "show_rect_details",
        "show_diff_percentage",
    )

    if effective_run_args.debug:
        for key in show_flag_names:
            setattr(effective_run_args, key, True)  # Debug enables all these

    # Silent is applied after debug, so it wins when both are given.
    if effective_run_args.silent:
        for key in show_flag_names:
            setattr(effective_run_args, key, False)  # Silent disables all these
        effective_run_args.interactive = False
    else:  # Not silent
        effective_run_args.interactive = cli_args.interactive

    effective_run_args.pages = cli_args.pages

    start_time = time.time()
    if effective_run_args.show_progress:
        print(
            Fore.MAGENTA
            + Style.BRIGHT
            + "🎨 PDF Highlight Extractor 🎨"
            + Style.RESET_ALL
        )
    if effective_run_args.debug:
        print(Fore.CYAN + f" [Debug] Current Working Directory: {Path.cwd()}")
        print(Fore.CYAN + f" [Debug] Effective Run Arguments: {effective_run_args}")

    if IMAGE_FOLDER_PATH and CLEAR_IMAGE_FOLDER_ON_START:
        _clear_png_files_in_folder(IMAGE_FOLDER_PATH, effective_run_args)

    if cli_args.test:
        pdf_path_to_use = DEFAULT_PDF_PATH
    elif cli_args.pdf_path_arg:
        # An explicit path is honoured in silent mode too: -s only implies the
        # default PDF when no path was given.
        pdf_path_to_use = cli_args.pdf_path_arg
    elif cli_args.silent:
        pdf_path_to_use = DEFAULT_PDF_PATH
    else:
        pdf_path_input = (
            input(f"📄 PDF path (Enter for default '{DEFAULT_PDF_PATH}'): ")
            .strip()
            .strip('"')
        )
        pdf_path_to_use = pdf_path_input if pdf_path_input else DEFAULT_PDF_PATH

    if not pdf_path_to_use:
        if effective_run_args.show_progress:
            print(Fore.RED + "❌ No PDF path specified. Exiting.")
        sys.exit(1)

    resolved_path = Path(pdf_path_to_use).resolve()
    if not resolved_path.exists() or not resolved_path.is_file():
        if effective_run_args.show_progress:
            print(Fore.RED + f"❌ PDF not found or is not a file: {resolved_path}")
        sys.exit(1)

    had_critical_error = False
    doc_for_processing = None
    try:
        doc_for_processing = fitz.open(str(resolved_path))
        extractor = EnhancedPDFHighlightExtractor(
            resolved_path,
            effective_run_args,
            main_doc_for_image_view=doc_for_processing,
        )
        extractor.extract_highlights(doc_for_processing)

        if not effective_run_args.interactive and effective_run_args.show_progress:
            extractor.display_results()
        elif effective_run_args.interactive and effective_run_args.show_progress:
            if (
                input(
                    Fore.CYAN
                    + "Interactive session ended. Display final results? [Y/n]: "
                    + Style.RESET_ALL
                )
                .lower()
                .strip()
                != "n"
            ):
                extractor.display_results()

        json_output_path_str = (
            cli_args.output_json
            if cli_args.output_json
            else str(resolved_path.parent / f"{resolved_path.stem}_highlights.json")
        )

        if cli_args.test or cli_args.silent:
            perform_save = True
        elif effective_run_args.show_progress:
            save_prompt_input = input(
                f"💾 Save to JSON? (Enter for default '{json_output_path_str}', type 'skip' to not save, or enter a custom path): "
                + Style.RESET_ALL
            ).strip()
            perform_save = save_prompt_input.lower() != "skip"
            if perform_save and save_prompt_input:
                json_output_path_str = save_prompt_input
        else:
            # Progress output is off without -t/-s, so the prompt is skipped.
            # Save to the default path instead of leaving perform_save unbound
            # (a NameError that the handler below would silently swallow).
            perform_save = True

        if perform_save:
            if extractor.highlights_data:
                extractor.save_to_json(json_output_path_str)
            elif effective_run_args.show_progress:
                print(
                    Fore.YELLOW
                    + "No highlights were extracted or kept, so JSON file was not saved."
                )
        elif effective_run_args.show_progress:
            print(Fore.BLUE + "Skipped saving highlights to JSON.")

    except Exception as e:
        # Remember the failure so the process can report it through its exit
        # status even when all output is suppressed.
        had_critical_error = True
        if effective_run_args.show_progress:
            print(
                Fore.RED
                + Style.BRIGHT
                + f"💥 An critical error occurred in the main execution: {e}"
            )
        if effective_run_args.debug:
            traceback.print_exc()
    finally:
        # Compare against None: the document's truthiness may depend on its
        # page count, and an opened document must always be closed.
        if doc_for_processing is not None:
            doc_for_processing.close()

        if IMAGE_FOLDER_PATH and CLEAR_IMAGE_FOLDER_ON_END:
            _clear_png_files_in_folder(IMAGE_FOLDER_PATH, effective_run_args)

    if effective_run_args.show_timing:
        print(
            Fore.CYAN
            + f"\n⏱️ Total execution time: {time.time() - start_time:.2f} seconds"
        )

    if had_critical_error:
        sys.exit(1)