diff --git a/.gitignore b/.gitignore index 1795c8d..adf3546 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ -.history \ No newline at end of file +.history +*.png \ No newline at end of file diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 0000000..0fed8bd --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,13 @@ +{ + "version": "0.2.0", + "configurations": [ + {"name":"Python Debugger: Current File","type":"debugpy","request":"launch","program":"${file}","console":"integratedTerminal"}, + { + "name": "Python: Current File", + "type": "python", + "request": "launch", + "program": "${file}", + "console": "integratedTerminal" + } + ] +} \ No newline at end of file diff --git a/clear b/clear new file mode 100644 index 0000000..1f1ff70 --- /dev/null +++ b/clear @@ -0,0 +1,433 @@ +{ + "pdf_file_processed": "test2.pdf", + "pdf_full_path": "/mnt/c/Users/admin/Downloads/test2.pdf", + "pages_processed_spec": "5", + "extraction_timestamp": "2025-06-03 08:55:13 EDT", + "total_highlights_extracted": 20, + "settings_used": { + "clean_edges": true, + "show_diff_percentage": true + }, + "highlights_data": [ + { + "page": 5, + "highlight_id_on_page": 1, + "text": "or prejudice in", + "color": "yellow", + "raw_rgb_values": [ + 1.0, + 0.9411770105361938, + 0.4000000059604645 + ], + "type": "highlight", + "y_position": 53.75, + "x_position": 60.0, + "rect_details": [ + 60.0, + 53.75, + 116.0, + 63.75 + ], + "num_segments": 1 + }, + { + "page": 5, + "highlight_id_on_page": 2, + "text": "unin", + "color": "yellow", + "raw_rgb_values": [ + 1.0, + 0.9411770105361938, + 0.4000000059604645 + ], + "type": "highlight", + "y_position": 77.75, + "x_position": 164.0, + "rect_details": [ + 164.0, + 77.75, + 169.0, + 87.75 + ], + "num_segments": 1 + }, + { + "page": 5, + "highlight_id_on_page": 3, + "text": "uninformed about how ‘language can stand as a barrier to jus-", + "color": "yellow", + "raw_rgb_values": [ + 1.0, + 0.9411770105361938, + 0.4000000059604645 + ], + "type": "highlight", + "y_position": 77.75, + "x_position": 164.0, + "rect_details": [ + 164.0, + 77.75, + 405.0, + 87.75 + ], + "num_segments": 1 + }, + { + "page": 5, + "highlight_id_on_page": 4, + "text": "tice or equal opportunity’.", + "color": "yellow", + "raw_rgb_values": [ + 1.0, + 0.9411770105361938, + 0.4000000059604645 + ], + "type": "highlight", + "y_position": 89.75, + "x_position": 60.0, + "rect_details": [ + 60.0, + 89.75, + 158.0, + 99.75 + ], + "num_segments": 1 + }, + { + "page": 5, + "highlight_id_on_page": 5, + "text": "linguistics,", + "color": "yellow", + "raw_rgb_values": [ + 1.0, + 0.9411770105361938, + 0.4000000059604645 + ], + "type": "highlight", + "y_position": 137.75, + "x_position": 188.0, + "rect_details": [ + 188.0, + 137.75, + 226.0, + 147.75 + ], + "num_segments": 1 + }, + { + "page": 5, + "highlight_id_on_page": 6, + "text": "needs to make applied contributions to the understanding and solution of racial discrimination, criminal injustice, and other social problems.", + "color": "yellow", + "raw_rgb_values": [ + 1.0, + 0.9411770105361938, + 0.4000000059604645 + ], + "type": "highlight", + "y_position": 149.75, + "x_position": 60.0, + "rect_details": [ + 60.0, + 149.75, + 408.0, + 171.75 + ], + "num_segments": 2 + }, + { + "page": 5, + "highlight_id_on_page": 7, + "text": "first", + "color": "blue", + "raw_rgb_values": [ + 0.5607839822769165, + 0.8705880045890808, + 0.9764710068702698 + ], + "type": "highlight", + "y_position": 173.75, + "x_position": 182.0, + "rect_details": [ + 182.0, + 173.75, + 198.0, + 183.75 + ], + "num_segments": 1 + }, + { + "page": 5, + "highlight_id_on_page": 8, + "text": "at interpreters are not generally provided for ‘dialects’ of a language, only for foreign ‘languages’", + "color": "yellow", + "raw_rgb_values": [ + 1.0, + 0.9411770105361938, + 0.4000000059604645 + ], + "type": "highlight", + "y_position": 197.75, + "x_position": 60.0, + "rect_details": [ + 60.0, + 197.75, + 408.0, + 219.75 + ], + "num_segments": 2 + }, + { + "page": 5, + "highlight_id_on_page": 9, + "text": "(§2),", + "color": "blue", + "raw_rgb_values": [ + 0.5607839822769165, + 0.8705880045890808, + 0.9764710068702698 + ], + "type": "highlight", + "y_position": 197.75, + "x_position": 182.0, + "rect_details": [ + 182.0, + 197.75, + 201.0, + 207.75 + ], + "num_segments": 1 + }, + { + "page": 5, + "highlight_id_on_page": 10, + "text": "§3", + "color": "blue", + "raw_rgb_values": [ + 0.5607839822769165, + 0.8705880045890808, + 0.9764710068702698 + ], + "type": "highlight", + "y_position": 209.75, + "x_position": 398.0, + "rect_details": [ + 398.0, + 209.75, + 408.0, + 219.75 + ], + "num_segments": 1 + }, + { + "page": 5, + "highlight_id_on_page": 11, + "text": "specific case of Rachel Jeantel’s dialect, a", + "color": "yellow", + "raw_rgb_values": [ + 1.0, + 0.9411770105361938, + 0.4000000059604645 + ], + "type": "highlight", + "y_position": 221.75, + "x_position": 84.0, + "rect_details": [ + 84.0, + 221.75, + 241.0, + 231.75 + ], + "num_segments": 1 + }, + { + "page": 5, + "highlight_id_on_page": 12, + "text": "whether the credibility and intelligibility problems that led jurors to disregard Jeantel’s testimony were due", + "color": "yellow", + "raw_rgb_values": [ + 1.0, + 0.9411770105361938, + 0.4000000059604645 + ], + "type": "highlight", + "y_position": 269.75, + "x_position": 60.0, + "rect_details": [ + 60.0, + 269.75, + 408.0, + 291.75 + ], + "num_segments": 2 + }, + { + "page": 5, + "highlight_id_on_page": 13, + "text": "§4 we", + "color": "blue", + "raw_rgb_values": [ + 0.5607839822769165, + 0.8705880045890808, + 0.9764710068702698 + ], + "type": "highlight", + "y_position": 269.75, + "x_position": 237.0, + "rect_details": [ + 237.0, + 269.75, + 257.0, + 279.75 + ], + "num_segments": 1 + }, + { + "page": 5, + "highlight_id_on_page": 14, + "text": "dialect and insti-", + "color": "yellow", + "raw_rgb_values": [ + 1.0, + 0.9411770105361938, + 0.4000000059604645 + ], + "type": "highlight", + "y_position": 317.75, + "x_position": 342.0, + "rect_details": [ + 342.0, + 317.75, + 402.0, + 327.75 + ], + "num_segments": 1 + }, + { + "page": 5, + "highlight_id_on_page": 15, + "text": "tutionalized racism negatively impact AAVE and other vernacular speakers i", + "color": "yellow", + "raw_rgb_values": [ + 1.0, + 0.9411770105361938, + 0.4000000059604645 + ], + "type": "highlight", + "y_position": 329.75, + "x_position": 60.0, + "rect_details": [ + 60.0, + 329.75, + 367.0, + 339.75 + ], + "num_segments": 1 + }, + { + "page": 5, + "highlight_id_on_page": 16, + "text": "(§5).", + "color": "blue", + "raw_rgb_values": [ + 0.5607839822769165, + 0.8705880045890808, + 0.9764710068702698 + ], + "type": "highlight", + "y_position": 341.75, + "x_position": 342.0, + "rect_details": [ + 342.0, + 341.75, + 355.0, + 351.75 + ], + "num_segments": 1 + }, + { + "page": 5, + "highlight_id_on_page": 17, + "text": "summarize our conclusions a", + "color": "yellow", + "raw_rgb_values": [ + 1.0, + 0.9411770105361938, + 0.4000000059604645 + ], + "type": "highlight", + "y_position": 353.75, + "x_position": 60.0, + "rect_details": [ + 60.0, + 353.75, + 170.0, + 363.75 + ], + "num_segments": 1 + }, + { + "page": 5, + "highlight_id_on_page": 18, + "text": "(§6).", + "color": "blue", + "raw_rgb_values": [ + 0.5607839822769165, + 0.8705880045890808, + 0.9764710068702698 + ], + "type": "highlight", + "y_position": 365.75, + "x_position": 220.0, + "rect_details": [ + 220.0, + 365.75, + 236.0, + 375.75 + ], + "num_segments": 1 + }, + { + "page": 5, + "highlight_id_on_page": 19, + "text": "at nonstandard or vernacular dialects", + "color": "yellow", + "raw_rgb_values": [ + 1.0, + 0.9411770105361938, + 0.4000000059604645 + ], + "type": "highlight", + "y_position": 407.75, + "x_position": 206.0, + "rect_details": [ + 206.0, + 407.75, + 340.0, + 417.75 + ], + "num_segments": 1 + }, + { + "page": 5, + "highlight_id_on_page": 20, + "text": "spoken most frequently and fluently by ethnic minorities and/or by less educated, working-class,orpoorpeopleworldwide.1", + "color": "yellow", + "raw_rgb_values": [ + 1.0, + 0.9411770105361938, + 0.4000000059604645 + ], + "type": "highlight", + "y_position": 431.75, + "x_position": 60.0, + "rect_details": [ + 60.0, + 431.75, + 408.0, + 453.75 + ], + "num_segments": 2 + } + ] +} \ No newline at end of file diff --git a/main.py b/main.py index 199cc00..055401d 100644 --- a/main.py +++ b/main.py @@ -1,709 +1,753 @@ -""" -PDF Highlight Extractor -====================== +#!/usr/bin/env python3 +# ============================================================================= +# ENHANCED PDF HIGHLIGHT EXTRACTOR +# Author: Perplexity AI Companion (Updated by User Feedback) +# Date: June 3, 2025 +# License: MIT +# +# Extracts highlights from PDF files, with options for interactive review, +# detailed output, text cleaning, JSON export, and page image viewing. +# ============================================================================= -A robust tool for extracting highlighted text from PDF files with intelligent text ordering -and hyphenation handling. - -Overview: --------- -This tool addresses common PDF text extraction challenges: -- PDFs store text in creation order, not reading order -- Multi-line highlights can extract in wrong sequence -- Hyphenated words across lines need rejoining -- Boundary words may be partially highlighted - -Architecture: ------------- -1. PDFHighlightExtractor: Main class handling extraction logic -2. Multi-method extraction: Fallback system for maximum compatibility -3. Smart text ordering: Line detection and geometric sorting -4. Hyphenation merger: Detects and combines split words - -Technical Approach: ------------------ -METHOD A: PyMuPDF built-in text sorting -- Uses page.get_text("text", sort=True) for automatic ordering -- Most reliable for simple layouts - -METHOD B: Text block extraction -- Extracts PDF text blocks which maintain better reading order -- Geometric sorting by block position - -METHOD C: Enhanced word-level sorting -- Individual word extraction with custom line detection -- Groups words by Y-position, sorts by X-position within lines -- Handles complex multi-line highlights - -Hyphenation Algorithm: --------------------- -1. Detects highlights ending with '-' -2. Checks next highlight for same color and reasonable distance -3. Merges: "lin-" + "guistics" → "linguistics" -4. Supports both same-page and cross-page hyphenation - -Color Detection: ---------------- -- RGB color space analysis -- Supports 4 highlight colors: Yellow, Pink, Green, Blue -- Handles both fill and stroke color properties - -Precision Control: ------------------ -- 40% overlap threshold for word inclusion -- +2 pixel boundary expansion for edge cases -- 5-pixel line tolerance for multi-line detection - -Usage Patterns: --------------- -Test Mode: python script.py --test -- Uses default PDF path -- Display-only output -- Quick testing and debugging - -Full Mode: python script.py -- Interactive prompts for file paths -- Optional JSON/CSV export -- Complete control over options -""" import time -import pdfplumber +import os import fitz # PyMuPDF import json from colorama import init, Fore, Back, Style -import pandas as pd from pathlib import Path import re +import string import sys +import traceback +import argparse +import difflib # For text difference calculation +import tempfile # For temporary image files +import webbrowser # For opening images/PDFs +import uuid # For unique filenames -# Initialize colorama for colored terminal output +# Attempt to import readline for better input() experience on some systems +try: + import readline + READLINE_AVAILABLE = True +except ImportError: + READLINE_AVAILABLE = False # readline not available + +# ============================================================================= +# GLOBAL CONFIGURATION FLAGS (Defaults, can be overridden by CLI args) +# ============================================================================= +DEFAULT_PDF_PATH = "/mnt/c/Users/admin/Downloads/test2.pdf" # Example, adjust if needed +DEFAULT_PAGES_TO_PROCESS = "3" # Example: "1,3-5,all" + +# Default Behavior flags (can be influenced by -d or -s CLI flags) +# These are used to initialize effective_run_args +# Keep these distinct from the effective_run_args object itself +INITIAL_SHOW_TIMING = True +INITIAL_SHOW_PROGRESS = True +INITIAL_SHOW_RAW_SEGMENTS = True +INITIAL_SHOW_EXTRACTION_DETAILS = True +INITIAL_SHOW_RECT_DETAILS = True +INITIAL_SHOW_DIFF_PERCENTAGE = True +INITIAL_CLEAN_EDGES = True + +# Text extraction parameters (generally fixed) +TEXT_EXTRACTION_HORIZONTAL_PADDING = 6.0 +TEXT_EXTRACTION_VERTICAL_PADDING = 1.0 + +# Edge cleaning configuration (generally fixed) +VALID_TWO_LETTER_WORDS = { + 'am', 'an', 'as', 'at', 'be', 'by', 'do', 'go', 'he', 'if', 'in', 'is', 'it', 'me', 'my', + 'no', 'of', 'on', 'or', 'ox', 'so', 'to', 'up', 'us', 'we'} +VALID_SINGLE_LETTERS = {'i', 'a'} + +# Image handling configuration +IMAGE_FOLDER_PATH = 'pdf_page_images' # Relative to CWD by default +CLEAR_IMAGE_FOLDER_ON_START = True +CLEAR_IMAGE_FOLDER_ON_END = False + +# Initialize colorama init(autoreset=True) -class PDFHighlightExtractor: - """ -Main extraction class for PDF highlighted text. +# --- Helper Functions --- +def get_text_diff_ratio(text1, text2): + if not text1 and not text2: return 1.0 + if not text1 or not text2: return 0.0 + return difflib.SequenceMatcher(None, str(text1), str(text2)).ratio() -This class handles the complete extraction pipeline from PDF analysis -to formatted output with intelligent text ordering and hyphenation. - -Key Features: ------------- -- Multi-method text extraction with fallback -- Geometric text ordering for proper reading sequence -- Hyphenation detection and merging -- 4-color highlight support (Yellow, Pink, Green, Blue) -- Cross-page highlight handling - -Extraction Pipeline: ------------------- -1. PDF Loading: Opens PDF with PyMuPDF -2. Annotation Detection: Finds highlight annotations -3. Color Classification: Identifies highlight colors -4. Text Extraction: Uses multi-method approach -5. Text Ordering: Applies geometric sorting -6. Hyphenation Merging: Combines split words -7. Output Formatting: Prepares results for display/export - -Methods Overview: ---------------- -extract_all_highlights(): Main entry point -_extract_text_balanced(): Core text extraction with ordering -_smart_hyphenation_merge(): Hyphenation detection and merging -_is_clear_hyphenation(): Hyphenation pattern recognition -display_results(): Formatted terminal output - -Usage: ------- -extractor = PDFHighlightExtractor('path/to/file.pdf') -annotations, highlights = extractor.extract_all_highlights() -extractor.display_results() -""" -def __init__(self, pdf_path): - self.pdf_path = Path(pdf_path) - self.annotations = [] - self.highlights = [] - -def extract_annotation_highlights(self): - """Extract annotations with simple processing.""" - annotations = [] - try: - with pdfplumber.open(self.pdf_path) as pdf: - print(f"📄 Processing annotations...") - for page_num, page in enumerate(pdf.pages, 1): - if hasattr(page, 'annots') and page.annots: - for annot in page.annots: - try: - annot_type = annot.get('subtype', 'Unknown') - if annot_type in ['Highlight', 'Squiggly', 'StrikeOut', 'Underline', 'FreeText', 'Text']: - rect = annot.get('rect', []) - text = self._get_annotation_text(page, annot, rect) - color = self._get_simple_color(annot.get('color', [])) - - if text and text.strip(): - annotations.append({ - 'page': page_num, - 'text': text.strip(), - 'color': color, - 'type': 'annotation', - 'y_position': rect[1] if len(rect) >= 4 else 0 - }) - except: - continue - - print(f" ✅ Found {len(annotations)} annotations") - except Exception as e: - print(f"❌ Error: {e}") +def clean_segment_edges_func(text_to_clean, clean_edges_setting): + if not clean_edges_setting or not text_to_clean: return text_to_clean + text_to_clean = re.sub(r'\s+', ' ', text_to_clean.strip()) + words = text_to_clean.split() + if not words: return text_to_clean - return annotations + current_idx = 0 + while current_idx < len(words): + token = words[current_idx] + core_token = token.rstrip(string.punctuation) + trailing_punctuation = token[len(core_token):] + if not core_token: words.pop(current_idx); continue + core_should_be_removed = (len(core_token) == 1 and core_token.isalpha() and core_token.lower() not in VALID_SINGLE_LETTERS) or \ + (len(core_token) == 2 and core_token.isalpha() and core_token.lower() not in VALID_TWO_LETTER_WORDS) + if core_should_be_removed: + if trailing_punctuation: words[current_idx] = trailing_punctuation + else: words.pop(current_idx) + continue + break + while words: + token = words[-1] + core_token = token.lstrip(string.punctuation) + leading_punctuation = token[:-len(core_token)] if core_token else "" + if not core_token: words.pop(); continue + core_should_be_removed = (len(core_token) == 1 and core_token.isalpha() and core_token.lower() not in VALID_SINGLE_LETTERS) or \ + (len(core_token) == 2 and core_token.isalpha() and core_token.lower() not in VALID_TWO_LETTER_WORDS) + if core_should_be_removed: + if leading_punctuation: words[-1] = leading_punctuation + else: words.pop() + continue + break + return ' '.join(words) -def extract_background_highlights(self): - """Extract highlights with BALANCED precision - capture complete highlights.""" - all_highlights = [] +def input_with_prefill(prompt, text): + if READLINE_AVAILABLE: + def hook(): + readline.insert_text(text) + readline.redisplay() + readline.set_pre_input_hook(hook) + result = input(prompt) + readline.set_pre_input_hook() + return result + else: + print(Fore.MAGENTA + "Current text (edit below):\n" + Style.RESET_ALL + f"{text}") + return input(prompt) + +def _clear_png_files_in_folder(folder_path_str, run_args_for_print_control): + # This function CLEARS files if folder exists. It DOES NOT CREATE the folder. + if not folder_path_str: return - try: - print(f"\n🎨 Processing highlights...") - doc = fitz.open(str(self.pdf_path)) + folder = Path(folder_path_str) # Path relative to CWD if not absolute + abs_folder_path = folder.resolve() + + if run_args_for_print_control.debug: + print(Fore.CYAN + f" [Debug] _clear_png_files_in_folder: Checking {abs_folder_path} (Specified as: '{folder_path_str}')") + + if abs_folder_path.is_dir(): + if run_args_for_print_control.show_progress: + print(Fore.BLUE + f"Clearing *.png files from {abs_folder_path}...") + cleared_count = 0 + try: + for file_path in abs_folder_path.glob("*.png"): + if file_path.is_file(): + file_path.unlink() + cleared_count +=1 + except Exception as e: + if run_args_for_print_control.show_progress: # Also show error if progress is on + print(Fore.RED + f"Error during file deletion in {abs_folder_path}: {e}") - # Collect each individual highlight with BALANCED extraction - for page_num in range(doc.page_count): - page = doc[page_num] - annotations = page.annots() - - for annot in annotations: + if run_args_for_print_control.show_progress: + if cleared_count > 0: + print(Fore.BLUE + f"Cleared {cleared_count} *.png files from {abs_folder_path}.") + else: + print(Fore.BLUE + f"No *.png files found to clear in {abs_folder_path}.") + else: + if run_args_for_print_control.show_progress: + print(Fore.YELLOW + f"Image folder {abs_folder_path} not found, skipping clear.") + elif run_args_for_print_control.debug: # Still log if not found in debug, even if not show_progress + print(Fore.CYAN + f" [Debug] _clear_png_files_in_folder: Folder {abs_folder_path} does not exist. Nothing to clear.") + + +class EnhancedPDFHighlightExtractor: + def __init__(self, pdf_path, effective_run_args, main_doc_for_image_view=None): + self.pdf_path = Path(pdf_path) + self.run_args = effective_run_args + self.pdf_filename_stem = self.pdf_path.stem + self.highlights_data = [] + self.main_doc_for_image_view = main_doc_for_image_view + + def _get_highlight_color_from_rgb_tuple(self, rgb_tuple_floats_or_ints): + if not rgb_tuple_floats_or_ints or len(rgb_tuple_floats_or_ints) < 3 : return 'unknown_color' + r, g, b = [int(x * 255) if isinstance(x, float) and 0.0 <= x <= 1.0 else int(x) for x in rgb_tuple_floats_or_ints[:3]] + if r == 142 and g == 221 and b == 249: return 'blue' + if r > 200 and g > 200 and b < 150: return 'yellow' + if r < 150 and g > 180 and b < 150: return 'green' + if r < 150 and g < 180 and b > 180: return 'blue' + if r > 180 and g < 180 and b > 180: return 'pink' + return 'other_color' + + def _get_highlight_color_from_annot_colors_dict(self, colors_dict): + if not colors_dict: return 'unknown_color', None + rgb_tuple = colors_dict.get('stroke') or colors_dict.get('fill') + if not rgb_tuple: return 'unknown_color', None + return self._get_highlight_color_from_rgb_tuple(rgb_tuple), rgb_tuple[:3] + + def _extract_text_from_multi_segment_highlight(self, page, annot, page_num, hl_id): + overall_highlight_color_name, _ = self._get_highlight_color_from_annot_colors_dict(annot.colors) + color_code_for_segment_print = self._get_color_display_codes(overall_highlight_color_name) + quads_vertices = annot.vertices + if not quads_vertices: + if self.run_args.show_extraction_details: print(Fore.YELLOW + f" No quads for HL {hl_id} on page {page_num}") + return None, 0, [] + + processed_quads_as_points_list = [] + if len(quads_vertices) % 4 == 0: + for i in range(0, len(quads_vertices), 4): try: - if annot.type[1] == 'Highlight': - colors = annot.colors - color_name = self._get_highlight_color(colors) - - if color_name in ['yellow', 'pink', 'green', 'blue']: - # BALANCED: Extract complete highlighted phrases - text = self._extract_text_balanced(page, annot) - - if text and text.strip(): - all_highlights.append({ - 'page': page_num + 1, - 'text': text.strip(), - 'color': color_name, - 'type': 'highlight', - 'y_position': annot.rect.y0, - 'x_position': annot.rect.x0, - 'y_end': annot.rect.y1, - 'x_end': annot.rect.x1, - 'rect': annot.rect - }) - print(f" 🎨 {color_name.upper()}: \"{text[:70]}...\"") + quad_points = [fitz.Point(p) for p in quads_vertices[i:i+4]] + processed_quads_as_points_list.append(quad_points) except Exception as e: + if self.run_args.show_extraction_details: print(Fore.YELLOW + f" Skipping malformed quad points: {e}") continue - doc.close() - - # Smart hyphenation merging only - merged_highlights = self._smart_hyphenation_merge(all_highlights) - - print(f" 📊 Raw: {len(all_highlights)} → Merged: {len(merged_highlights)}") - return merged_highlights - - except Exception as e: - print(f"❌ Error: {e}") - return [] - -def _extract_text_balanced(self, page, annot): - """BALANCED: Extract text with PROPER READING ORDER.""" - try: - # Method 1: Use PyMuPDF's built-in text ordering with sorting - highlight_rect = annot.rect - - # SMALL EXPANSION for boundary words - expanded_rect = fitz.Rect( - highlight_rect.x0 - 2, - highlight_rect.y0 - 1, - highlight_rect.x1 + 2, - highlight_rect.y1 + 1 - ) - - # METHOD A: Use text extraction with BUILT-IN SORTING - print(f" 🔍 Method A: Text extraction with sorting") - text_with_sort = page.get_text("text", clip=expanded_rect, sort=True) - if text_with_sort and text_with_sort.strip(): - cleaned_text = re.sub(r'\s+', ' ', text_with_sort.strip()) - print(f" ✅ Sorted text result: \"{cleaned_text}\"") - return cleaned_text - - # METHOD B: Text blocks (better reading order than individual words) - print(f" 🔍 Method B: Text blocks extraction") - text_blocks = page.get_text("blocks", clip=expanded_rect) - if text_blocks: - # Sort blocks by reading order (top to bottom, left to right) - text_blocks.sort(key=lambda block: (block[1], block[0])) # y-pos, then x-pos - - block_texts = [] - for block in text_blocks: - if len(block) >= 5 and block[4].strip(): - block_text = block[4].strip() - block_text = re.sub(r'\s+', ' ', block_text) - block_texts.append(block_text) - - if block_texts: - combined_text = " ".join(block_texts) - print(f" ✅ Block result: \"{combined_text}\"") - return combined_text - - # METHOD C: Enhanced word-level with geometric sorting - print(f" 🔍 Method C: Enhanced word sorting") - all_words = page.get_text("words") - highlight_words = [] - - for word in all_words: - word_rect = fitz.Rect(word[:4]) - word_text = word[4] - - if expanded_rect.intersects(word_rect): - intersection = expanded_rect & word_rect - word_area = word_rect.get_area() - - if word_area > 0: - overlap_ratio = intersection.get_area() / word_area - - if overlap_ratio >= 0.40: - highlight_words.append({ - 'text': word_text, - 'x0': word[0], - 'y0': word[1], - 'x1': word[2], - 'y1': word[3], - 'center_y': (word[1] + word[3]) / 2, - 'center_x': (word[0] + word[2]) / 2 - }) - - if highlight_words: - # ENHANCED SORTING: Group by lines first, then sort within lines - # Group words by approximate line (within 5 pixels of each other) - lines = [] - for word in highlight_words: - placed = False - for line in lines: - # Check if word belongs to existing line - avg_y = sum(w['center_y'] for w in line) / len(line) - if abs(word['center_y'] - avg_y) <= 5: # Same line tolerance - line.append(word) - placed = True - break - - if not placed: - lines.append([word]) - - # Sort lines by Y position (top to bottom) - lines.sort(key=lambda line: sum(w['center_y'] for w in line) / len(line)) - - # Sort words within each line by X position (left to right) - for line in lines: - line.sort(key=lambda w: w['center_x']) - - # Combine all words in reading order - ordered_words = [] - for line in lines: - ordered_words.extend(line) - - extracted_text = " ".join([w['text'] for w in ordered_words]) - print(f" ✅ Enhanced word sorting ({len(ordered_words)} words): \"{extracted_text}\"") - return extracted_text - - print(f" ❌ No text found in highlight area") - return "" - - except Exception as e: - print(f" ❌ Extraction error: {e}") - return "" - - -def _extract_by_quads_balanced(self, page, annot): - """Extract using quad points with BALANCED precision.""" - try: - quad_points = annot.vertices - if not quad_points: - return "" - - quad_count = int(len(quad_points) / 4) - all_words = page.get_text("words") - highlight_words = [] - - print(f" 🔍 Processing {quad_count} quads with balanced precision") - - for i in range(quad_count): - points = quad_points[i * 4: i * 4 + 4] - quad_rect = fitz.Quad(points).rect - - # SMALL EXPANSION - 2 pixels to catch boundary words - expanded_quad = fitz.Rect( - quad_rect.x0 - 2, quad_rect.y0 - 1, - quad_rect.x1 + 2, quad_rect.y1 + 1 - ) - - for word in all_words: - word_rect = fitz.Rect(word[:4]) - word_text = word[4] - - if expanded_quad.intersects(word_rect): - intersection = expanded_quad & word_rect - word_area = word_rect.get_area() - - if word_area > 0: - overlap_ratio = intersection.get_area() / word_area - - # RELAXED: 40% overlap required (was 75%) - if overlap_ratio >= 0.40: - highlight_words.append({ - 'text': word_text, - 'x0': word[0], - 'y0': word[1], - 'line': self._estimate_line_number(word[1]) - }) - print(f" ✓ Quad '{word_text}' (overlap: {overlap_ratio:.2f})") - - if highlight_words: - # Remove duplicates while preserving order - seen = set() - unique_words = [] - for word in highlight_words: - word_key = (word['text'], word['x0'], word['y0']) - if word_key not in seen: - seen.add(word_key) - unique_words.append(word) - - # Sort by reading order - unique_words.sort(key=lambda w: (w['line'], w['x0'])) - extracted_text = " ".join([w['text'] for w in unique_words]) - print(f" ✅ Quad balanced ({len(unique_words)} words): \"{extracted_text}\"") - return extracted_text - - return "" - - except Exception as e: - print(f" ❌ Quad extraction error: {e}") - return "" - -def _estimate_line_number(self, y_position, avg_line_height=14): - """Estimate line number based on y-position.""" - return round(y_position / avg_line_height) - -def _smart_hyphenation_merge(self, highlights): - """Smart merging - ONLY for clear hyphenation patterns.""" - if not highlights: - return highlights - - # Sort by page, color, then position - highlights.sort(key=lambda x: (x['page'], x['color'], x['y_position'], x['x_position'])) - - merged = [] - i = 0 - - while i < len(highlights): - current = highlights[i] - - # Look for hyphenation continuation - if (i + 1 < len(highlights) and - self._is_clear_hyphenation(current, highlights[i + 1])): - - next_hl = highlights[i + 1] - merged_text = self._join_hyphenated_text(current['text'], next_hl['text']) - - merged_highlight = current.copy() - merged_highlight['text'] = merged_text - - if current['page'] != next_hl['page']: - merged_highlight['pages_spanned'] = f"Pages {current['page']}-{next_hl['page']}" - print(f" 🔗 Cross-page hyphen: \"{merged_text[:80]}\"") - else: - merged_highlight['hyphen_merged'] = True - print(f" 🔗 Same-page hyphen: \"{merged_text[:80]}\"") - - merged.append(merged_highlight) - i += 2 # Skip both highlights - else: - merged.append(current) - i += 1 - - return merged - -def _is_clear_hyphenation(self, hl1, hl2): - """Detect ONLY clear hyphenation patterns.""" - # Must be same color - if hl1['color'] != hl2['color']: - return False - - text1 = hl1['text'].strip() - text2 = hl2['text'].strip() - - # MUST end with hyphen for hyphenation - if not text1.endswith('-'): - return False - - # Same page: check reasonable line spacing - if hl1['page'] == hl2['page']: - y_diff = abs(hl1['y_position'] - hl2['y_position']) - # Reasonable line height (8-30 pixels) - slightly more lenient - if 8 <= y_diff <= 30 and hl2['y_position'] > hl1['y_position']: - print(f" 🔍 Same-page hyphen detected: '{text1}' + '{text2[:15]}'") - return True - - # Cross-page: second highlight should be near top - elif hl2['page'] == hl1['page'] + 1 and hl2['y_position'] < 150: - print(f" 🔍 Cross-page hyphen detected: '{text1}' + '{text2[:15]}'") - return True - - return False - -def _join_hyphenated_text(self, text1, text2): - """Join hyphenated text correctly.""" - text1 = text1.strip() - text2 = text2.strip() - - if text1.endswith('-'): - # Remove hyphen and join - return text1[:-1] + text2 - else: - return text1 + " " + text2 - -def _get_highlight_color(self, colors): - """Get highlight color - only 4 colors.""" - if not colors: - return 'unknown' - - if 'fill' in colors and colors['fill']: - rgb = colors['fill'] - elif 'stroke' in colors and colors['stroke']: - rgb = colors['stroke'] - else: - return 'unknown' - - return self._rgb_to_simple_color(rgb) -def _rgb_to_simple_color(self, rgb): - """Convert RGB to one of 4 colors.""" - if not rgb or len(rgb) < 3: - return 'unknown' - - r, g, b = rgb[:3] - - if r <= 1: - r, g, b = r*255, g*255, b*255 - - if r > 220 and g > 220 and b < 120: - return 'yellow' - elif r < 120 and g > 180 and b < 120: - return 'green' - elif r < 120 and g < 180 and b > 180: - return 'blue' - elif r > 180 and g < 180 and b > 180: - return 'pink' - else: - max_val = max(r, g, b) - if max_val == r and r > 150: - return 'pink' - elif max_val == g and g > 150: - return 'green' - elif max_val == b and b > 150: - return 'blue' - elif r > 180 and g > 180: - return 'yellow' - return 'unknown' - -def _get_simple_color(self, color_rgb): - """Get simple color from annotation.""" - if color_rgb: - return self._rgb_to_simple_color(color_rgb) - return 'unknown' - -def _get_annotation_text(self, page, annot, rect): - """Extract annotation text.""" - text = annot.get('contents', '').strip() - if text: - return text - - if rect and len(rect) == 4: try: - x0, y0, x1, y1 = rect - cropped = page.crop((x0-1, y0-1, x1+1, y1+1)) - text = cropped.extract_text() - if text and text.strip(): - return text.strip() - except: - pass - - return "" + sorted_quad_points_list = sorted(processed_quads_as_points_list, key=lambda qp_list: (fitz.Quad(qp_list).rect.y0, fitz.Quad(qp_list).rect.x0)) + except Exception as e: + if self.run_args.show_extraction_details: print(Fore.RED + f" Error sorting quads for HL {hl_id}: {e}. Using original order.") + sorted_quad_points_list = processed_quads_as_points_list -def extract_all_highlights(self): - """Main extraction method.""" - print("🔍 PDF Highlight Extractor - BALANCED PRECISION") - print("🎯 Colors: Yellow, Pink, Green, Blue only") - print("🎯 BALANCED extraction - complete highlights without over-capture") - print("📏 Small expansion (+2 pixels) for boundary words") - print("🔍 40% overlap requirement (was 75% - more inclusive)") - print("🔗 Smart hyphenation merging") - print("=" * 70) - - self.annotations = self.extract_annotation_highlights() - self.highlights = self.extract_background_highlights() - - print(f"\n✨ Total: {len(self.annotations)} annotations, {len(self.highlights)} highlights") - return self.annotations, self.highlights + if self.run_args.show_extraction_details: + print(color_code_for_segment_print + Fore.CYAN + f" Processing {len(sorted_quad_points_list)} segments for HL {hl_id} (Color: {overall_highlight_color_name.upper()}) on page {page_num}" + Style.RESET_ALL) -def display_results(self): - """Display results cleanly.""" - print("\n" + "="*70) - print("📋 EXTRACTION RESULTS") - print("="*70) - - all_items = [] - for item in self.annotations: - item['category'] = 'annotation' - all_items.append(item) - for item in self.highlights: - item['category'] = 'highlight' - all_items.append(item) - - if not all_items: - print("\n❌ No highlights found") - return - - all_items.sort(key=lambda x: (x['page'], x['y_position'])) - - current_page = None - for item in all_items: - if item['page'] != current_page: - current_page = item['page'] - print(f"\n📄 Page {current_page}") - print("-" * 25) + segment_texts_final = [] + raw_segment_texts_for_diff = [] + for seg_idx, quad_points in enumerate(sorted_quad_points_list): + try: + bounds = fitz.Quad(quad_points).rect + padded_rect = fitz.Rect(bounds.x0 - TEXT_EXTRACTION_HORIZONTAL_PADDING, bounds.y0 - TEXT_EXTRACTION_VERTICAL_PADDING, + bounds.x1 + TEXT_EXTRACTION_HORIZONTAL_PADDING, bounds.y1 + TEXT_EXTRACTION_VERTICAL_PADDING) + padded_rect.intersect(page.rect) + if padded_rect.is_empty: + if self.run_args.show_extraction_details: print(Fore.YELLOW + f" Segment {seg_idx+1} empty padded_rect for HL {hl_id}") + continue + raw_text_from_pdf_segment = page.get_text("text", clip=padded_rect, sort=True).strip() + raw_segment_texts_for_diff.append(raw_text_from_pdf_segment) + cleaned_text_segment = re.sub(r'\s+', ' ', raw_text_from_pdf_segment).strip() + cleaned_text_segment = re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F-\x9F]', '', cleaned_text_segment) + final_text_segment = clean_segment_edges_func(cleaned_text_segment, self.run_args.clean_edges) + + if final_text_segment: + segment_texts_final.append(final_text_segment) + if self.run_args.show_raw_segments and not self.run_args.interactive: + print(color_code_for_segment_print + Fore.LIGHTBLUE_EX + f" Segment {seg_idx+1} (P{page_num}, HL{hl_id}, Color: {overall_highlight_color_name.upper()}):" + Style.RESET_ALL) + if self.run_args.show_diff_percentage: + similarity = get_text_diff_ratio(raw_text_from_pdf_segment, final_text_segment) + diff_percent = (1 - similarity) * 100 + print(Fore.LIGHTMAGENTA_EX + f" Raw PDF : \"{raw_text_from_pdf_segment}\"") + print(Fore.LIGHTBLUE_EX + f" Final Seg: \"{final_text_segment}\"") + print(Fore.YELLOW + f" Diff: {diff_percent:.2f}%") + else: print(Fore.LIGHTBLUE_EX + f" Final Seg: \"{final_text_segment}\"") + except Exception as e: + if self.run_args.show_extraction_details: print(Fore.RED + f" Error processing segment {seg_idx+1} for HL {hl_id}: {e}") + raw_segment_texts_for_diff.append("") + continue + + if not segment_texts_final: return None, len(sorted_quad_points_list), raw_segment_texts_for_diff + combined_text = segment_texts_final[0] + for i in range(1, len(segment_texts_final)): + prev_text = combined_text; current_text = segment_texts_final[i] + if prev_text.endswith('-') or prev_text.endswith('¬'): combined_text = prev_text.rstrip('-¬') + current_text + else: combined_text += ' ' + current_text - color_code = self._get_color_display(item['color']) - icon = "📝" if item['category'] == 'annotation' else "🎨" + if self.run_args.clean_edges: combined_text = clean_segment_edges_func(combined_text, self.run_args.clean_edges) + combined_text = re.sub(r'\s+', ' ', combined_text).strip() + return combined_text if combined_text else None, len(sorted_quad_points_list), raw_segment_texts_for_diff + + def extract_highlights(self, doc): + all_extracted_highlights = [] + try: + if self.run_args.show_progress and not self.run_args.interactive: + print(Fore.BLUE + f"\n🎨 Processing highlights for PDF: {self.pdf_path.name}") + + pages_str_to_parse = self.run_args.pages if self.run_args.pages else DEFAULT_PAGES_TO_PROCESS + pages_to_process = self._parse_specific_pages(pages_str_to_parse, doc.page_count) + if not pages_to_process: + if self.run_args.show_progress: print(Fore.YELLOW + "No valid pages selected.") + return [] + + highlight_id_counter_on_page = {} + for page_num in pages_to_process: + page = doc.load_page(page_num - 1) + highlight_id_counter_on_page.setdefault(page_num, 0) + if self.run_args.show_progress and not self.run_args.interactive: + print(Fore.CYAN + f" 📄 Processing Page {page_num}...") + try: page_annotations = list(page.annots()) + except Exception as e: + if self.run_args.show_progress: print(Fore.RED + f" ⚠️ Error loading annots: {e}") + continue + + highlight_annotations = [a for a in page_annotations if hasattr(a, 'type') and a.type[0] == fitz.PDF_ANNOT_HIGHLIGHT and hasattr(a, 'rect') and a.rect] + if not highlight_annotations: + if self.run_args.show_progress and not self.run_args.interactive: print(Fore.WHITE + f" No highlights on page {page_num}.") + continue + + if self.run_args.show_rect_details: + print(Fore.YELLOW + f"--- Annotations before sorting (Page {page_num}) ---") + temp_debug_list = [] + for annot_debug in highlight_annotations: + debug_text_snippet = page.get_text("text", clip=annot_debug.rect).strip().replace("\n", " ") + color_name_debug, rgb_values_debug = self._get_highlight_color_from_annot_colors_dict(annot_debug.colors) + rgb_display = f"RGB: {tuple(int(c*255) if isinstance(c,float) else int(c) for c in rgb_values_debug[:3])}" if rgb_values_debug else "RGB: N/A" + temp_debug_list.append({ + "rect": annot_debug.rect, "text_snippet": debug_text_snippet, "color_name": color_name_debug, + "rgb_display": rgb_display, "vertices_count": len(annot_debug.vertices) if annot_debug.vertices else 0 }) + temp_debug_list.sort(key=lambda item: (item["rect"].y0, item["rect"].x0)) + for item_idx, item_val in enumerate(temp_debug_list): + print(f" {item_idx+1}. Rect: {item_val['rect']}, Vertices: {item_val['vertices_count']}, Color: {item_val['color_name'].upper()} ({item_val['rgb_display']}), Text: \"{item_val['text_snippet']}\"") + print(Fore.YELLOW + "----------------------------------------------------") + + highlight_annotations.sort(key=lambda a: (a.rect.y0, a.rect.x0)) + for annot in highlight_annotations: + try: + highlight_id_counter_on_page[page_num] += 1; current_hl_id_on_page = highlight_id_counter_on_page[page_num] + color_name, raw_rgb_floats = self._get_highlight_color_from_annot_colors_dict(annot.colors) + extracted_text, num_segments, _ = self._extract_text_from_multi_segment_highlight(page, annot, page_num, current_hl_id_on_page) + if extracted_text and extracted_text.strip(): + if self.run_args.show_extraction_details and not self.run_args.interactive: + print(Fore.GREEN + f" ✅ Final (P{page_num}, HL{current_hl_id_on_page}): \"{extracted_text[:100]}\"") + all_extracted_highlights.append({ + 'page': page_num, 'highlight_id_on_page': current_hl_id_on_page, 'text': extracted_text, + 'color': color_name, 'raw_rgb_values': raw_rgb_floats, 'type': 'highlight', + 'y_position': annot.rect.y0, 'x_position': annot.rect.x0, + 'rect_details': (annot.rect.x0, annot.rect.y0, annot.rect.x1, annot.rect.y1), + 'num_segments': num_segments }) + elif self.run_args.show_progress and not self.run_args.interactive: + print(Fore.YELLOW + f" ⚠️ No text for HL {current_hl_id_on_page} on page {page_num}") + except Exception as e: + if self.run_args.show_progress and not self.run_args.interactive: + print(Fore.RED + f" 🔴 Error processing annot on page {page_num}: {e}") + if self.run_args.debug: traceback.print_exc() + continue + + if self.run_args.interactive: + print(Fore.MAGENTA + "\nEntering interactive review session...") + self.highlights_data = self._interactive_review_session(all_extracted_highlights) + else: self.highlights_data = all_extracted_highlights + + if self.run_args.show_progress and not self.run_args.interactive and not self.run_args.silent: + print(Fore.MAGENTA + f" 📊 Total highlights extracted: {len(self.highlights_data)}") + return self.highlights_data + except Exception as e: + print(Fore.RED + f"❌ Major error during highlight extraction: {e}") + if self.run_args.debug: traceback.print_exc() + return [] + + def _view_page_image_interactively(self, page_num_to_view): + if not self.main_doc_for_image_view: + print(Fore.RED + "Error: PDF document not available for image rendering. This should not happen.") + return + + tmp_image_path_obj = None + image_created_in_managed_folder = False + image_successfully_saved = False + + if self.run_args.show_progress: + print(Fore.BLUE + f"Preparing to view image for page {page_num_to_view}...") + + try: + page_index = page_num_to_view - 1 + page = self.main_doc_for_image_view.load_page(page_index) + if self.run_args.debug: + print(Fore.CYAN + f" [Debug] Loaded page object for index {page_index}: {page}") + + pix = page.get_pixmap(dpi=150) + if self.run_args.debug: + print(Fore.CYAN + f" [Debug] Created pixmap: {pix}. Alpha: {pix.alpha}, Colorspace: {pix.colorspace.name}") + + if IMAGE_FOLDER_PATH: + img_dir_path_obj = Path(IMAGE_FOLDER_PATH) # Path relative to CWD if not absolute + abs_img_dir = img_dir_path_obj.resolve() + + if self.run_args.debug: + print(Fore.CYAN + f" [Debug] Using IMAGE_FOLDER_PATH: '{IMAGE_FOLDER_PATH}' (Absolute: {abs_img_dir})") + + try: + abs_img_dir.mkdir(parents=True, exist_ok=True) + if self.run_args.debug: + print(Fore.CYAN + f" [Debug] Ensured image directory exists: {abs_img_dir} (Status: {abs_img_dir.is_dir()})") + except Exception as e_mkdir: + print(Fore.RED + f" ERROR: Could not create directory {abs_img_dir}: {e_mkdir}") + if self.run_args.debug: traceback.print_exc() + # Do not proceed if directory creation fails + input(Fore.CYAN + "Press Enter to acknowledge and continue...") + return + + + unique_id = uuid.uuid4().hex[:8] + tmp_image_path_obj = abs_img_dir / f"page_{page_num_to_view}_{unique_id}.png" + image_created_in_managed_folder = True + else: + fd, temp_path_str = tempfile.mkstemp(suffix=".png", prefix="pdf_page_img_") + os.close(fd) + tmp_image_path_obj = Path(temp_path_str) + if self.run_args.debug: + print(Fore.CYAN + f" [Debug] Using system temporary file: {tmp_image_path_obj.resolve()}") + + resolved_save_path = tmp_image_path_obj.resolve() + if self.run_args.debug: + print(Fore.CYAN + f" [Debug] Attempting to save image to: {resolved_save_path}") + + pix.save(str(resolved_save_path)) + + if resolved_save_path.exists() and resolved_save_path.is_file(): + image_successfully_saved = True + if self.run_args.show_progress: # Print for normal progress too, not just debug + print(Fore.GREEN + f" Image for page {page_num_to_view} successfully saved to: {resolved_save_path}") + if self.run_args.debug: + print(Fore.CYAN + f" [Debug] File size: {resolved_save_path.stat().st_size} bytes") + else: + if self.run_args.show_progress: + print(Fore.RED + f" ERROR: Failed to save image to {resolved_save_path}. File does not exist after save attempt.") - merge_info = "" - if item.get('pages_spanned'): - merge_info = f" ({item['pages_spanned']})" - elif item.get('hyphen_merged'): - merge_info = " (hyphen-merged)" + except Exception as e_render_save: + if self.run_args.show_progress: + print(Fore.RED + f" Error during image rendering or saving: {e_render_save}") + if self.run_args.debug: + traceback.print_exc() - print(f"{icon} {color_code}{item['color'].upper()}{Style.RESET_ALL}{merge_info}") - print(f" \"{item['text']}\"") + if image_successfully_saved and tmp_image_path_obj: + if self.run_args.show_progress: + print(Fore.CYAN + f"Attempting to open image with default application...") + try: + file_uri = tmp_image_path_obj.resolve().as_uri() + if self.run_args.debug: + print(Fore.CYAN + f" [Debug] Opening URI: {file_uri}") -def _get_color_display(self, color_name): - """Terminal color codes.""" - colors = { - 'yellow': Back.YELLOW + Fore.BLACK, - 'green': Back.GREEN + Fore.BLACK, - 'blue': Back.BLUE + Fore.WHITE, - 'pink': Back.MAGENTA + Fore.WHITE, - } - return colors.get(color_name, Back.WHITE + Fore.BLACK) + opened_successfully = webbrowser.open(file_uri) + + if self.run_args.debug: # More detailed feedback in debug mode + print(Fore.CYAN + f" [Debug] webbrowser.open() returned: {opened_successfully}") -def save_to_json(self, annotations, highlights, output_path): - """Save to JSON.""" - data = { - 'annotations': annotations, - 'highlights': highlights, - 'summary': { - 'total_annotations': len(annotations), - 'total_highlights': len(highlights) - } - } - with open(output_path, 'w', encoding='utf-8') as f: - json.dump(data, f, indent=2, ensure_ascii=False) - print(f"💾 Saved to {output_path}") + if not opened_successfully: + if self.run_args.show_progress: + print(Fore.YELLOW + " webbrowser.open() reported failure (returned False or None).") + print(Fore.YELLOW + f" This often means no default application is configured for PNG files or your browser.") + elif self.run_args.show_progress: + print(Fore.GREEN + " Image hopefully opened. Check your applications.") + + if self.run_args.show_progress: + print(Fore.YELLOW + f" If the image did not open, please manually open: {tmp_image_path_obj.resolve()}") + input(Fore.CYAN + "Press Enter after viewing image to continue...") -def save_to_csv(self, annotations, highlights, output_path): - """Save to CSV.""" - all_items = [] - for item in annotations: - item_copy = item.copy() - item_copy['category'] = 'annotation' - all_items.append(item_copy) - for item in highlights: - item_copy = item.copy() - item_copy['category'] = 'highlight' - all_items.append(item_copy) - - df = pd.DataFrame(all_items) - df.to_csv(output_path, index=False, encoding='utf-8') - print(f"📊 Saved to {output_path}") + except Exception as e_open: + if self.run_args.show_progress: + print(Fore.RED + f" Could not open image using webbrowser: {e_open}") + print(Fore.YELLOW + " This could be due to your system's environment (e.g., missing 'xdg-utils' on Linux, no default PNG viewer).") + print(Fore.YELLOW + f" Please try opening the image manually: {tmp_image_path_obj.resolve()}") + if self.run_args.debug: + traceback.print_exc() + input(Fore.CYAN + "Press Enter to acknowledge and continue...") + elif tmp_image_path_obj : + if self.run_args.show_progress: + print(Fore.YELLOW + " Skipping attempt to open image as it was not saved successfully.") + input(Fore.CYAN + "Press Enter to continue...") + else: + if self.run_args.show_progress: + print(Fore.RED + " Cannot attempt to open image as image path was not determined.") + input(Fore.CYAN + "Press Enter to continue...") + + finally: + if tmp_image_path_obj and tmp_image_path_obj.exists(): + if image_created_in_managed_folder: + if self.run_args.debug: + print(Fore.CYAN + f" [Debug] Image '{tmp_image_path_obj.name}' remains in managed folder '{IMAGE_FOLDER_PATH}'.") + print(Fore.CYAN + f" [Debug] It will be cleared based on CLEAR_IMAGE_FOLDER_ON_END ({CLEAR_IMAGE_FOLDER_ON_END}).") + else: + try: + tmp_image_path_obj.unlink() + if self.run_args.debug: + print(Fore.CYAN + f" [Debug] Deleted system temporary image: {tmp_image_path_obj.resolve()}") + except Exception as e_unlink: + if self.run_args.debug: + print(Fore.YELLOW + f" Warning: Could not delete system temp image {tmp_image_path_obj.resolve()}: {e_unlink}") + elif tmp_image_path_obj and not tmp_image_path_obj.exists() and image_successfully_saved: + if self.run_args.debug: + print(Fore.RED + f" [Debug] Inconsistency: Image was marked saved, but {tmp_image_path_obj.resolve()} does not exist at cleanup (and wasn't a system temp explicitly deleted here).") -def is_test_mode(): - """Check if script is run in test mode.""" - test_flags = ['--test', '-t', 'test'] - return any(flag in sys.argv for flag in test_flags) + def _interactive_review_session(self, highlights_list): + if not highlights_list: + if self.run_args.show_progress : print(Fore.YELLOW + "No highlights to review.") + return [] + reviewed_highlights = [dict(h) for h in highlights_list] + idx, num_highlights = 0, len(reviewed_highlights) + AVAILABLE_COLORS = ['yellow', 'green', 'blue', 'pink', 'other_color', 'unknown_color'] + + while 0 <= idx < num_highlights: + item = reviewed_highlights[idx] + print(Style.RESET_ALL + "\n" + "="*15 + f" Review HL {idx+1}/{num_highlights} (Page {item['page']}) " + "="*15) + + current_color_display = self._get_color_display_codes(item['color']) + print(f"Color: {current_color_display}{item['color'].upper()}{Style.RESET_ALL}", end="") + if item['color'] == 'other_color' and item.get('raw_rgb_values'): + rgb = item['raw_rgb_values'][:3] + rgb_disp = tuple(int(c*255) if isinstance(c,float) else int(c) for c in rgb) + print(f" (RGB: {rgb_disp})", end="") + print() + + print(f"Text: {item['text']}") + + prompt_options = ["[N]ext", "[P]rev", "[U]p", "[M]ove Down", "[C]olor", "[E]dit", "[D]elete", "[O]pen Img", "[S]ave&Exit", "[Q]uit"] + action_prompt_str = Fore.CYAN + ", ".join(prompt_options) + "? > " + Style.RESET_ALL + action = input(action_prompt_str).lower().strip() + + if action == 'n': idx = (idx + 1) % num_highlights if num_highlights > 0 else 0 + elif action == 'p': idx = (idx - 1 + num_highlights) % num_highlights if num_highlights > 0 else 0 + elif action == 'u': + if idx > 0: + reviewed_highlights.insert(idx - 1, reviewed_highlights.pop(idx)) + idx -= 1 + print(Fore.GREEN + "Moved up.") + else: print(Fore.YELLOW + "Already at the top.") + elif action == 'm': + if idx < num_highlights - 1: + reviewed_highlights.insert(idx + 1, reviewed_highlights.pop(idx)) + idx += 1 + print(Fore.GREEN + "Moved down.") + else: print(Fore.YELLOW + "Already at the bottom.") + elif action == 'c': + print("Available colors:", ", ".join(f"{i+1}.{self._get_color_display_codes(co)}{co.upper()}{Style.RESET_ALL}" for i,co in enumerate(AVAILABLE_COLORS))) + try: + choice_str = input(Fore.YELLOW + "Enter number for new color: " + Style.RESET_ALL) + if not choice_str: print(Fore.BLUE + "Color change cancelled (no input)."); continue + choice = int(choice_str) - 1 + if 0 <= choice < len(AVAILABLE_COLORS): + item['color'] = AVAILABLE_COLORS[choice] + print(Fore.GREEN + f"Color changed to {AVAILABLE_COLORS[choice].upper()}.") + else: print(Fore.RED + "Invalid color choice.") + except ValueError: print(Fore.RED + "Invalid input. Please enter a number.") + elif action == 'e': + edit_prompt = Fore.YELLOW + "New text (blank=keep, 'CLEAR'=empty): > " + Style.RESET_ALL + new_text = input_with_prefill(edit_prompt, item['text']) + + if new_text.strip().upper() == 'CLEAR': + item['text'] = "" + print(Fore.GREEN + "Text cleared.") + elif new_text == item['text'] or not new_text.strip() : + print(Fore.BLUE + "Text kept as is.") + else: + item['text'] = new_text + print(Fore.GREEN + "Text updated.") + elif action == 'd': + if input(Fore.RED + "Are you sure you want to delete this highlight? [y/N]: " + Style.RESET_ALL).lower() == 'y': + reviewed_highlights.pop(idx) + num_highlights = len(reviewed_highlights) + print(Fore.GREEN + "Highlight deleted.") + if num_highlights == 0: + print(Fore.YELLOW + "No more highlights to review."); break + if idx >= num_highlights: idx = num_highlights - 1 + else: print(Fore.BLUE + "Deletion cancelled.") + elif action == 'o': self._view_page_image_interactively(item['page']) + elif action == 's': + print(Fore.GREEN + "Saving changes and exiting review session.") + break + elif action == 'q': + if input(Fore.RED+"Are you sure you want to quit review? Changes will not be saved. [y/N]: " + Style.RESET_ALL).lower()=='y': + print(Fore.YELLOW+"Quitting review session. Changes made in this session are DISCARDED.") + return highlights_list + else: + print(Fore.BLUE + "Quit cancelled.") + else: print(Fore.RED + "Invalid action. Please choose from the list.") + return reviewed_highlights + + def _parse_specific_pages(self, pages_str, total_pages): + if not pages_str or pages_str.lower() == "all": return list(range(1, total_pages + 1)) + parsed_pages = set() + try: + for part in pages_str.split(','): + part = part.strip(); + if not part: continue + if '-' in part: + start_str, end_str = part.split('-', 1); start = int(start_str); end = int(end_str) + start = max(1, start); end = min(total_pages, end) + if start <= end: parsed_pages.update(range(start, end + 1)) + else: + page_val = int(part) + if 1 <= page_val <= total_pages: parsed_pages.add(page_val) + return sorted(list(parsed_pages)) if parsed_pages else [] + except ValueError as e: + if self.run_args.show_progress: print(Fore.YELLOW + f"⚠️ Invalid page range: {pages_str}. Error: {e}.") + return [] + + def _get_color_display_codes(self, color_name_str): + return {'yellow': Back.YELLOW + Fore.BLACK, 'green': Back.GREEN + Fore.BLACK, + 'blue': Back.BLUE + Fore.WHITE, 'pink': Back.MAGENTA + Fore.WHITE, + 'other_color': Back.WHITE + Fore.BLACK, 'unknown_color': Back.LIGHTBLACK_EX + Fore.WHITE + }.get(color_name_str.lower(), Back.LIGHTBLACK_EX + Fore.WHITE) + + def display_results(self): + if not self.run_args.show_progress: return # Don't display if progress is off (e.g. silent) + + print("\n" + Fore.CYAN + Style.BRIGHT + "="*30 + " EXTRACTED HIGHLIGHTS " + "="*30 + Style.RESET_ALL) + if not self.highlights_data: print("\n❌ No highlights extracted or all were deleted."); return + current_page = None + for item in self.highlights_data: + if item.get('page') != current_page: + current_page = item.get('page'); print(f"\n📄 {Style.BRIGHT}Page {current_page}{Style.RESET_ALL}\n" + "-"*25) + color_name = item.get('color', 'unknown_color') + color_code = self._get_color_display_codes(color_name) + num_segments = item.get('num_segments', 0) + segment_info = f" [{num_segments} segments]" if num_segments > 1 else "" + text_content = item.get('text', "*NO TEXT*") + display_color_name = color_name.upper() + if color_name == 'other_color': + raw_rgb = item.get('raw_rgb_values') + if raw_rgb and len(raw_rgb) >=3: + rgb_disp = tuple(int(c*255) if isinstance(c,float) else int(c) for c in raw_rgb[:3]) + display_color_name += f" (RGB: {rgb_disp})" + print(f"🎨 {color_code}{display_color_name}{Style.RESET_ALL}{segment_info}") + print(f" \"{text_content}\""); print() + + def save_to_json(self, output_path_str): + output_path = Path(output_path_str).resolve() # Resolve to absolute path for clarity + try: + output_path.parent.mkdir(parents=True, exist_ok=True) + if self.run_args.debug: + print(Fore.CYAN + f" [Debug] Ensured parent directory for JSON exists: {output_path.parent}") + except Exception as e_mkdir: + if self.run_args.show_progress: # Also show error if progress is on + print(Fore.RED + f"❌ Error creating directory for JSON output {output_path.parent}: {e_mkdir}") + if self.run_args.debug: traceback.print_exc() + return # Cannot save if directory cannot be made + + data_to_save = { + 'pdf_file_processed': str(self.pdf_path.name), 'pdf_full_path': str(self.pdf_path.resolve()), + 'pages_processed_spec': self.run_args.pages if self.run_args.pages else DEFAULT_PAGES_TO_PROCESS, + 'extraction_timestamp': time.strftime("%Y-%m-%d %H:%M:%S %Z"), + 'total_highlights_extracted': len(self.highlights_data), + 'settings_used': { + 'clean_edges': self.run_args.clean_edges, + 'show_diff_percentage': self.run_args.show_diff_percentage + }, + 'highlights_data': self.highlights_data } + try: + with open(output_path, 'w', encoding='utf-8') as f: json.dump(data_to_save, f, indent=2, ensure_ascii=False) + if self.run_args.show_progress: print(Fore.GREEN + f"💾 Data saved to {output_path}") + except IOError as e: + if self.run_args.show_progress: print(Fore.RED + f"❌ Error saving JSON to {output_path}: {e}") + if self.run_args.debug: traceback.print_exc() def main(): - start_time = time.time() - - test_mode = is_test_mode() - - print("🎨 PDF Highlight Extractor - BALANCED PRECISION") - print("✅ More inclusive extraction (40% overlap vs 75%)") - print("✅ Small boundary expansion (+2 pixels)") - print("✅ Better word capture at highlight edges") - print("✅ Detailed extraction logging") - print("✅ Smart hyphenation merging") - - if test_mode: - print("🧪 TEST MODE: Using defaults") - print("✅ Default file: /mnt/c/Users/admin/Downloads/test2.pdf") - print("✅ Skipping JSON/CSV output") - else: - print("🔧 FULL MODE: Interactive prompts") - - print() - - if test_mode: - default_pdf = "/mnt/c/Users/admin/Downloads/test2.pdf" - pdf_path = default_pdf - print(f"📄 Using default: {pdf_path}") - else: - pdf_input = input("📄 PDF file path: ").strip('"') - if not pdf_input: - print("❌ No file specified!") - return - pdf_path = pdf_input - - if not Path(pdf_path).exists(): - print("❌ File not found!") - return - - output_json = "" - output_csv = "" - - if test_mode: - print("📋 Test mode: Display only (no file output)") - else: - print("\n📤 Output options:") - output_json = input("💾 JSON file (Enter to skip): ").strip('"') - output_csv = input("📊 CSV file (Enter to skip): ").strip('"') - - # Process - extractor = PDFHighlightExtractor(pdf_path) - annotations, highlights = extractor.extract_all_highlights() - - # Display results - extractor.display_results() - - # Save files (only in full mode and if specified) - if not test_mode: - if output_json: - extractor.save_to_json(annotations, highlights, output_json) - if output_csv: - extractor.save_to_csv(annotations, highlights, output_csv) - - if not output_json and not output_csv: - print("\n📋 Display only - no files saved") - - end_time = time.time() - elapsed_time = end_time - start_time - - print(f"\n⏱️ Processing completed in {elapsed_time:.2f} seconds") - - if test_mode: - print("\n🧪 Test mode completed. Use without --test flag for full options.") + parser = argparse.ArgumentParser( + description="Enhanced PDF Highlight Extractor.", + formatter_class=argparse.RawTextHelpFormatter, + epilog=f"""Examples: + {sys.argv[0]} mydoc.pdf + {sys.argv[0]} mydoc.pdf -p "1,5-7" -i + {sys.argv[0]} -t -s --output-json results/test.json + {sys.argv[0]} doc.pdf -d +If interactive image viewing ('O' option) fails, try running with the -d (debug) +flag. This will print detailed information about image paths and creation steps. +Common issues include missing default PNG viewers or OS-level permission problems. +The IMAGE_FOLDER_PATH ('{IMAGE_FOLDER_PATH}') is relative to where you run the script. +""") + parser.add_argument("pdf_path_arg", nargs='?', default=None, help="Path to PDF. Prompts if not in test/silent mode & not provided.") + parser.add_argument("-p", "--pages", type=str, default=None, help=f"Pages (e.g., \"1,3-5\", \"all\"). Default: \"{DEFAULT_PAGES_TO_PROCESS}\".") + parser.add_argument("-i", "--interactive", action="store_true", help="Enable interactive review mode.") + parser.add_argument("-t", "--test", action="store_true", help=f"Test mode. Uses default PDF ('{DEFAULT_PDF_PATH}'), auto-saves JSON.") + parser.add_argument("-s", "--silent", action="store_true", help="Silent mode. Minimal output. Auto-saves JSON. Implies -t if no PDF path.") + parser.add_argument("-d", "--debug", action="store_true", help="Debug mode. Enables all detailed SHOW flags and prints more internal details.") + parser.add_argument("--output-json", type=str, default=None, help="Custom output JSON filename/path.") + + cli_args = parser.parse_args() + + effective_run_args = argparse.Namespace() + effective_run_args.debug = cli_args.debug + effective_run_args.silent = cli_args.silent + + # Initialize based on global defaults + effective_run_args.show_timing = INITIAL_SHOW_TIMING + effective_run_args.show_progress = INITIAL_SHOW_PROGRESS + effective_run_args.show_raw_segments = INITIAL_SHOW_RAW_SEGMENTS + effective_run_args.show_extraction_details = INITIAL_SHOW_EXTRACTION_DETAILS + effective_run_args.show_rect_details = INITIAL_SHOW_RECT_DETAILS + effective_run_args.show_diff_percentage = INITIAL_SHOW_DIFF_PERCENTAGE + effective_run_args.clean_edges = INITIAL_CLEAN_EDGES + + # Override show flags based on debug or silent + if effective_run_args.debug: + for key in ['show_timing', 'show_progress', 'show_raw_segments', 'show_extraction_details', 'show_rect_details', 'show_diff_percentage']: + setattr(effective_run_args, key, True) # Debug enables all these + + if effective_run_args.silent: + for key in ['show_timing', 'show_progress', 'show_raw_segments', 'show_extraction_details', 'show_rect_details', 'show_diff_percentage']: + setattr(effective_run_args, key, False) # Silent disables all these + effective_run_args.interactive = False + else: # Not silent + effective_run_args.interactive = cli_args.interactive + + effective_run_args.pages = cli_args.pages + + start_time = time.time() + if effective_run_args.show_progress: print(Fore.MAGENTA + Style.BRIGHT + "🎨 PDF Highlight Extractor 🎨" + Style.RESET_ALL) + if effective_run_args.debug: + print(Fore.CYAN + f" [Debug] Current Working Directory: {Path.cwd()}") + print(Fore.CYAN + f" [Debug] Effective Run Arguments: {effective_run_args}") + + + if IMAGE_FOLDER_PATH and CLEAR_IMAGE_FOLDER_ON_START: + _clear_png_files_in_folder(IMAGE_FOLDER_PATH, effective_run_args) + + pdf_path_to_use = None + if cli_args.test: pdf_path_to_use = DEFAULT_PDF_PATH + elif cli_args.pdf_path_arg: pdf_path_to_use = cli_args.pdf_path_arg + elif cli_args.silent: pdf_path_to_use = DEFAULT_PDF_PATH + else: + pdf_path_input = input(f"📄 PDF path (Enter for default '{DEFAULT_PDF_PATH}'): ").strip().strip('"') + pdf_path_to_use = pdf_path_input if pdf_path_input else DEFAULT_PDF_PATH + + if not pdf_path_to_use: + if effective_run_args.show_progress: print(Fore.RED + "❌ No PDF path specified. Exiting.") + sys.exit(1) + + resolved_path = Path(pdf_path_to_use).resolve() + if not resolved_path.exists() or not resolved_path.is_file(): + if effective_run_args.show_progress: print(Fore.RED + f"❌ PDF not found or is not a file: {resolved_path}") + sys.exit(1) + + doc_for_processing = None + try: + doc_for_processing = fitz.open(str(resolved_path)) + extractor = EnhancedPDFHighlightExtractor(resolved_path, effective_run_args, main_doc_for_image_view=doc_for_processing) + extractor.extract_highlights(doc_for_processing) + + if not effective_run_args.interactive and effective_run_args.show_progress: + extractor.display_results() + elif effective_run_args.interactive and effective_run_args.show_progress: + if input(Fore.CYAN+"Interactive session ended. Display final results? [Y/n]: " + Style.RESET_ALL).lower().strip()!='n': + extractor.display_results() + + json_output_path_str = cli_args.output_json if cli_args.output_json else str(resolved_path.parent / f"{resolved_path.stem}_highlights.json") + + perform_save = False + if cli_args.test or cli_args.silent: + perform_save = True + elif effective_run_args.show_progress: # Only prompt if not silent + save_prompt_input = input(f"💾 Save to JSON? (Enter for default '{json_output_path_str}', type 'skip' to not save, or enter a custom path): " + Style.RESET_ALL).strip() + if save_prompt_input.lower() != 'skip': + perform_save = True + if save_prompt_input: + json_output_path_str = save_prompt_input + + if perform_save: + if extractor.highlights_data: + extractor.save_to_json(json_output_path_str) + elif effective_run_args.show_progress: + print(Fore.YELLOW + "No highlights were extracted or kept, so JSON file was not saved.") + elif effective_run_args.show_progress: + print(Fore.BLUE + "Skipped saving highlights to JSON.") + + except Exception as e: + if effective_run_args.show_progress: print(Fore.RED+Style.BRIGHT+f"💥 An critical error occurred in the main execution: {e}") + if effective_run_args.debug: + traceback.print_exc() + finally: + if doc_for_processing: + doc_for_processing.close() + + if IMAGE_FOLDER_PATH and CLEAR_IMAGE_FOLDER_ON_END: + _clear_png_files_in_folder(IMAGE_FOLDER_PATH, effective_run_args) + + if effective_run_args.show_timing: + print(Fore.CYAN + f"\n⏱️ Total execution time: {time.time()-start_time:.2f} seconds") if __name__ == '__main__': main() diff --git a/requirements.txt b/requirements.txt index 70368a5..55f5e47 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,2 @@ -pdfplumber==0.10.3 colorama==0.4.6 -pandas==2.0.3 PyMuPDF==1.23.1 diff --git a/test/test2.pdf b/test/test2.pdf new file mode 100644 index 0000000..5563ad2 Binary files /dev/null and b/test/test2.pdf differ