hilitehero/main.py
2025-07-11 09:32:01 -08:00

754 lines
43 KiB
Python

#!/usr/bin/env python3
# =============================================================================
# ENHANCED PDF HIGHLIGHT EXTRACTOR
# Author: Perplexity AI Companion (Updated by User Feedback)
# Date: June 3, 2025
# License: MIT
#
# Extracts highlights from PDF files, with options for interactive review,
# detailed output, text cleaning, JSON export, and page image viewing.
# =============================================================================
import time
import os
import fitz # PyMuPDF
import json
from colorama import init, Fore, Back, Style
from pathlib import Path
import re
import string
import sys
import traceback
import argparse
import difflib # For text difference calculation
import tempfile # For temporary image files
import webbrowser # For opening images/PDFs
import uuid # For unique filenames
# Attempt to import readline for better input() experience on some systems
try:
import readline
READLINE_AVAILABLE = True
except ImportError:
READLINE_AVAILABLE = False # readline not available
# =============================================================================
# GLOBAL CONFIGURATION FLAGS (Defaults, can be overridden by CLI args)
# =============================================================================
DEFAULT_PDF_PATH = "/mnt/c/Users/admin/Downloads/test2.pdf" # Used by -t, by -s without a path, and as the prompt default; adjust if needed
DEFAULT_PAGES_TO_PROCESS = "3" # Page spec used when -p is not given, e.g. "1,3-5"; "all" is only accepted as the whole spec
# Default behavior flags. main() copies these into the effective run-args
# namespace; -d then forces every SHOW flag to True and -s forces them to False.
# Keep these distinct from the effective run-args object itself.
INITIAL_SHOW_TIMING = True
INITIAL_SHOW_PROGRESS = True
INITIAL_SHOW_RAW_SEGMENTS = True
INITIAL_SHOW_EXTRACTION_DETAILS = True
INITIAL_SHOW_RECT_DETAILS = True
INITIAL_SHOW_DIFF_PERCENTAGE = True
INITIAL_CLEAN_EDGES = True
# Text extraction parameters (generally fixed): how far each highlight quad's
# bounding rectangle is grown horizontally/vertically before clipping text.
TEXT_EXTRACTION_HORIZONTAL_PADDING = 6.0
TEXT_EXTRACTION_VERTICAL_PADDING = 1.0
# Edge cleaning configuration (generally fixed): alphabetic one- and two-letter
# tokens at the edges of a segment are dropped unless listed here (lower-case).
VALID_TWO_LETTER_WORDS = {
'am', 'an', 'as', 'at', 'be', 'by', 'do', 'go', 'he', 'if', 'in', 'is', 'it', 'me', 'my',
'no', 'of', 'on', 'or', 'ox', 'so', 'to', 'up', 'us', 'we'}
VALID_SINGLE_LETTERS = {'i', 'a'}
# Image handling configuration
IMAGE_FOLDER_PATH = 'pdf_page_images' # Relative to CWD by default; a falsy value makes the viewer use a system temp file instead
CLEAR_IMAGE_FOLDER_ON_START = True # main() removes *.png files from the folder before processing
CLEAR_IMAGE_FOLDER_ON_END = False # main() removes *.png files from the folder during final cleanup
# Initialize colorama (autoreset requested)
init(autoreset=True)
# --- Helper Functions ---
def get_text_diff_ratio(text1, text2):
    """Return a similarity ratio between two texts in the range 0.0-1.0.

    Two empty (falsy) inputs count as identical (1.0); exactly one empty input
    counts as completely different (0.0). Otherwise both values are converted
    with ``str()`` and compared with ``difflib.SequenceMatcher``.
    """
    first_missing = not text1
    second_missing = not text2
    if first_missing and second_missing:
        return 1.0
    if first_missing or second_missing:
        return 0.0
    matcher = difflib.SequenceMatcher(None, str(text1), str(text2))
    return matcher.ratio()
def clean_segment_edges_func(text_to_clean, clean_edges_setting):
    """Trim stray fragments from both ends of a text segment.

    Whitespace is collapsed to single spaces first. Then, working inwards from
    each end, tokens are dropped while they are either pure punctuation or an
    alphabetic one-/two-letter core (ignoring punctuation on the outer side)
    that is not listed in VALID_SINGLE_LETTERS / VALID_TWO_LETTER_WORDS.
    Trimming stops at the first token that qualifies as a real word.

    Args:
        text_to_clean: The segment text; returned untouched when falsy.
        clean_edges_setting: When falsy the text is returned untouched.

    Returns:
        The cleaned text (possibly an empty string).
    """
    if not (clean_edges_setting and text_to_clean):
        return text_to_clean

    def is_stray_fragment(core):
        # Only purely alphabetic cores of length 1 or 2 can be fragments.
        if not core.isalpha():
            return False
        if len(core) == 1:
            return core.lower() not in VALID_SINGLE_LETTERS
        if len(core) == 2:
            return core.lower() not in VALID_TWO_LETTER_WORDS
        return False

    normalized = re.sub(r'\s+', ' ', text_to_clean.strip())
    tokens = normalized.split()
    if not tokens:
        return normalized

    punctuation = string.punctuation

    # Advance past leading tokens that carry no real word.
    first_kept = 0
    while first_kept < len(tokens):
        core = tokens[first_kept].rstrip(punctuation)
        if core and not is_stray_fragment(core):
            break
        first_kept += 1

    # Retreat past trailing tokens that carry no real word.
    end_kept = len(tokens)
    while end_kept > first_kept:
        core = tokens[end_kept - 1].lstrip(punctuation)
        if core and not is_stray_fragment(core):
            break
        end_kept -= 1

    return ' '.join(tokens[first_kept:end_kept])
def input_with_prefill(prompt, text):
    """Prompt for a line of input, offering ``text`` as the editable default.

    When readline is available, ``text`` is inserted into the input buffer via
    a pre-input hook so the user can edit it in place. Otherwise the current
    text is printed above the prompt and a plain ``input()`` is used.

    Args:
        prompt: Prompt string passed to ``input()``.
        text: Text to prefill (readline) or to display (fallback).

    Returns:
        The line entered by the user.
    """
    if not READLINE_AVAILABLE:
        print(Fore.MAGENTA + "Current text (edit below):\n" + Style.RESET_ALL + f"{text}")
        return input(prompt)

    def hook():
        readline.insert_text(text)
        readline.redisplay()

    readline.set_pre_input_hook(hook)
    try:
        return input(prompt)
    finally:
        # Always uninstall the hook, even if input() is interrupted (Ctrl-C /
        # EOF); otherwise every later prompt would be prefilled with this text.
        readline.set_pre_input_hook()
def _clear_png_files_in_folder(folder_path_str, run_args_for_print_control):
    """Delete every ``*.png`` file directly inside a folder, if it exists.

    The folder is never created here. Console output is governed by the
    ``show_progress`` and ``debug`` attributes of the run-args object.

    Args:
        folder_path_str: Folder path (relative paths resolve against the CWD);
            a falsy value makes this a no-op.
        run_args_for_print_control: Object exposing ``debug`` and
            ``show_progress`` booleans.
    """
    if not folder_path_str:
        return

    flags = run_args_for_print_control
    target_dir = Path(folder_path_str).resolve()
    if flags.debug:
        print(Fore.CYAN + f" [Debug] _clear_png_files_in_folder: Checking {target_dir} (Specified as: '{folder_path_str}')")

    # Guard clause: nothing to do when the folder is missing.
    if not target_dir.is_dir():
        if flags.show_progress:
            print(Fore.YELLOW + f"Image folder {target_dir} not found, skipping clear.")
        elif flags.debug:  # Still log in debug even when progress output is off
            print(Fore.CYAN + f" [Debug] _clear_png_files_in_folder: Folder {target_dir} does not exist. Nothing to clear.")
        return

    if flags.show_progress:
        print(Fore.BLUE + f"Clearing *.png files from {target_dir}...")

    removed = 0
    try:
        for candidate in target_dir.glob("*.png"):
            if not candidate.is_file():
                continue
            candidate.unlink()
            removed += 1
    except Exception as e:
        # Best effort: a failed deletion stops the sweep but is not fatal.
        if flags.show_progress:
            print(Fore.RED + f"Error during file deletion in {target_dir}: {e}")

    if not flags.show_progress:
        return
    if removed > 0:
        print(Fore.BLUE + f"Cleared {removed} *.png files from {target_dir}.")
    else:
        print(Fore.BLUE + f"No *.png files found to clear in {target_dir}.")
class EnhancedPDFHighlightExtractor:
    """Extract highlight annotations from one PDF and review, show or export them.

    Attributes:
        pdf_path: ``Path`` of the PDF being processed.
        run_args: Namespace-like object with the effective run flags
            (``show_progress``, ``debug``, ``interactive``, ``pages``,
            ``clean_edges`` and the other ``show_*`` switches).
        pdf_filename_stem: File name of the PDF without its suffix.
        highlights_data: Highlight dicts kept by the last
            ``extract_highlights`` call.
        main_doc_for_image_view: Open document used to render page images
            during interactive review, or ``None``.
    """

    def __init__(self, pdf_path, effective_run_args, main_doc_for_image_view=None):
        self.pdf_path = Path(pdf_path)
        self.run_args = effective_run_args
        self.pdf_filename_stem = self.pdf_path.stem
        self.highlights_data = []
        self.main_doc_for_image_view = main_doc_for_image_view

    # ------------------------------------------------------------------
    # Colour helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _rgb_to_255(rgb_components):
        """Return the first three colour components as 0-255 integers.

        Floats inside [0.0, 1.0] are treated as normalised and scaled with
        rounding (truncation would turn e.g. 0.55686 into 141 instead of 142);
        every other value is passed through ``int()`` unchanged in scale.
        """
        return tuple(
            round(c * 255) if isinstance(c, float) and 0.0 <= c <= 1.0 else int(c)
            for c in rgb_components[:3]
        )

    def _get_highlight_color_from_rgb_tuple(self, rgb_tuple_floats_or_ints):
        """Map an RGB triple to a colour name.

        Returns one of 'yellow', 'green', 'blue', 'pink', 'other_color', or
        'unknown_color' when fewer than three components are supplied.
        """
        if not rgb_tuple_floats_or_ints or len(rgb_tuple_floats_or_ints) < 3:
            return 'unknown_color'
        r, g, b = self._rgb_to_255(rgb_tuple_floats_or_ints)
        # Exact match for one specific light-blue shade that the generic blue
        # rule below would reject because of its high green component.
        if r == 142 and g == 221 and b == 249:
            return 'blue'
        if r > 200 and g > 200 and b < 150:
            return 'yellow'
        if r < 150 and g > 180 and b < 150:
            return 'green'
        if r < 150 and g < 180 and b > 180:
            return 'blue'
        if r > 180 and g < 180 and b > 180:
            return 'pink'
        return 'other_color'

    def _get_highlight_color_from_annot_colors_dict(self, colors_dict):
        """Return ``(colour_name, raw_rgb)`` for an annotation colours dict.

        The 'stroke' entry is preferred, falling back to 'fill'. When neither
        is usable the result is ``('unknown_color', None)``.
        """
        if not colors_dict:
            return 'unknown_color', None
        rgb_tuple = colors_dict.get('stroke') or colors_dict.get('fill')
        if not rgb_tuple:
            return 'unknown_color', None
        return self._get_highlight_color_from_rgb_tuple(rgb_tuple), rgb_tuple[:3]

    def _get_color_display_codes(self, color_name_str):
        """Return the colorama background/foreground codes for a colour name."""
        fallback = Back.LIGHTBLACK_EX + Fore.WHITE
        codes_by_color = {
            'yellow': Back.YELLOW + Fore.BLACK,
            'green': Back.GREEN + Fore.BLACK,
            'blue': Back.BLUE + Fore.WHITE,
            'pink': Back.MAGENTA + Fore.WHITE,
            'other_color': Back.WHITE + Fore.BLACK,
            'unknown_color': Back.LIGHTBLACK_EX + Fore.WHITE,
        }
        return codes_by_color.get(color_name_str.lower(), fallback)

    # ------------------------------------------------------------------
    # Text extraction
    # ------------------------------------------------------------------
    @staticmethod
    def _join_highlight_segments(segment_texts):
        """Join per-line segment texts into one string.

        A segment ending in '-' or '¬' is treated as a word broken across
        lines: the marker is removed and the next segment is appended without a
        space. All other segments are joined with a single space.
        """
        if not segment_texts:
            return ''
        combined = segment_texts[0]
        for following in segment_texts[1:]:
            if combined.endswith(('-', '¬')):
                # rstrip needs the marker characters spelled out; an empty
                # chars argument strips nothing at all.
                combined = combined.rstrip('-¬') + following
            else:
                combined += ' ' + following
        return combined

    def _extract_text_from_multi_segment_highlight(self, page, annot, page_num, hl_id):
        """Extract the text covered by one (possibly multi-line) highlight.

        The annotation vertices are grouped four at a time into quads, ordered
        top-to-bottom then left-to-right, and the page text is clipped to each
        quad's padded bounding rectangle.

        Args:
            page: Page object the annotation belongs to.
            annot: Highlight annotation (its ``colors`` and ``vertices`` are read).
            page_num: 1-based page number, used in messages only.
            hl_id: Highlight number on the page, used in messages only.

        Returns:
            ``(text_or_None, number_of_segments, raw_segment_texts)``.
        """
        args = self.run_args
        show_details = args.show_extraction_details
        color_name, _ = self._get_highlight_color_from_annot_colors_dict(annot.colors)
        color_codes = self._get_color_display_codes(color_name)

        vertices = annot.vertices
        if not vertices:
            if show_details:
                print(Fore.YELLOW + f" No quads for HL {hl_id} on page {page_num}")
            return None, 0, []

        quads = []
        if len(vertices) % 4 == 0:
            for start in range(0, len(vertices), 4):
                try:
                    quads.append([fitz.Point(p) for p in vertices[start:start + 4]])
                except Exception as e:
                    if show_details:
                        print(Fore.YELLOW + f" Skipping malformed quad points: {e}")
        elif show_details:
            # Previously skipped without any message.
            print(Fore.YELLOW + f" Vertex count {len(vertices)} for HL {hl_id} is not a multiple of 4; no segments processed.")

        def reading_order(quad_points):
            rect = fitz.Quad(quad_points).rect
            return (rect.y0, rect.x0)

        try:
            ordered_quads = sorted(quads, key=reading_order)
        except Exception as e:
            if show_details:
                print(Fore.RED + f" Error sorting quads for HL {hl_id}: {e}. Using original order.")
            ordered_quads = quads

        if show_details:
            print(color_codes + Fore.CYAN + f" Processing {len(ordered_quads)} segments for HL {hl_id} (Color: {color_name.upper()}) on page {page_num}" + Style.RESET_ALL)

        final_segments = []
        raw_segment_texts = []
        for seg_number, quad_points in enumerate(ordered_quads, start=1):
            raw_recorded = False
            try:
                bounds = fitz.Quad(quad_points).rect
                clip = fitz.Rect(
                    bounds.x0 - TEXT_EXTRACTION_HORIZONTAL_PADDING,
                    bounds.y0 - TEXT_EXTRACTION_VERTICAL_PADDING,
                    bounds.x1 + TEXT_EXTRACTION_HORIZONTAL_PADDING,
                    bounds.y1 + TEXT_EXTRACTION_VERTICAL_PADDING,
                )
                clip.intersect(page.rect)
                if clip.is_empty:
                    if show_details:
                        print(Fore.YELLOW + f" Segment {seg_number} empty padded_rect for HL {hl_id}")
                    continue

                raw_text = page.get_text("text", clip=clip, sort=True).strip()
                raw_segment_texts.append(raw_text)
                raw_recorded = True

                cleaned = re.sub(r'\s+', ' ', raw_text).strip()
                cleaned = re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F-\x9F]', '', cleaned)
                final_text = clean_segment_edges_func(cleaned, args.clean_edges)
                if not final_text:
                    continue
                final_segments.append(final_text)

                if args.show_raw_segments and not args.interactive:
                    print(color_codes + Fore.LIGHTBLUE_EX + f" Segment {seg_number} (P{page_num}, HL{hl_id}, Color: {color_name.upper()}):" + Style.RESET_ALL)
                    if args.show_diff_percentage:
                        similarity = get_text_diff_ratio(raw_text, final_text)
                        diff_percent = (1 - similarity) * 100
                        print(Fore.LIGHTMAGENTA_EX + f" Raw PDF : \"{raw_text}\"")
                        print(Fore.LIGHTBLUE_EX + f" Final Seg: \"{final_text}\"")
                        print(Fore.YELLOW + f" Diff: {diff_percent:.2f}%")
                    else:
                        print(Fore.LIGHTBLUE_EX + f" Final Seg: \"{final_text}\"")
            except Exception as e:
                if show_details:
                    print(Fore.RED + f" Error processing segment {seg_number} for HL {hl_id}: {e}")
                # Record a placeholder only if this segment's raw text was not
                # already stored, so a late failure does not add a second entry.
                if not raw_recorded:
                    raw_segment_texts.append("")
                continue

        if not final_segments:
            return None, len(ordered_quads), raw_segment_texts

        combined_text = self._join_highlight_segments(final_segments)
        if args.clean_edges:
            combined_text = clean_segment_edges_func(combined_text, args.clean_edges)
        combined_text = re.sub(r'\s+', ' ', combined_text).strip()
        return (combined_text if combined_text else None), len(ordered_quads), raw_segment_texts

    def _print_annotation_debug_listing(self, page, highlight_annotations, page_num):
        """Print rect, vertex count, colour and a text snippet for each highlight."""
        print(Fore.YELLOW + f"--- Annotations before sorting (Page {page_num}) ---")
        rows = []
        for annot in highlight_annotations:
            snippet = page.get_text("text", clip=annot.rect).strip().replace("\n", " ")
            color_name, rgb_values = self._get_highlight_color_from_annot_colors_dict(annot.colors)
            rgb_display = f"RGB: {self._rgb_to_255(rgb_values)}" if rgb_values else "RGB: N/A"
            rows.append({
                "rect": annot.rect,
                "text_snippet": snippet,
                "color_name": color_name,
                "rgb_display": rgb_display,
                "vertices_count": len(annot.vertices) if annot.vertices else 0,
            })
        rows.sort(key=lambda row: (row["rect"].y0, row["rect"].x0))
        for position, row in enumerate(rows, start=1):
            print(f" {position}. Rect: {row['rect']}, Vertices: {row['vertices_count']}, Color: {row['color_name'].upper()} ({row['rgb_display']}), Text: \"{row['text_snippet']}\"")
        print(Fore.YELLOW + "----------------------------------------------------")

    def _extract_page_highlights(self, doc, page_num):
        """Return the highlight dicts found on one 1-based page, in reading order."""
        args = self.run_args
        verbose = args.show_progress and not args.interactive

        page = doc.load_page(page_num - 1)
        if verbose:
            print(Fore.CYAN + f" 📄 Processing Page {page_num}...")
        try:
            page_annotations = list(page.annots())
        except Exception as e:
            if args.show_progress:
                print(Fore.RED + f" ⚠️ Error loading annots: {e}")
            return []

        highlight_annotations = [
            a for a in page_annotations
            if hasattr(a, 'type') and a.type[0] == fitz.PDF_ANNOT_HIGHLIGHT
            and hasattr(a, 'rect') and a.rect
        ]
        if not highlight_annotations:
            if verbose:
                print(Fore.WHITE + f" No highlights on page {page_num}.")
            return []

        if args.show_rect_details:
            self._print_annotation_debug_listing(page, highlight_annotations, page_num)

        highlight_annotations.sort(key=lambda a: (a.rect.y0, a.rect.x0))
        page_highlights = []
        for hl_id, annot in enumerate(highlight_annotations, start=1):
            try:
                color_name, raw_rgb = self._get_highlight_color_from_annot_colors_dict(annot.colors)
                extracted_text, num_segments, _ = self._extract_text_from_multi_segment_highlight(page, annot, page_num, hl_id)
                if extracted_text and extracted_text.strip():
                    if args.show_extraction_details and not args.interactive:
                        print(Fore.GREEN + f" ✅ Final (P{page_num}, HL{hl_id}): \"{extracted_text[:100]}\"")
                    page_highlights.append({
                        'page': page_num,
                        'highlight_id_on_page': hl_id,
                        'text': extracted_text,
                        'color': color_name,
                        'raw_rgb_values': raw_rgb,
                        'type': 'highlight',
                        'y_position': annot.rect.y0,
                        'x_position': annot.rect.x0,
                        'rect_details': (annot.rect.x0, annot.rect.y0, annot.rect.x1, annot.rect.y1),
                        'num_segments': num_segments,
                    })
                elif verbose:
                    print(Fore.YELLOW + f" ⚠️ No text for HL {hl_id} on page {page_num}")
            except Exception as e:
                # One bad annotation must not abort the rest of the page.
                if verbose:
                    print(Fore.RED + f" 🔴 Error processing annot on page {page_num}: {e}")
                if args.debug:
                    traceback.print_exc()
                continue
        return page_highlights

    def extract_highlights(self, doc):
        """Extract highlights from the selected pages of ``doc``.

        The page selection comes from ``run_args.pages`` (falling back to
        DEFAULT_PAGES_TO_PROCESS). In interactive mode the extracted list is
        passed through the review session before being stored.

        Returns:
            The list stored in ``self.highlights_data``; an empty list when no
            page is selected or an unexpected error occurs.
        """
        args = self.run_args
        verbose = args.show_progress and not args.interactive
        try:
            if verbose:
                print(Fore.BLUE + f"\n🎨 Processing highlights for PDF: {self.pdf_path.name}")
            pages_spec = args.pages if args.pages else DEFAULT_PAGES_TO_PROCESS
            pages_to_process = self._parse_specific_pages(pages_spec, doc.page_count)
            if not pages_to_process:
                if args.show_progress:
                    print(Fore.YELLOW + "No valid pages selected.")
                return []

            all_extracted_highlights = []
            for page_num in pages_to_process:
                all_extracted_highlights.extend(self._extract_page_highlights(doc, page_num))

            if args.interactive:
                print(Fore.MAGENTA + "\nEntering interactive review session...")
                self.highlights_data = self._interactive_review_session(all_extracted_highlights)
            else:
                self.highlights_data = all_extracted_highlights

            if verbose and not args.silent:
                print(Fore.MAGENTA + f" 📊 Total highlights extracted: {len(self.highlights_data)}")
            return self.highlights_data
        except Exception as e:
            print(Fore.RED + f"❌ Major error during highlight extraction: {e}")
            if args.debug:
                traceback.print_exc()
            return []

    # ------------------------------------------------------------------
    # Interactive review
    # ------------------------------------------------------------------
    def _open_image_in_default_viewer(self, image_path):
        """Open a saved page image via ``webbrowser`` and wait for Enter."""
        args = self.run_args
        if args.show_progress:
            print(Fore.CYAN + f"Attempting to open image with default application...")
        try:
            file_uri = image_path.resolve().as_uri()
            if args.debug:
                print(Fore.CYAN + f" [Debug] Opening URI: {file_uri}")
            opened_successfully = webbrowser.open(file_uri)
            if args.debug:  # More detailed feedback in debug mode
                print(Fore.CYAN + f" [Debug] webbrowser.open() returned: {opened_successfully}")
            if not opened_successfully:
                if args.show_progress:
                    print(Fore.YELLOW + " webbrowser.open() reported failure (returned False or None).")
                    print(Fore.YELLOW + f" This often means no default application is configured for PNG files or your browser.")
            elif args.show_progress:
                print(Fore.GREEN + " Image hopefully opened. Check your applications.")
            if args.show_progress:
                print(Fore.YELLOW + f" If the image did not open, please manually open: {image_path.resolve()}")
            input(Fore.CYAN + "Press Enter after viewing image to continue...")
        except Exception as e_open:
            if args.show_progress:
                print(Fore.RED + f" Could not open image using webbrowser: {e_open}")
                print(Fore.YELLOW + " This could be due to your system's environment (e.g., missing 'xdg-utils' on Linux, no default PNG viewer).")
                print(Fore.YELLOW + f" Please try opening the image manually: {image_path.resolve()}")
            if args.debug:
                traceback.print_exc()
            input(Fore.CYAN + "Press Enter to acknowledge and continue...")

    def _view_page_image_interactively(self, page_num_to_view):
        """Render a page to PNG, open it in the default viewer and clean up.

        The image is written into IMAGE_FOLDER_PATH when that is set (and left
        there for the start/end folder clearing to handle); otherwise a system
        temporary file is used and deleted once the user has continued.

        Args:
            page_num_to_view: 1-based page number to render.
        """
        args = self.run_args
        if not self.main_doc_for_image_view:
            print(Fore.RED + "Error: PDF document not available for image rendering. This should not happen.")
            return

        tmp_image_path_obj = None
        image_created_in_managed_folder = False
        image_successfully_saved = False
        if args.show_progress:
            print(Fore.BLUE + f"Preparing to view image for page {page_num_to_view}...")

        try:
            # Step 1: render and save. Failures are reported, not raised, so
            # the review session keeps running.
            try:
                page_index = page_num_to_view - 1
                page = self.main_doc_for_image_view.load_page(page_index)
                if args.debug:
                    print(Fore.CYAN + f" [Debug] Loaded page object for index {page_index}: {page}")
                pix = page.get_pixmap(dpi=150)
                if args.debug:
                    print(Fore.CYAN + f" [Debug] Created pixmap: {pix}. Alpha: {pix.alpha}, Colorspace: {pix.colorspace.name}")

                if IMAGE_FOLDER_PATH:
                    abs_img_dir = Path(IMAGE_FOLDER_PATH).resolve()  # Relative to CWD if not absolute
                    if args.debug:
                        print(Fore.CYAN + f" [Debug] Using IMAGE_FOLDER_PATH: '{IMAGE_FOLDER_PATH}' (Absolute: {abs_img_dir})")
                    try:
                        abs_img_dir.mkdir(parents=True, exist_ok=True)
                        if args.debug:
                            print(Fore.CYAN + f" [Debug] Ensured image directory exists: {abs_img_dir} (Status: {abs_img_dir.is_dir()})")
                    except Exception as e_mkdir:
                        print(Fore.RED + f" ERROR: Could not create directory {abs_img_dir}: {e_mkdir}")
                        if args.debug:
                            traceback.print_exc()
                        # Do not proceed if directory creation fails
                        input(Fore.CYAN + "Press Enter to acknowledge and continue...")
                        return
                    unique_id = uuid.uuid4().hex[:8]
                    tmp_image_path_obj = abs_img_dir / f"page_{page_num_to_view}_{unique_id}.png"
                    image_created_in_managed_folder = True
                else:
                    fd, temp_path_str = tempfile.mkstemp(suffix=".png", prefix="pdf_page_img_")
                    os.close(fd)
                    tmp_image_path_obj = Path(temp_path_str)
                    if args.debug:
                        print(Fore.CYAN + f" [Debug] Using system temporary file: {tmp_image_path_obj.resolve()}")

                resolved_save_path = tmp_image_path_obj.resolve()
                if args.debug:
                    print(Fore.CYAN + f" [Debug] Attempting to save image to: {resolved_save_path}")
                pix.save(str(resolved_save_path))

                if resolved_save_path.exists() and resolved_save_path.is_file():
                    image_successfully_saved = True
                    if args.show_progress:  # Print for normal progress too, not just debug
                        print(Fore.GREEN + f" Image for page {page_num_to_view} successfully saved to: {resolved_save_path}")
                    if args.debug:
                        print(Fore.CYAN + f" [Debug] File size: {resolved_save_path.stat().st_size} bytes")
                elif args.show_progress:
                    print(Fore.RED + f" ERROR: Failed to save image to {resolved_save_path}. File does not exist after save attempt.")
            except Exception as e_render_save:
                if args.show_progress:
                    print(Fore.RED + f" Error during image rendering or saving: {e_render_save}")
                if args.debug:
                    traceback.print_exc()

            # Step 2: show the image. This runs after the render attempt
            # whether or not it raised, so a saved image is always offered.
            if image_successfully_saved and tmp_image_path_obj:
                self._open_image_in_default_viewer(tmp_image_path_obj)
            elif tmp_image_path_obj:
                if args.show_progress:
                    print(Fore.YELLOW + " Skipping attempt to open image as it was not saved successfully.")
                input(Fore.CYAN + "Press Enter to continue...")
            else:
                if args.show_progress:
                    print(Fore.RED + " Cannot attempt to open image as image path was not determined.")
                input(Fore.CYAN + "Press Enter to continue...")
        finally:
            # Step 3: clean up. Only system temp files are removed here.
            if tmp_image_path_obj and tmp_image_path_obj.exists():
                if image_created_in_managed_folder:
                    if args.debug:
                        print(Fore.CYAN + f" [Debug] Image '{tmp_image_path_obj.name}' remains in managed folder '{IMAGE_FOLDER_PATH}'.")
                        print(Fore.CYAN + f" [Debug] It will be cleared based on CLEAR_IMAGE_FOLDER_ON_END ({CLEAR_IMAGE_FOLDER_ON_END}).")
                else:
                    try:
                        tmp_image_path_obj.unlink()
                        if args.debug:
                            print(Fore.CYAN + f" [Debug] Deleted system temporary image: {tmp_image_path_obj.resolve()}")
                    except Exception as e_unlink:
                        if args.debug:
                            print(Fore.YELLOW + f" Warning: Could not delete system temp image {tmp_image_path_obj.resolve()}: {e_unlink}")
            elif tmp_image_path_obj and image_successfully_saved:
                if args.debug:
                    print(Fore.RED + f" [Debug] Inconsistency: Image was marked saved, but {tmp_image_path_obj.resolve()} does not exist at cleanup (and wasn't a system temp explicitly deleted here).")

    def _interactive_review_session(self, highlights_list):
        """Let the user page through, reorder, recolour, edit or delete highlights.

        Works on shallow copies of the highlight dicts, so quitting with 'q'
        can hand back the untouched input list.

        Args:
            highlights_list: Highlight dicts as built by ``extract_highlights``.

        Returns:
            The reviewed list (after 's' or once every item is deleted), or the
            original ``highlights_list`` when the user confirms 'q'.
        """
        if not highlights_list:
            if self.run_args.show_progress:
                print(Fore.YELLOW + "No highlights to review.")
            return []

        reviewed_highlights = [dict(h) for h in highlights_list]
        idx, num_highlights = 0, len(reviewed_highlights)
        AVAILABLE_COLORS = ['yellow', 'green', 'blue', 'pink', 'other_color', 'unknown_color']
        prompt_options = ["[N]ext", "[P]rev", "[U]p", "[M]ove Down", "[C]olor", "[E]dit", "[D]elete", "[O]pen Img", "[S]ave&Exit", "[Q]uit"]
        action_prompt_str = Fore.CYAN + ", ".join(prompt_options) + "? > " + Style.RESET_ALL

        while 0 <= idx < num_highlights:
            item = reviewed_highlights[idx]
            print(Style.RESET_ALL + "\n" + "="*15 + f" Review HL {idx+1}/{num_highlights} (Page {item['page']}) " + "="*15)
            current_color_display = self._get_color_display_codes(item['color'])
            print(f"Color: {current_color_display}{item['color'].upper()}{Style.RESET_ALL}", end="")
            if item['color'] == 'other_color' and item.get('raw_rgb_values'):
                print(f" (RGB: {self._rgb_to_255(item['raw_rgb_values'])})", end="")
            print()
            print(f"Text: {item['text']}")

            action = input(action_prompt_str).lower().strip()

            if action == 'n':
                # Next/Prev wrap around at the ends of the list.
                idx = (idx + 1) % num_highlights if num_highlights > 0 else 0
            elif action == 'p':
                idx = (idx - 1 + num_highlights) % num_highlights if num_highlights > 0 else 0
            elif action == 'u':
                if idx > 0:
                    reviewed_highlights.insert(idx - 1, reviewed_highlights.pop(idx))
                    idx -= 1
                    print(Fore.GREEN + "Moved up.")
                else:
                    print(Fore.YELLOW + "Already at the top.")
            elif action == 'm':
                if idx < num_highlights - 1:
                    reviewed_highlights.insert(idx + 1, reviewed_highlights.pop(idx))
                    idx += 1
                    print(Fore.GREEN + "Moved down.")
                else:
                    print(Fore.YELLOW + "Already at the bottom.")
            elif action == 'c':
                print("Available colors:", ", ".join(f"{i+1}.{self._get_color_display_codes(co)}{co.upper()}{Style.RESET_ALL}" for i, co in enumerate(AVAILABLE_COLORS)))
                try:
                    choice_str = input(Fore.YELLOW + "Enter number for new color: " + Style.RESET_ALL)
                    if not choice_str:
                        print(Fore.BLUE + "Color change cancelled (no input).")
                        continue
                    choice = int(choice_str) - 1
                    if 0 <= choice < len(AVAILABLE_COLORS):
                        item['color'] = AVAILABLE_COLORS[choice]
                        print(Fore.GREEN + f"Color changed to {AVAILABLE_COLORS[choice].upper()}.")
                    else:
                        print(Fore.RED + "Invalid color choice.")
                except ValueError:
                    print(Fore.RED + "Invalid input. Please enter a number.")
            elif action == 'e':
                edit_prompt = Fore.YELLOW + "New text (blank=keep, 'CLEAR'=empty): > " + Style.RESET_ALL
                new_text = input_with_prefill(edit_prompt, item['text'])
                if new_text.strip().upper() == 'CLEAR':
                    item['text'] = ""
                    print(Fore.GREEN + "Text cleared.")
                elif new_text == item['text'] or not new_text.strip():
                    print(Fore.BLUE + "Text kept as is.")
                else:
                    item['text'] = new_text
                    print(Fore.GREEN + "Text updated.")
            elif action == 'd':
                if input(Fore.RED + "Are you sure you want to delete this highlight? [y/N]: " + Style.RESET_ALL).lower() == 'y':
                    reviewed_highlights.pop(idx)
                    num_highlights = len(reviewed_highlights)
                    print(Fore.GREEN + "Highlight deleted.")
                    if num_highlights == 0:
                        print(Fore.YELLOW + "No more highlights to review.")
                        break
                    # Deleting the last entry leaves idx one past the end.
                    if idx >= num_highlights:
                        idx = num_highlights - 1
                else:
                    print(Fore.BLUE + "Deletion cancelled.")
            elif action == 'o':
                self._view_page_image_interactively(item['page'])
            elif action == 's':
                print(Fore.GREEN + "Saving changes and exiting review session.")
                break
            elif action == 'q':
                if input(Fore.RED+"Are you sure you want to quit review? Changes will not be saved. [y/N]: " + Style.RESET_ALL).lower() == 'y':
                    print(Fore.YELLOW+"Quitting review session. Changes made in this session are DISCARDED.")
                    return highlights_list
                print(Fore.BLUE + "Quit cancelled.")
            else:
                print(Fore.RED + "Invalid action. Please choose from the list.")
        return reviewed_highlights

    # ------------------------------------------------------------------
    # Page selection, display and export
    # ------------------------------------------------------------------
    def _parse_specific_pages(self, pages_str, total_pages):
        """Parse a page spec such as ``"1,3-5"`` into a sorted list of pages.

        An empty spec or ``"all"`` (any case) selects every page. Ranges are
        clamped to ``1..total_pages``; single pages outside that range and
        reversed ranges are ignored. Any non-numeric part invalidates the
        whole spec.

        Returns:
            Sorted 1-based page numbers, or ``[]`` for an invalid spec.
        """
        if not pages_str or pages_str.lower() == "all":
            return list(range(1, total_pages + 1))
        parsed_pages = set()
        try:
            for part in pages_str.split(','):
                part = part.strip()
                if not part:
                    continue
                if '-' in part:
                    start_str, end_str = part.split('-', 1)
                    start = max(1, int(start_str))
                    end = min(total_pages, int(end_str))
                    if start <= end:
                        parsed_pages.update(range(start, end + 1))
                else:
                    page_val = int(part)
                    if 1 <= page_val <= total_pages:
                        parsed_pages.add(page_val)
        except ValueError as e:
            if self.run_args.show_progress:
                print(Fore.YELLOW + f"⚠️ Invalid page range: {pages_str}. Error: {e}.")
            return []
        return sorted(parsed_pages)

    def display_results(self):
        """Print the stored highlights grouped by page (no-op without progress output)."""
        if not self.run_args.show_progress:
            return  # Don't display if progress is off (e.g. silent)
        print("\n" + Fore.CYAN + Style.BRIGHT + "="*30 + " EXTRACTED HIGHLIGHTS " + "="*30 + Style.RESET_ALL)
        if not self.highlights_data:
            print("\n❌ No highlights extracted or all were deleted.")
            return

        current_page = None
        for item in self.highlights_data:
            if item.get('page') != current_page:
                current_page = item.get('page')
                print(f"\n📄 {Style.BRIGHT}Page {current_page}{Style.RESET_ALL}\n" + "-"*25)
            color_name = item.get('color', 'unknown_color')
            color_code = self._get_color_display_codes(color_name)
            num_segments = item.get('num_segments', 0)
            segment_info = f" [{num_segments} segments]" if num_segments > 1 else ""
            text_content = item.get('text', "*NO TEXT*")
            display_color_name = color_name.upper()
            if color_name == 'other_color':
                raw_rgb = item.get('raw_rgb_values')
                if raw_rgb and len(raw_rgb) >= 3:
                    display_color_name += f" (RGB: {self._rgb_to_255(raw_rgb)})"
            print(f"🎨 {color_code}{display_color_name}{Style.RESET_ALL}{segment_info}")
            print(f" \"{text_content}\"")
            print()

    def save_to_json(self, output_path_str):
        """Write the stored highlights plus run metadata to a JSON file.

        Parent directories are created as needed. Failures are reported
        according to the run flags and never raised to the caller.

        Args:
            output_path_str: Destination path; resolved to an absolute path.
        """
        args = self.run_args
        output_path = Path(output_path_str).resolve()  # Absolute path for clarity in messages
        try:
            output_path.parent.mkdir(parents=True, exist_ok=True)
            if args.debug:
                print(Fore.CYAN + f" [Debug] Ensured parent directory for JSON exists: {output_path.parent}")
        except Exception as e_mkdir:
            if args.show_progress:
                print(Fore.RED + f"❌ Error creating directory for JSON output {output_path.parent}: {e_mkdir}")
            if args.debug:
                traceback.print_exc()
            return  # Cannot save if directory cannot be made

        data_to_save = {
            'pdf_file_processed': str(self.pdf_path.name),
            'pdf_full_path': str(self.pdf_path.resolve()),
            'pages_processed_spec': args.pages if args.pages else DEFAULT_PAGES_TO_PROCESS,
            'extraction_timestamp': time.strftime("%Y-%m-%d %H:%M:%S %Z"),
            'total_highlights_extracted': len(self.highlights_data),
            'settings_used': {
                'clean_edges': args.clean_edges,
                'show_diff_percentage': args.show_diff_percentage,
            },
            'highlights_data': self.highlights_data,
        }
        try:
            with open(output_path, 'w', encoding='utf-8') as f:
                json.dump(data_to_save, f, indent=2, ensure_ascii=False)
            if args.show_progress:
                print(Fore.GREEN + f"💾 Data saved to {output_path}")
        except (OSError, TypeError, ValueError) as e:
            # TypeError/ValueError cover values json cannot serialise; without
            # them such a failure would escape this method entirely.
            if args.show_progress:
                print(Fore.RED + f"❌ Error saving JSON to {output_path}: {e}")
            if args.debug:
                traceback.print_exc()
def _build_cli_parser():
    """Create the command-line argument parser for the extractor."""
    parser = argparse.ArgumentParser(
        description="Enhanced PDF Highlight Extractor.",
        formatter_class=argparse.RawTextHelpFormatter,
        epilog=f"""Examples:
{sys.argv[0]} mydoc.pdf
{sys.argv[0]} mydoc.pdf -p "1,5-7" -i
{sys.argv[0]} -t -s --output-json results/test.json
{sys.argv[0]} doc.pdf -d
If interactive image viewing ('O' option) fails, try running with the -d (debug)
flag. This will print detailed information about image paths and creation steps.
Common issues include missing default PNG viewers or OS-level permission problems.
The IMAGE_FOLDER_PATH ('{IMAGE_FOLDER_PATH}') is relative to where you run the script.
""")
    parser.add_argument("pdf_path_arg", nargs='?', default=None, help="Path to PDF. Prompts if not in test/silent mode & not provided.")
    parser.add_argument("-p", "--pages", type=str, default=None, help=f"Pages (e.g., \"1,3-5\", \"all\"). Default: \"{DEFAULT_PAGES_TO_PROCESS}\".")
    parser.add_argument("-i", "--interactive", action="store_true", help="Enable interactive review mode.")
    parser.add_argument("-t", "--test", action="store_true", help=f"Test mode. Uses default PDF ('{DEFAULT_PDF_PATH}'), auto-saves JSON.")
    parser.add_argument("-s", "--silent", action="store_true", help="Silent mode. Minimal output. Auto-saves JSON. Implies -t if no PDF path.")
    parser.add_argument("-d", "--debug", action="store_true", help="Debug mode. Enables all detailed SHOW flags and prints more internal details.")
    parser.add_argument("--output-json", type=str, default=None, help="Custom output JSON filename/path.")
    return parser


def _derive_effective_run_args(cli_args):
    """Combine the module defaults with the CLI flags into one namespace.

    Debug switches every SHOW flag on; silent is applied afterwards, so it
    wins when both are given, and it also disables interactive mode.
    """
    show_flag_names = ['show_timing', 'show_progress', 'show_raw_segments',
                       'show_extraction_details', 'show_rect_details', 'show_diff_percentage']
    run_args = argparse.Namespace(
        debug=cli_args.debug,
        silent=cli_args.silent,
        # Initialize based on global defaults
        show_timing=INITIAL_SHOW_TIMING,
        show_progress=INITIAL_SHOW_PROGRESS,
        show_raw_segments=INITIAL_SHOW_RAW_SEGMENTS,
        show_extraction_details=INITIAL_SHOW_EXTRACTION_DETAILS,
        show_rect_details=INITIAL_SHOW_RECT_DETAILS,
        show_diff_percentage=INITIAL_SHOW_DIFF_PERCENTAGE,
        clean_edges=INITIAL_CLEAN_EDGES,
    )
    if run_args.debug:
        for name in show_flag_names:
            setattr(run_args, name, True)  # Debug enables all these
    if run_args.silent:
        for name in show_flag_names:
            setattr(run_args, name, False)  # Silent disables all these
    run_args.interactive = False if run_args.silent else cli_args.interactive
    run_args.pages = cli_args.pages
    return run_args


def main():
    """Command-line entry point: parse arguments, extract, display and save.

    Exits with status 1 when no usable PDF path is available or when
    processing fails with an unexpected error (after cleanup has run).
    """
    cli_args = _build_cli_parser().parse_args()
    effective_run_args = _derive_effective_run_args(cli_args)

    start_time = time.time()
    if effective_run_args.show_progress:
        print(Fore.MAGENTA + Style.BRIGHT + "🎨 PDF Highlight Extractor 🎨" + Style.RESET_ALL)
    if effective_run_args.debug:
        print(Fore.CYAN + f" [Debug] Current Working Directory: {Path.cwd()}")
        print(Fore.CYAN + f" [Debug] Effective Run Arguments: {effective_run_args}")
    if IMAGE_FOLDER_PATH and CLEAR_IMAGE_FOLDER_ON_START:
        _clear_png_files_in_folder(IMAGE_FOLDER_PATH, effective_run_args)

    # Choose the PDF: -t forces the default, then an explicit path, then the
    # default again for -s, otherwise ask the user.
    if cli_args.test:
        pdf_path_to_use = DEFAULT_PDF_PATH
    elif cli_args.pdf_path_arg:
        pdf_path_to_use = cli_args.pdf_path_arg
    elif cli_args.silent:
        pdf_path_to_use = DEFAULT_PDF_PATH
    else:
        pdf_path_input = input(f"📄 PDF path (Enter for default '{DEFAULT_PDF_PATH}'): ").strip().strip('"')
        pdf_path_to_use = pdf_path_input if pdf_path_input else DEFAULT_PDF_PATH
    if not pdf_path_to_use:
        if effective_run_args.show_progress:
            print(Fore.RED + "❌ No PDF path specified. Exiting.")
        sys.exit(1)

    resolved_path = Path(pdf_path_to_use).resolve()
    if not resolved_path.exists() or not resolved_path.is_file():
        if effective_run_args.show_progress:
            print(Fore.RED + f"❌ PDF not found or is not a file: {resolved_path}")
        sys.exit(1)

    doc_for_processing = None
    failed = False
    try:
        doc_for_processing = fitz.open(str(resolved_path))
        extractor = EnhancedPDFHighlightExtractor(resolved_path, effective_run_args, main_doc_for_image_view=doc_for_processing)
        extractor.extract_highlights(doc_for_processing)

        if not effective_run_args.interactive and effective_run_args.show_progress:
            extractor.display_results()
        elif effective_run_args.interactive and effective_run_args.show_progress:
            if input(Fore.CYAN+"Interactive session ended. Display final results? [Y/n]: " + Style.RESET_ALL).lower().strip() != 'n':
                extractor.display_results()

        json_output_path_str = cli_args.output_json if cli_args.output_json else str(resolved_path.parent / f"{resolved_path.stem}_highlights.json")
        perform_save = False
        if cli_args.test or cli_args.silent:
            perform_save = True
        elif effective_run_args.show_progress:  # Only prompt if not silent
            save_prompt_input = input(f"💾 Save to JSON? (Enter for default '{json_output_path_str}', type 'skip' to not save, or enter a custom path): " + Style.RESET_ALL).strip()
            if save_prompt_input.lower() != 'skip':
                perform_save = True
                if save_prompt_input:
                    json_output_path_str = save_prompt_input

        if perform_save:
            if extractor.highlights_data:
                extractor.save_to_json(json_output_path_str)
            elif effective_run_args.show_progress:
                print(Fore.YELLOW + "No highlights were extracted or kept, so JSON file was not saved.")
        elif effective_run_args.show_progress:
            print(Fore.BLUE + "Skipped saving highlights to JSON.")
    except Exception as e:
        # Remember the failure so the process can report it via its exit
        # status; in silent mode nothing is printed, so the status is the
        # only signal the caller gets.
        failed = True
        if effective_run_args.show_progress:
            print(Fore.RED+Style.BRIGHT+f"💥 A critical error occurred in the main execution: {e}")
        if effective_run_args.debug:
            traceback.print_exc()
    finally:
        if doc_for_processing:
            doc_for_processing.close()
        if IMAGE_FOLDER_PATH and CLEAR_IMAGE_FOLDER_ON_END:
            _clear_png_files_in_folder(IMAGE_FOLDER_PATH, effective_run_args)
        if effective_run_args.show_timing:
            print(Fore.CYAN + f"\n⏱️ Total execution time: {time.time()-start_time:.2f} seconds")

    if failed:
        sys.exit(1)


if __name__ == '__main__':
    main()