1428 lines
55 KiB
Python
1428 lines
55 KiB
Python
#!/usr/bin/env python3
|
|
# =============================================================================
|
|
# ENHANCED PDF HIGHLIGHT EXTRACTOR
|
|
# Author: Perplexity AI Companion (Updated by User Feedback)
|
|
# Date: June 3, 2025
|
|
# License: MIT
|
|
#
|
|
# Extracts highlights from PDF files, with options for interactive review,
|
|
# detailed output, text cleaning, JSON export, and page image viewing.
|
|
# =============================================================================
|
|
|
|
import time
|
|
import os
|
|
import fitz # PyMuPDF
|
|
import json
|
|
from colorama import init, Fore, Back, Style
|
|
from pathlib import Path
|
|
import re
|
|
import string
|
|
import sys
|
|
import traceback
|
|
import argparse
|
|
import difflib # For text difference calculation
|
|
import tempfile # For temporary image files
|
|
import webbrowser # For opening images/PDFs
|
|
import uuid # For unique filenames
|
|
|
|
# Attempt to import readline for better input() experience on some systems
|
|
# readline is optional. When it can be imported, input_with_prefill() uses it
# to place editable text on the prompt line; otherwise a plain prompt is used.
try:
    import readline
except ImportError:
    READLINE_AVAILABLE = False  # readline not available
else:
    READLINE_AVAILABLE = True
|
|
|
|
# =============================================================================
|
|
# GLOBAL CONFIGURATION FLAGS (Defaults, can be overridden by CLI args)
|
|
# =============================================================================
|
|
DEFAULT_PDF_PATH = "test/test4.pdf"  # Local test PDF
# Page spec: "all", or a comma list of pages/ranges such as "1,3-5"
DEFAULT_PAGES_TO_PROCESS = "1"

# Default behaviour flags (can be influenced by -d or -s CLI flags).
# They only seed effective_run_args; keep them distinct from that object.
INITIAL_SHOW_TIMING = True
INITIAL_SHOW_PROGRESS = True
INITIAL_SHOW_RAW_SEGMENTS = True
INITIAL_SHOW_EXTRACTION_DETAILS = True
INITIAL_SHOW_RECT_DETAILS = True
INITIAL_SHOW_DIFF_PERCENTAGE = True
INITIAL_CLEAN_EDGES = True

# Text extraction parameters (generally fixed): padding added around each
# highlight quad before the page text is clipped.
TEXT_EXTRACTION_HORIZONTAL_PADDING = 6.0
TEXT_EXTRACTION_VERTICAL_PADDING = 1.0

# Edge cleaning configuration (generally fixed): short tokens that count as
# real words and therefore survive edge cleaning.
VALID_TWO_LETTER_WORDS = set(
    "am an as at be by do go he if in is it "
    "me my no of on or ox so to up us we".split()
)
VALID_SINGLE_LETTERS = {"i", "a"}

# Image handling configuration
IMAGE_FOLDER_PATH = "pdf_page_images"  # Relative to CWD by default
CLEAR_IMAGE_FOLDER_ON_START = True
CLEAR_IMAGE_FOLDER_ON_END = False
|
|
|
|
# Initialize colorama
|
|
init(autoreset=True)
|
|
|
|
|
|
# --- Helper Functions ---
|
|
def get_text_diff_ratio(text1, text2):
    """Return the difflib similarity ratio (0.0-1.0) of two texts.

    Two empty/falsy values count as identical (1.0); exactly one empty/falsy
    value counts as completely different (0.0). Otherwise both values are
    converted with str() and compared with difflib.SequenceMatcher.
    """
    first_missing = not text1
    second_missing = not text2
    if first_missing and second_missing:
        return 1.0
    if first_missing or second_missing:
        return 0.0
    matcher = difflib.SequenceMatcher(None, str(text1), str(text2))
    return matcher.ratio()
|
|
|
|
|
|
def clean_segment_edges_func(text_to_clean, clean_edges_setting):
    """Trim stray 1-2 letter fragments and bare punctuation from both ends.

    Returns the input untouched when cleaning is disabled or the text is
    empty. Otherwise whitespace is collapsed to single spaces and tokens are
    removed from the start, then from the end, for as long as the edge token
    is either pure punctuation or a one/two-letter alphabetic fragment that
    is not listed in VALID_SINGLE_LETTERS / VALID_TWO_LETTER_WORDS.
    """
    if not (clean_edges_setting and text_to_clean):
        return text_to_clean

    def is_stray_fragment(core):
        """True for a 1- or 2-letter alphabetic token that is not a known word."""
        if len(core) not in (1, 2) or not core.isalpha():
            return False
        known = VALID_SINGLE_LETTERS if len(core) == 1 else VALID_TWO_LETTER_WORDS
        return core.lower() not in known

    text_to_clean = re.sub(r"\s+", " ", text_to_clean.strip())
    tokens = text_to_clean.split()
    if not tokens:
        return text_to_clean

    # Leading edge: punctuation is peeled off the right side of the first
    # token so that e.g. "x," is recognised as the fragment "x".
    while tokens:
        head = tokens[0]
        core = head.rstrip(string.punctuation)
        if not core:
            del tokens[0]
            continue
        if not is_stray_fragment(core):
            break
        suffix = head[len(core):]
        if suffix:
            # Keep the punctuation for one more pass; it is dropped next.
            tokens[0] = suffix
        else:
            del tokens[0]

    # Trailing edge: punctuation is peeled off the left side of the last token.
    while tokens:
        tail = tokens[-1]
        core = tail.lstrip(string.punctuation)
        if not core:
            tokens.pop()
            continue
        if not is_stray_fragment(core):
            break
        prefix = tail[: len(tail) - len(core)]
        if prefix:
            tokens[-1] = prefix
        else:
            tokens.pop()

    return " ".join(tokens)
|
|
|
|
|
|
def input_with_prefill(prompt, text):
    """Prompt for a line of input, offering *text* as the starting value.

    With readline available, *text* is inserted into the edit buffer so the
    user can modify it in place. Without readline, *text* is printed above a
    normal prompt and the user types the replacement from scratch.

    Returns whatever input() returns.
    """
    if not READLINE_AVAILABLE:
        print(
            Fore.MAGENTA + "Current text (edit below):\n" + Style.RESET_ALL + f"{text}"
        )
        return input(prompt)

    def hook():
        readline.insert_text(text)
        readline.redisplay()

    readline.set_pre_input_hook(hook)
    try:
        return input(prompt)
    finally:
        # Always remove the hook, even if input() is interrupted (Ctrl-C/EOF);
        # otherwise every later prompt would be pre-filled with this text.
        readline.set_pre_input_hook()
|
|
|
|
|
|
def _clear_png_files_in_folder(folder_path_str, run_args_for_print_control):
|
|
# This function CLEARS files if folder exists. It DOES NOT CREATE the folder.
|
|
if not folder_path_str:
|
|
return
|
|
|
|
folder = Path(folder_path_str) # Path relative to CWD if not absolute
|
|
abs_folder_path = folder.resolve()
|
|
|
|
if run_args_for_print_control.debug:
|
|
print(
|
|
Fore.CYAN
|
|
+ f" [Debug] _clear_png_files_in_folder: Checking {abs_folder_path} (Specified as: '{folder_path_str}')"
|
|
)
|
|
|
|
if abs_folder_path.is_dir():
|
|
if run_args_for_print_control.show_progress:
|
|
print(Fore.BLUE + f"Clearing *.png files from {abs_folder_path}...")
|
|
cleared_count = 0
|
|
try:
|
|
for file_path in abs_folder_path.glob("*.png"):
|
|
if file_path.is_file():
|
|
file_path.unlink()
|
|
cleared_count += 1
|
|
except Exception as e:
|
|
if (
|
|
run_args_for_print_control.show_progress
|
|
): # Also show error if progress is on
|
|
print(
|
|
Fore.RED + f"Error during file deletion in {abs_folder_path}: {e}"
|
|
)
|
|
|
|
if run_args_for_print_control.show_progress:
|
|
if cleared_count > 0:
|
|
print(
|
|
Fore.BLUE
|
|
+ f"Cleared {cleared_count} *.png files from {abs_folder_path}."
|
|
)
|
|
else:
|
|
print(
|
|
Fore.BLUE + f"No *.png files found to clear in {abs_folder_path}."
|
|
)
|
|
else:
|
|
if run_args_for_print_control.show_progress:
|
|
print(
|
|
Fore.YELLOW
|
|
+ f"Image folder {abs_folder_path} not found, skipping clear."
|
|
)
|
|
elif (
|
|
run_args_for_print_control.debug
|
|
): # Still log if not found in debug, even if not show_progress
|
|
print(
|
|
Fore.CYAN
|
|
+ f" [Debug] _clear_png_files_in_folder: Folder {abs_folder_path} does not exist. Nothing to clear."
|
|
)
|
|
|
|
|
|
class EnhancedPDFHighlightExtractor:
|
|
def __init__(self, pdf_path, effective_run_args, main_doc_for_image_view=None):
|
|
self.pdf_path = Path(pdf_path)
|
|
self.run_args = effective_run_args
|
|
self.highlights_data = []
|
|
self.main_doc_for_image_view = main_doc_for_image_view
|
|
|
|
def _get_highlight_color_from_rgb_tuple(self, rgb_tuple_floats_or_ints):
|
|
if not rgb_tuple_floats_or_ints or len(rgb_tuple_floats_or_ints) < 3:
|
|
return "unknown_color"
|
|
r, g, b = [
|
|
int(x * 255) if isinstance(x, float) and 0.0 <= x <= 1.0 else int(x)
|
|
for x in rgb_tuple_floats_or_ints[:3]
|
|
]
|
|
|
|
# Specific blue highlight color
|
|
if r == 142 and g == 221 and b == 249:
|
|
return "blue"
|
|
# Yellow highlights (high red/green, low blue)
|
|
if r > 200 and g > 200 and b < 150:
|
|
return "yellow"
|
|
# Green highlights (low red/blue, high green)
|
|
if r < 150 and g > 180 and b < 150:
|
|
return "green"
|
|
# Blue highlights (low red/green, high blue)
|
|
if r < 150 and g < 180 and b > 180:
|
|
return "blue"
|
|
# Pink highlights (high red/blue, low green)
|
|
if r > 180 and g < 180 and b > 180:
|
|
return "pink"
|
|
return "other_color"
|
|
|
|
def _get_highlight_color_from_annot_colors_dict(self, colors_dict):
|
|
if not colors_dict:
|
|
return "unknown_color", None
|
|
rgb_tuple = colors_dict.get("stroke") or colors_dict.get("fill")
|
|
if not rgb_tuple:
|
|
return "unknown_color", None
|
|
return self._get_highlight_color_from_rgb_tuple(rgb_tuple), rgb_tuple[:3]
|
|
|
|
def _extract_text_from_multi_segment_highlight(self, page, annot, page_num, hl_id):
|
|
overall_highlight_color_name, _ = (
|
|
self._get_highlight_color_from_annot_colors_dict(annot.colors)
|
|
)
|
|
color_code_for_segment_print = self._get_color_display_codes(
|
|
overall_highlight_color_name
|
|
)
|
|
quads_vertices = annot.vertices
|
|
if not quads_vertices:
|
|
if self.run_args.show_extraction_details:
|
|
print(
|
|
Fore.YELLOW
|
|
+ f" No quads for HL {hl_id} on page {page_num}"
|
|
)
|
|
return None, 0, []
|
|
|
|
processed_quads_as_points_list = []
|
|
if len(quads_vertices) % 4 == 0:
|
|
for i in range(0, len(quads_vertices), 4):
|
|
try:
|
|
quad_points = [fitz.Point(p) for p in quads_vertices[i : i + 4]]
|
|
processed_quads_as_points_list.append(quad_points)
|
|
except Exception as e:
|
|
if self.run_args.show_extraction_details:
|
|
print(
|
|
Fore.YELLOW
|
|
+ f" Skipping malformed quad points: {e}"
|
|
)
|
|
continue
|
|
|
|
try:
|
|
sorted_quad_points_list = sorted(
|
|
processed_quads_as_points_list,
|
|
key=lambda qp_list: (
|
|
fitz.Quad(qp_list).rect.y0,
|
|
fitz.Quad(qp_list).rect.x0,
|
|
),
|
|
)
|
|
except Exception as e:
|
|
if self.run_args.show_extraction_details:
|
|
print(
|
|
Fore.RED
|
|
+ f" Error sorting quads for HL {hl_id}: {e}. Using original order."
|
|
)
|
|
sorted_quad_points_list = processed_quads_as_points_list
|
|
|
|
if self.run_args.show_extraction_details:
|
|
print(
|
|
color_code_for_segment_print
|
|
+ Fore.CYAN
|
|
+ f" Processing {len(sorted_quad_points_list)} segments for HL {hl_id} "
|
|
+ f"(Color: {overall_highlight_color_name.upper()}) on page {page_num}"
|
|
+ Style.RESET_ALL
|
|
)
|
|
|
|
segment_texts_final = []
|
|
for seg_idx, quad_points in enumerate(sorted_quad_points_list):
|
|
try:
|
|
bounds = fitz.Quad(quad_points).rect
|
|
padded_rect = fitz.Rect(
|
|
bounds.x0 - TEXT_EXTRACTION_HORIZONTAL_PADDING,
|
|
bounds.y0 - TEXT_EXTRACTION_VERTICAL_PADDING,
|
|
bounds.x1 + TEXT_EXTRACTION_HORIZONTAL_PADDING,
|
|
bounds.y1 + TEXT_EXTRACTION_VERTICAL_PADDING,
|
|
)
|
|
padded_rect.intersect(page.rect)
|
|
if padded_rect.is_empty:
|
|
if self.run_args.show_extraction_details:
|
|
print(
|
|
Fore.YELLOW
|
|
+ f" Segment {seg_idx + 1} empty padded_rect for HL {hl_id}"
|
|
)
|
|
continue
|
|
raw_text_from_pdf_segment = page.get_text(
|
|
"text", clip=padded_rect, sort=True
|
|
).strip()
|
|
cleaned_text_segment = re.sub(
|
|
r"\s+", " ", raw_text_from_pdf_segment
|
|
).strip()
|
|
cleaned_text_segment = re.sub(
|
|
r"[\x00-\x08\x0B\x0C\x0E-\x1F\x7F-\x9F]", "", cleaned_text_segment
|
|
)
|
|
final_text_segment = clean_segment_edges_func(
|
|
cleaned_text_segment, self.run_args.clean_edges
|
|
)
|
|
|
|
if final_text_segment:
|
|
segment_texts_final.append(final_text_segment)
|
|
if (
|
|
self.run_args.show_raw_segments
|
|
and not self.run_args.interactive
|
|
):
|
|
print(
|
|
color_code_for_segment_print
|
|
+ Fore.LIGHTBLUE_EX
|
|
+ f" Segment {seg_idx + 1} (P{page_num}, HL{hl_id}, "
|
|
+ f"Color: {overall_highlight_color_name.upper()}):"
|
|
+ Style.RESET_ALL
|
|
)
|
|
if self.run_args.show_diff_percentage:
|
|
similarity = get_text_diff_ratio(
|
|
raw_text_from_pdf_segment, final_text_segment
|
|
)
|
|
diff_percent = (1 - similarity) * 100
|
|
print(
|
|
Fore.LIGHTMAGENTA_EX
|
|
+ f' Raw PDF : "{raw_text_from_pdf_segment}"'
|
|
)
|
|
print(
|
|
Fore.LIGHTBLUE_EX
|
|
+ f' Final Seg: "{final_text_segment}"'
|
|
)
|
|
print(
|
|
Fore.YELLOW + f" Diff: {diff_percent:.2f}%"
|
|
)
|
|
else:
|
|
print(
|
|
Fore.LIGHTBLUE_EX
|
|
+ f' Final Seg: "{final_text_segment}"'
|
|
)
|
|
except Exception as e:
|
|
if self.run_args.show_extraction_details:
|
|
print(
|
|
Fore.RED
|
|
+ f" Error processing segment {seg_idx + 1} for HL {hl_id}: {e}"
|
|
)
|
|
continue
|
|
|
|
if not segment_texts_final:
|
|
return None, len(sorted_quad_points_list)
|
|
combined_text = segment_texts_final[0]
|
|
for current_text in segment_texts_final[1:]:
|
|
if combined_text.endswith("-") or combined_text.endswith("¬"):
|
|
combined_text = combined_text.rstrip("-¬") + current_text
|
|
else:
|
|
combined_text += " " + current_text
|
|
|
|
if self.run_args.clean_edges:
|
|
combined_text = clean_segment_edges_func(
|
|
combined_text, self.run_args.clean_edges
|
|
)
|
|
combined_text = re.sub(r"\s+", " ", combined_text).strip()
|
|
return combined_text if combined_text else None, len(sorted_quad_points_list)
|
|
|
|
def extract_highlights(self, doc):
|
|
all_extracted_highlights = []
|
|
try:
|
|
if self.run_args.show_progress and not self.run_args.interactive:
|
|
print(
|
|
Fore.BLUE
|
|
+ f"\n🎨 Processing highlights for PDF: {self.pdf_path.name}"
|
|
)
|
|
|
|
pages_str_to_parse = (
|
|
self.run_args.pages if self.run_args.pages else DEFAULT_PAGES_TO_PROCESS
|
|
)
|
|
pages_to_process = self._parse_specific_pages(
|
|
pages_str_to_parse, doc.page_count
|
|
)
|
|
if not pages_to_process:
|
|
if self.run_args.show_progress:
|
|
print(Fore.YELLOW + "No valid pages selected.")
|
|
return []
|
|
|
|
highlight_id_counter_on_page = {}
|
|
for page_num in pages_to_process:
|
|
page = doc.load_page(page_num - 1)
|
|
highlight_id_counter_on_page.setdefault(page_num, 0)
|
|
if self.run_args.show_progress and not self.run_args.interactive:
|
|
print(Fore.CYAN + f" 📄 Processing Page {page_num}...")
|
|
try:
|
|
page_annotations = list(page.annots())
|
|
except Exception as e:
|
|
if self.run_args.show_progress:
|
|
print(Fore.RED + f" ⚠️ Error loading annots: {e}")
|
|
continue
|
|
|
|
highlight_annotations = [
|
|
a
|
|
for a in page_annotations
|
|
if hasattr(a, "type")
|
|
and a.type[0] == fitz.PDF_ANNOT_HIGHLIGHT
|
|
and hasattr(a, "rect")
|
|
and a.rect
|
|
]
|
|
if not highlight_annotations:
|
|
if self.run_args.show_progress and not self.run_args.interactive:
|
|
print(Fore.WHITE + f" No highlights on page {page_num}.")
|
|
continue
|
|
|
|
if self.run_args.show_rect_details:
|
|
print(
|
|
Fore.YELLOW
|
|
+ f"--- Annotations before sorting (Page {page_num}) ---"
|
|
)
|
|
temp_debug_list = []
|
|
for annot_debug in highlight_annotations:
|
|
debug_text_snippet = (
|
|
page.get_text("text", clip=annot_debug.rect)
|
|
.strip()
|
|
.replace("\n", " ")
|
|
)
|
|
color_name_debug, rgb_values_debug = (
|
|
self._get_highlight_color_from_annot_colors_dict(
|
|
annot_debug.colors
|
|
)
|
|
)
|
|
rgb_display = (
|
|
f"RGB: {tuple(int(c * 255) if isinstance(c, float) else int(c) for c in rgb_values_debug[:3])}"
|
|
if rgb_values_debug
|
|
else "RGB: N/A"
|
|
)
|
|
temp_debug_list.append(
|
|
{
|
|
"rect": annot_debug.rect,
|
|
"text_snippet": debug_text_snippet,
|
|
"color_name": color_name_debug,
|
|
"rgb_display": rgb_display,
|
|
"vertices_count": (
|
|
len(annot_debug.vertices)
|
|
if annot_debug.vertices
|
|
else 0
|
|
),
|
|
}
|
|
)
|
|
temp_debug_list.sort(
|
|
key=lambda item: (item["rect"].y0, item["rect"].x0)
|
|
)
|
|
for item_idx, item_val in enumerate(temp_debug_list):
|
|
print(
|
|
f" {item_idx + 1}. Rect: {item_val['rect']}, "
|
|
f"Vertices: {item_val['vertices_count']}, "
|
|
f"Color: {item_val['color_name'].upper()} "
|
|
f"({item_val['rgb_display']}), "
|
|
f"Text: \"{item_val['text_snippet']}\""
|
|
)
|
|
print(
|
|
Fore.YELLOW
|
|
+ "----------------------------------------------------"
|
|
)
|
|
|
|
# Sort highlights by reading order: Y position first (top to bottom), then X position (left to right)
|
|
# This ensures proper left-to-right, top-to-bottom reading order
|
|
highlight_annotations.sort(key=lambda a: (a.rect.y0, a.rect.x0))
|
|
|
|
for annot in highlight_annotations:
|
|
try:
|
|
highlight_id_counter_on_page[page_num] += 1
|
|
current_hl_id_on_page = highlight_id_counter_on_page[page_num]
|
|
color_name, raw_rgb_floats = (
|
|
self._get_highlight_color_from_annot_colors_dict(
|
|
annot.colors
|
|
)
|
|
)
|
|
extracted_text, num_segments = (
|
|
self._extract_text_from_multi_segment_highlight(
|
|
page, annot, page_num, current_hl_id_on_page
|
|
)
|
|
)
|
|
if extracted_text and extracted_text.strip():
|
|
if (
|
|
self.run_args.show_extraction_details
|
|
and not self.run_args.interactive
|
|
):
|
|
print(
|
|
Fore.GREEN
|
|
+ f' ✅ Final (P{page_num}, HL{current_hl_id_on_page}): "{extracted_text[:100]}"'
|
|
)
|
|
all_extracted_highlights.append(
|
|
{
|
|
"page": page_num,
|
|
"highlight_id_on_page": current_hl_id_on_page,
|
|
"text": extracted_text,
|
|
"color": color_name,
|
|
"raw_rgb_values": raw_rgb_floats,
|
|
"type": "highlight",
|
|
"y_position": annot.rect.y0,
|
|
"x_position": annot.rect.x0,
|
|
"rect_details": (
|
|
annot.rect.x0,
|
|
annot.rect.y0,
|
|
annot.rect.x1,
|
|
annot.rect.y1,
|
|
),
|
|
"num_segments": num_segments,
|
|
}
|
|
)
|
|
elif (
|
|
self.run_args.show_progress
|
|
and not self.run_args.interactive
|
|
):
|
|
print(
|
|
Fore.YELLOW
|
|
+ f" ⚠️ No text for HL {current_hl_id_on_page} on page {page_num}"
|
|
)
|
|
except Exception as e:
|
|
if (
|
|
self.run_args.show_progress
|
|
and not self.run_args.interactive
|
|
):
|
|
print(
|
|
Fore.RED
|
|
+ f" 🔴 Error processing annot on page {page_num}: {e}"
|
|
)
|
|
if self.run_args.debug:
|
|
traceback.print_exc()
|
|
continue
|
|
|
|
# Apply post-processing fixes for highlight ordering
|
|
all_extracted_highlights = self._fix_highlight_ordering(
|
|
all_extracted_highlights
|
|
)
|
|
|
|
if self.run_args.interactive:
|
|
print(Fore.MAGENTA + "\nEntering interactive review session...")
|
|
self.highlights_data = self._interactive_review_session(
|
|
all_extracted_highlights
|
|
)
|
|
else:
|
|
self.highlights_data = all_extracted_highlights
|
|
|
|
if (
|
|
self.run_args.show_progress
|
|
and not self.run_args.interactive
|
|
and not self.run_args.silent
|
|
):
|
|
print(
|
|
Fore.MAGENTA
|
|
+ f" 📊 Total highlights extracted: {len(self.highlights_data)}"
|
|
)
|
|
return self.highlights_data
|
|
except Exception as e:
|
|
print(Fore.RED + f"❌ Major error during highlight extraction: {e}")
|
|
if self.run_args.debug:
|
|
traceback.print_exc()
|
|
return []
|
|
|
|
def _view_page_image_interactively(self, page_num_to_view):
    """Render a page to a PNG, open it in the default viewer and wait for Enter.

    The page is rendered from self.main_doc_for_image_view at 150 dpi. When
    IMAGE_FOLDER_PATH is set the PNG is written there under a unique name and
    left in place; otherwise a system temporary file is used and removed once
    the user has confirmed. page_num_to_view is 1-based.

    Returns None. Failures are reported on the console rather than raised.
    """
    document = self.main_doc_for_image_view
    if not document:
        print(
            Fore.RED
            + "Error: PDF document not available for image rendering. This should not happen."
        )
        return

    opts = self.run_args
    image_path = None
    in_managed_folder = False
    saved_ok = False

    if opts.show_progress:
        print(Fore.BLUE + f"Preparing to view image for page {page_num_to_view}...")

    try:
        # ---- Stage 1: render the page and write the PNG -------------------
        try:
            page_index = page_num_to_view - 1
            page = document.load_page(page_index)
            if opts.debug:
                print(
                    Fore.CYAN
                    + f" [Debug] Loaded page object for index {page_index}: {page}"
                )

            pix = page.get_pixmap(dpi=150)
            if opts.debug:
                print(
                    Fore.CYAN
                    + f" [Debug] Created pixmap: {pix}. Alpha: {pix.alpha}, Colorspace: {pix.colorspace.name}"
                )

            if IMAGE_FOLDER_PATH:
                # Relative paths are resolved against the working directory.
                abs_img_dir = Path(IMAGE_FOLDER_PATH).resolve()
                if opts.debug:
                    print(
                        Fore.CYAN
                        + f" [Debug] Using IMAGE_FOLDER_PATH: '{IMAGE_FOLDER_PATH}' (Absolute: {abs_img_dir})"
                    )
                try:
                    abs_img_dir.mkdir(parents=True, exist_ok=True)
                    if opts.debug:
                        print(
                            Fore.CYAN
                            + f" [Debug] Ensured image directory exists: {abs_img_dir} (Status: {abs_img_dir.is_dir()})"
                        )
                except Exception as e_mkdir:
                    print(
                        Fore.RED
                        + f" ERROR: Could not create directory {abs_img_dir}: {e_mkdir}"
                    )
                    if opts.debug:
                        traceback.print_exc()
                    # Do not proceed if directory creation fails.
                    input(Fore.CYAN + "Press Enter to acknowledge and continue...")
                    return

                unique_id = uuid.uuid4().hex[:8]
                image_path = abs_img_dir / f"page_{page_num_to_view}_{unique_id}.png"
                in_managed_folder = True
            else:
                fd, temp_path_str = tempfile.mkstemp(
                    suffix=".png", prefix="pdf_page_img_"
                )
                os.close(fd)  # only the name is needed; the pixmap writes the file
                image_path = Path(temp_path_str)
                if opts.debug:
                    print(
                        Fore.CYAN
                        + f" [Debug] Using system temporary file: {image_path.resolve()}"
                    )

            resolved_save_path = image_path.resolve()
            if opts.debug:
                print(
                    Fore.CYAN
                    + f" [Debug] Attempting to save image to: {resolved_save_path}"
                )

            pix.save(str(resolved_save_path))

            if resolved_save_path.exists() and resolved_save_path.is_file():
                saved_ok = True
                if opts.show_progress:
                    print(
                        Fore.GREEN
                        + f" Image for page {page_num_to_view} successfully saved to: {resolved_save_path}"
                    )
                if opts.debug:
                    print(
                        Fore.CYAN
                        + f" [Debug] File size: {resolved_save_path.stat().st_size} bytes"
                    )
            elif opts.show_progress:
                print(
                    Fore.RED
                    + f" ERROR: Failed to save image to {resolved_save_path}. File does not exist after save attempt."
                )
        except Exception as e_render_save:
            if opts.show_progress:
                print(
                    Fore.RED
                    + f" Error during image rendering or saving: {e_render_save}"
                )
            if opts.debug:
                traceback.print_exc()

        if image_path and image_path.exists():
            if in_managed_folder and opts.debug:
                print(
                    Fore.CYAN
                    + f" [Debug] Image '{image_path.name}' remains in managed folder '{IMAGE_FOLDER_PATH}'."
                )
                print(
                    Fore.CYAN
                    + f" [Debug] It will be cleared based on CLEAR_IMAGE_FOLDER_ON_END ({CLEAR_IMAGE_FOLDER_ON_END})."
                )
        elif image_path and not image_path.exists() and saved_ok:
            if opts.debug:
                print(
                    Fore.RED
                    + f" [Debug] Inconsistency: Image was marked saved, but {image_path.resolve()} "
                    + "does not exist at cleanup (and wasn't a system temp explicitly deleted here)."
                )

        # ---- Stage 2: hand the file to the default application ------------
        if saved_ok and image_path:
            if opts.show_progress:
                print(
                    Fore.CYAN + "Attempting to open image with default application..."
                )
            try:
                file_uri = image_path.resolve().as_uri()
                if opts.debug:
                    print(Fore.CYAN + f" [Debug] Opening URI: {file_uri}")

                opened_successfully = webbrowser.open(file_uri)

                if opts.debug:
                    print(
                        Fore.CYAN
                        + f" [Debug] webbrowser.open() returned: {opened_successfully}"
                    )

                if not opened_successfully:
                    if opts.show_progress:
                        print(
                            Fore.YELLOW
                            + " webbrowser.open() reported failure (returned False or None)."
                        )
                        print(
                            Fore.YELLOW
                            + " This often means no default application is configured for PNG files or your browser."
                        )
                elif opts.show_progress:
                    print(
                        Fore.GREEN
                        + " Image hopefully opened. Check your applications."
                    )

                if opts.show_progress:
                    print(
                        Fore.YELLOW
                        + f" If the image did not open, please manually open: {image_path.resolve()}"
                    )
                input(Fore.CYAN + "Press Enter after viewing image to continue...")
            except Exception as e_open:
                if opts.show_progress:
                    print(
                        Fore.RED + f" Could not open image using webbrowser: {e_open}"
                    )
                    print(
                        Fore.YELLOW
                        + " This could be due to your system's environment (e.g., missing 'xdg-utils' on Linux, no default PNG viewer)."
                    )
                    print(
                        Fore.YELLOW
                        + f" Please try opening the image manually: {image_path.resolve()}"
                    )
                if opts.debug:
                    traceback.print_exc()
                input(Fore.CYAN + "Press Enter to acknowledge and continue...")
        elif image_path:
            if opts.show_progress:
                print(
                    Fore.YELLOW
                    + " Skipping attempt to open image as it was not saved successfully."
                )
            input(Fore.CYAN + "Press Enter to continue...")
        else:
            if opts.show_progress:
                print(
                    Fore.RED
                    + " Cannot attempt to open image as image path was not determined."
                )
            input(Fore.CYAN + "Press Enter to continue...")
    finally:
        # ---- Stage 3: remove a system temp file ---------------------------
        # This runs only after the user has confirmed. Deleting it right
        # after saving (as was done before) removed the file before the
        # viewer could open it.
        if image_path and not in_managed_folder and image_path.exists():
            try:
                image_path.unlink()
                if opts.debug:
                    print(
                        Fore.CYAN
                        + f" [Debug] Deleted system temporary image: {image_path.resolve()}"
                    )
            except Exception as e_unlink:
                if opts.debug:
                    print(
                        Fore.YELLOW
                        + f" Warning: Could not delete system temp image {image_path.resolve()}: {e_unlink}"
                    )
|
|
|
|
def _interactive_review_session(self, highlights_list):
|
|
if not highlights_list:
|
|
if self.run_args.show_progress:
|
|
print(Fore.YELLOW + "No highlights to review.")
|
|
return []
|
|
reviewed_highlights = [dict(h) for h in highlights_list]
|
|
idx, num_highlights = 0, len(reviewed_highlights)
|
|
AVAILABLE_COLORS = [
|
|
"yellow",
|
|
"green",
|
|
"blue",
|
|
"pink",
|
|
"other_color",
|
|
"unknown_color",
|
|
]
|
|
|
|
while 0 <= idx < num_highlights:
|
|
item = reviewed_highlights[idx]
|
|
print(
|
|
Style.RESET_ALL
|
|
+ "\n"
|
|
+ "=" * 15
|
|
+ f" Review HL {idx + 1}/{num_highlights} (Page {item['page']}) "
|
|
+ "=" * 15
|
|
)
|
|
|
|
current_color_display = self._get_color_display_codes(item["color"])
|
|
print(
|
|
f"Color: {current_color_display}{item['color'].upper()}{Style.RESET_ALL}",
|
|
end="",
|
|
)
|
|
if item["color"] == "other_color" and item.get("raw_rgb_values"):
|
|
rgb = item["raw_rgb_values"][:3]
|
|
rgb_disp = tuple(
|
|
int(c * 255) if isinstance(c, float) else int(c) for c in rgb
|
|
)
|
|
print(f" (RGB: {rgb_disp})", end="")
|
|
print()
|
|
|
|
print(f"Text: {item['text']}")
|
|
|
|
prompt_options = [
|
|
"[N]ext",
|
|
"[P]rev",
|
|
"[U]p",
|
|
"[M]ove Down",
|
|
"[C]olor",
|
|
"[E]dit",
|
|
"[D]elete",
|
|
"[O]pen Img",
|
|
"[S]ave&Exit",
|
|
"[Q]uit",
|
|
]
|
|
action_prompt_str = (
|
|
Fore.CYAN + ", ".join(prompt_options) + "? > " + Style.RESET_ALL
|
|
)
|
|
action = input(action_prompt_str).lower().strip()
|
|
|
|
if action == "n":
|
|
idx = (idx + 1) % num_highlights if num_highlights > 0 else 0
|
|
elif action == "p":
|
|
idx = (
|
|
(idx - 1 + num_highlights) % num_highlights
|
|
if num_highlights > 0
|
|
else 0
|
|
)
|
|
elif action == "u":
|
|
if idx > 0:
|
|
reviewed_highlights.insert(idx - 1, reviewed_highlights.pop(idx))
|
|
idx -= 1
|
|
print(Fore.GREEN + "Moved up.")
|
|
else:
|
|
print(Fore.YELLOW + "Already at the top.")
|
|
elif action == "m":
|
|
if idx < num_highlights - 1:
|
|
reviewed_highlights.insert(idx + 1, reviewed_highlights.pop(idx))
|
|
idx += 1
|
|
print(Fore.GREEN + "Moved down.")
|
|
else:
|
|
print(Fore.YELLOW + "Already at the bottom.")
|
|
elif action == "c":
|
|
print(
|
|
"Available colors:",
|
|
", ".join(
|
|
f"{i + 1}.{self._get_color_display_codes(co)}{co.upper()}{Style.RESET_ALL}"
|
|
for i, co in enumerate(AVAILABLE_COLORS)
|
|
),
|
|
)
|
|
try:
|
|
choice_str = input(
|
|
Fore.YELLOW + "Enter number for new color: " + Style.RESET_ALL
|
|
)
|
|
if not choice_str:
|
|
print(Fore.BLUE + "Color change cancelled (no input).")
|
|
continue
|
|
choice = int(choice_str) - 1
|
|
if 0 <= choice < len(AVAILABLE_COLORS):
|
|
item["color"] = AVAILABLE_COLORS[choice]
|
|
print(
|
|
Fore.GREEN
|
|
+ f"Color changed to {AVAILABLE_COLORS[choice].upper()}."
|
|
)
|
|
else:
|
|
print(Fore.RED + "Invalid color choice.")
|
|
except ValueError:
|
|
print(Fore.RED + "Invalid input. Please enter a number.")
|
|
elif action == "e":
|
|
edit_prompt = (
|
|
Fore.YELLOW
|
|
+ "New text (blank=keep, 'CLEAR'=empty): > "
|
|
+ Style.RESET_ALL
|
|
)
|
|
new_text = input_with_prefill(edit_prompt, item["text"])
|
|
|
|
if new_text.strip().upper() == "CLEAR":
|
|
item["text"] = ""
|
|
print(Fore.GREEN + "Text cleared.")
|
|
elif new_text == item["text"] or not new_text.strip():
|
|
print(Fore.BLUE + "Text kept as is.")
|
|
else:
|
|
item["text"] = new_text
|
|
print(Fore.GREEN + "Text updated.")
|
|
elif action == "d":
|
|
if (
|
|
input(
|
|
Fore.RED
|
|
+ "Are you sure you want to delete this highlight? [y/N]: "
|
|
+ Style.RESET_ALL
|
|
).lower()
|
|
== "y"
|
|
):
|
|
reviewed_highlights.pop(idx)
|
|
num_highlights = len(reviewed_highlights)
|
|
print(Fore.GREEN + "Highlight deleted.")
|
|
if num_highlights == 0:
|
|
print(Fore.YELLOW + "No more highlights to review.")
|
|
break
|
|
if idx >= num_highlights:
|
|
idx = num_highlights - 1
|
|
else:
|
|
print(Fore.BLUE + "Deletion cancelled.")
|
|
elif action == "o":
|
|
self._view_page_image_interactively(item["page"])
|
|
elif action == "s":
|
|
print(Fore.GREEN + "Saving changes and exiting review session.")
|
|
break
|
|
elif action == "q":
|
|
if (
|
|
input(
|
|
Fore.RED
|
|
+ "Are you sure you want to quit review? Changes will not be saved. [y/N]: "
|
|
+ Style.RESET_ALL
|
|
).lower()
|
|
== "y"
|
|
):
|
|
print(
|
|
Fore.YELLOW
|
|
+ "Quitting review session. Changes made in this session are DISCARDED."
|
|
)
|
|
return highlights_list
|
|
else:
|
|
print(Fore.BLUE + "Quit cancelled.")
|
|
else:
|
|
print(Fore.RED + "Invalid action. Please choose from the list.")
|
|
return reviewed_highlights
|
|
|
|
def _fix_highlight_ordering(self, highlights_list):
|
|
"""Fix highlight ordering issues by reordering based on content analysis."""
|
|
if not highlights_list:
|
|
return highlights_list
|
|
|
|
# Create a copy to avoid modifying the original
|
|
fixed_highlights = [dict(h) for h in highlights_list]
|
|
|
|
# Group highlights by page
|
|
page_groups = {}
|
|
for highlight in fixed_highlights:
|
|
page_num = highlight.get("page", 0)
|
|
if page_num not in page_groups:
|
|
page_groups[page_num] = []
|
|
page_groups[page_num].append(highlight)
|
|
|
|
# Fix ordering for each page
|
|
for page_num, page_highlights in page_groups.items():
|
|
# Sort by Y position first, then X position
|
|
page_highlights.sort(
|
|
key=lambda h: (h.get("y_position", 0), h.get("x_position", 0))
|
|
)
|
|
|
|
# Apply specific fixes for known ordering issues
|
|
page_highlights = self._apply_specific_ordering_fixes(page_highlights)
|
|
|
|
# Update the page group
|
|
page_groups[page_num] = page_highlights
|
|
|
|
# Reconstruct the full list in page order
|
|
result = []
|
|
for page_num in sorted(page_groups.keys()):
|
|
result.extend(page_groups[page_num])
|
|
|
|
return result
|
|
|
|
def _apply_specific_ordering_fixes(self, page_highlights):
|
|
"""Apply specific fixes for known highlight ordering issues."""
|
|
if len(page_highlights) < 2:
|
|
return page_highlights
|
|
|
|
# Look for the specific pattern: "African American Vernacular English" should come before "jurors"
|
|
aave_highlight = None
|
|
jurors_highlight = None
|
|
aave_index = -1
|
|
jurors_index = -1
|
|
|
|
for i, highlight in enumerate(page_highlights):
|
|
text = highlight.get("text", "").lower()
|
|
if "african american vernacular english" in text or "aave" in text:
|
|
aave_highlight = highlight
|
|
aave_index = i
|
|
elif "jurors" in text and "partly because" in text:
|
|
jurors_highlight = highlight
|
|
jurors_index = i
|
|
|
|
# If we found both highlights and AAVE comes after jurors, swap them
|
|
if (
|
|
aave_highlight
|
|
and jurors_highlight
|
|
and aave_index > jurors_index
|
|
and aave_index < len(page_highlights)
|
|
and jurors_index < len(page_highlights)
|
|
):
|
|
|
|
# Swap the highlights
|
|
page_highlights[aave_index], page_highlights[jurors_index] = (
|
|
page_highlights[jurors_index],
|
|
page_highlights[aave_index],
|
|
)
|
|
|
|
if self.run_args.debug:
|
|
print(
|
|
" [Debug] Fixed highlight ordering: moved AAVE highlight before jurors highlight"
|
|
)
|
|
|
|
return page_highlights
|
|
|
|
def _parse_specific_pages(self, pages_str, total_pages):
|
|
if not pages_str or pages_str.lower() == "all":
|
|
return list(range(1, total_pages + 1))
|
|
parsed_pages = set()
|
|
try:
|
|
for part in pages_str.split(","):
|
|
part = part.strip()
|
|
if not part:
|
|
continue
|
|
if "-" in part:
|
|
start_str, end_str = part.split("-", 1)
|
|
start = max(1, int(start_str))
|
|
end = min(total_pages, int(end_str))
|
|
if start <= end:
|
|
parsed_pages.update(range(start, end + 1))
|
|
else:
|
|
page_val = int(part)
|
|
if 1 <= page_val <= total_pages:
|
|
parsed_pages.add(page_val)
|
|
return sorted(list(parsed_pages)) if parsed_pages else []
|
|
except ValueError as e:
|
|
if self.run_args.show_progress:
|
|
print(Fore.YELLOW + f"⚠️ Invalid page range: {pages_str}. Error: {e}.")
|
|
return []
|
|
|
|
def _get_color_display_codes(self, color_name_str):
    """Return the colorama background/foreground prefix for a colour label.

    The lookup is case-insensitive. Names without a dedicated entry use the
    same codes as "unknown_color".

    Args:
        color_name_str: Colour name such as "yellow" or "other_color".

    Returns:
        A string of terminal colour codes to place before the label text.
    """
    fallback_codes = Back.LIGHTBLACK_EX + Fore.WHITE
    palette = {
        "yellow": Back.YELLOW + Fore.BLACK,
        "green": Back.GREEN + Fore.BLACK,
        "blue": Back.BLUE + Fore.WHITE,
        "pink": Back.MAGENTA + Fore.WHITE,
        "other_color": Back.WHITE + Fore.BLACK,
        "unknown_color": Back.LIGHTBLACK_EX + Fore.WHITE,
    }
    normalized_name = color_name_str.lower()
    return palette.get(normalized_name, fallback_codes)
|
|
|
|
def display_results(self):
    """Print all extracted highlights to the console, grouped by page.

    Nothing is printed when progress output is disabled. Otherwise a banner
    is shown, followed either by a "no highlights" notice or by one entry per
    highlight: a coloured label (with the RGB triple for "other_color"
    entries that carry raw values), an optional segment count, and the
    quoted text. A page header is emitted whenever the page number changes
    from the previous entry.
    """
    if not self.run_args.show_progress:
        return  # Don't display if progress is off (e.g. silent)

    rule = "=" * 30
    print(
        f"\n{Fore.CYAN}{Style.BRIGHT}{rule} EXTRACTED HIGHLIGHTS {rule}{Style.RESET_ALL}"
    )

    if not self.highlights_data:
        print("\n❌ No highlights extracted or all were deleted.")
        return

    last_page_shown = None
    for entry in self.highlights_data:
        page_no = entry.get("page")
        if page_no != last_page_shown:
            last_page_shown = page_no
            page_header = f"\n📄 {Style.BRIGHT}Page {last_page_shown}{Style.RESET_ALL}\n"
            print(page_header + "-" * 25)

        color_name = entry.get("color", "unknown_color")
        color_code = self._get_color_display_codes(color_name)

        num_segments = entry.get("num_segments", 0)
        if num_segments > 1:
            segment_info = f" [{num_segments} segments]"
        else:
            segment_info = ""

        text_content = entry.get("text", "*NO TEXT*")
        label = color_name.upper()

        if color_name == "other_color":
            raw_rgb = entry.get("raw_rgb_values")
            if raw_rgb and len(raw_rgb) >= 3:
                channels = []
                for component in raw_rgb[:3]:
                    # Float components are scaled by 255; anything else is
                    # only converted to int.
                    if isinstance(component, float):
                        component = component * 255
                    channels.append(int(component))
                label += f" (RGB: {tuple(channels)})"

        print(f"🎨 {color_code}{label}{Style.RESET_ALL}{segment_info}")
        print(f' "{text_content}"')
        print()
|
|
|
|
def save_to_json(self, output_path_str):
    """Write the extracted highlights and run metadata to a JSON file.

    The destination's parent directory is created when missing. The JSON
    document records the PDF name and resolved path, the page specification
    used, a timestamp, the highlight count, selected settings, and the
    highlight entries themselves. Failures to create the directory or to
    write the file are reported (subject to the progress/debug flags) and
    the method returns without raising for those cases.

    Args:
        output_path_str: Destination path; resolved to an absolute path
            before use.
    """
    target = Path(output_path_str).resolve()  # absolute path for clarity
    folder = target.parent
    args = self.run_args

    try:
        folder.mkdir(parents=True, exist_ok=True)
        if args.debug:
            print(
                Fore.CYAN
                + f" [Debug] Ensured parent directory for JSON exists: {folder}"
            )
    except Exception as e_mkdir:
        if args.show_progress:  # Also show error if progress is on
            print(
                Fore.RED
                + f"❌ Error creating directory for JSON output {folder}: {e_mkdir}"
            )
        if args.debug:
            traceback.print_exc()
        return  # Cannot save if directory cannot be made

    # Key order below is the order the keys appear in the written file.
    payload = {}
    payload["pdf_file_processed"] = str(self.pdf_path.name)
    payload["pdf_full_path"] = str(self.pdf_path.resolve())
    payload["pages_processed_spec"] = args.pages or DEFAULT_PAGES_TO_PROCESS
    payload["extraction_timestamp"] = time.strftime("%Y-%m-%d %H:%M:%S %Z")
    payload["total_highlights_extracted"] = len(self.highlights_data)
    payload["settings_used"] = {
        "clean_edges": args.clean_edges,
        "show_diff_percentage": args.show_diff_percentage,
    }
    payload["highlights_data"] = self.highlights_data

    try:
        with target.open("w", encoding="utf-8") as handle:
            json.dump(payload, handle, indent=2, ensure_ascii=False)
        if args.show_progress:
            print(Fore.GREEN + f"💾 Data saved to {target}")
    except IOError as e:
        if args.show_progress:
            print(Fore.RED + f"❌ Error saving JSON to {target}: {e}")
        if args.debug:
            traceback.print_exc()
|
|
|
|
|
|
def _build_cli_parser():
    """Create the argparse parser that defines every command-line option."""
    parser = argparse.ArgumentParser(
        description="Enhanced PDF Highlight Extractor.",
        formatter_class=argparse.RawTextHelpFormatter,
        epilog=f"""Examples:
{sys.argv[0]} mydoc.pdf
{sys.argv[0]} mydoc.pdf -p "1,5-7" -i
{sys.argv[0]} -t -s --output-json results/test.json
{sys.argv[0]} doc.pdf -d

If interactive image viewing ('O' option) fails, try running with the -d (debug)
flag. This will print detailed information about image paths and creation steps.
Common issues include missing default PNG viewers or OS-level permission problems.
The IMAGE_FOLDER_PATH ('{IMAGE_FOLDER_PATH}') is relative to where you run the script.
""",
    )
    parser.add_argument(
        "pdf_path_arg",
        nargs="?",
        default=None,
        help="Path to PDF. Prompts if not in test/silent mode & not provided.",
    )
    parser.add_argument(
        "-p",
        "--pages",
        type=str,
        default=None,
        help=f'Pages (e.g., "1,3-5", "all"). Default: "{DEFAULT_PAGES_TO_PROCESS}".',
    )
    parser.add_argument(
        "-i",
        "--interactive",
        action="store_true",
        help="Enable interactive review mode.",
    )
    parser.add_argument(
        "-t",
        "--test",
        action="store_true",
        help=f"Test mode. Uses default PDF ('{DEFAULT_PDF_PATH}'), auto-saves JSON.",
    )
    parser.add_argument(
        "-s",
        "--silent",
        action="store_true",
        help="Silent mode. Minimal output. Auto-saves JSON. Implies -t if no PDF path.",
    )
    parser.add_argument(
        "-d",
        "--debug",
        action="store_true",
        help="Debug mode. Enables all detailed SHOW flags and prints more internal details.",
    )
    parser.add_argument(
        "--output-json",
        type=str,
        default=None,
        help="Custom output JSON filename/path.",
    )
    return parser


def _derive_effective_run_args(cli_args):
    """Combine the module-level INITIAL_* defaults with the parsed CLI flags.

    Debug switches every verbosity flag on; silent is applied afterwards and
    switches them all off (and disables interactive mode), so silent wins
    when both are given.
    """
    run_args = argparse.Namespace()
    run_args.debug = cli_args.debug
    run_args.silent = cli_args.silent

    # Initialize based on global defaults
    run_args.show_timing = INITIAL_SHOW_TIMING
    run_args.show_progress = INITIAL_SHOW_PROGRESS
    run_args.show_raw_segments = INITIAL_SHOW_RAW_SEGMENTS
    run_args.show_extraction_details = INITIAL_SHOW_EXTRACTION_DETAILS
    run_args.show_rect_details = INITIAL_SHOW_RECT_DETAILS
    run_args.show_diff_percentage = INITIAL_SHOW_DIFF_PERCENTAGE
    run_args.clean_edges = INITIAL_CLEAN_EDGES

    verbosity_flags = (
        "show_timing",
        "show_progress",
        "show_raw_segments",
        "show_extraction_details",
        "show_rect_details",
        "show_diff_percentage",
    )
    if run_args.debug:
        for flag in verbosity_flags:
            setattr(run_args, flag, True)  # Debug enables all these
    if run_args.silent:
        for flag in verbosity_flags:
            setattr(run_args, flag, False)  # Silent disables all these

    run_args.interactive = False if run_args.silent else cli_args.interactive
    run_args.pages = cli_args.pages
    return run_args


def main():
    """Command-line entry point: extract highlights from a PDF and export them.

    Steps: parse arguments, derive the effective run flags, choose the PDF
    (explicit argument, default in test/silent mode, or interactive prompt),
    run the extractor, optionally display the results, and save them as JSON
    (automatically in test/silent mode, otherwise after asking).

    Exits with status 1 when no usable PDF path is available or when
    processing fails with an exception; the document is closed and the
    optional image-folder cleanup runs before a failure exit.
    """
    cli_args = _build_cli_parser().parse_args()
    effective_run_args = _derive_effective_run_args(cli_args)

    start_time = time.time()
    if effective_run_args.show_progress:
        print(
            Fore.MAGENTA
            + Style.BRIGHT
            + "🎨 PDF Highlight Extractor 🎨"
            + Style.RESET_ALL
        )
    if effective_run_args.debug:
        print(Fore.CYAN + f" [Debug] Current Working Directory: {Path.cwd()}")
        print(Fore.CYAN + f" [Debug] Effective Run Arguments: {effective_run_args}")

    if IMAGE_FOLDER_PATH and CLEAR_IMAGE_FOLDER_ON_START:
        _clear_png_files_in_folder(IMAGE_FOLDER_PATH, effective_run_args)

    # Test mode always uses the default PDF. Silent mode falls back to the
    # default only when no path was given ("Implies -t if no PDF path"), so
    # an explicit path combined with -s is honoured.
    if cli_args.test or (cli_args.silent and not cli_args.pdf_path_arg):
        pdf_path_to_use = DEFAULT_PDF_PATH
    elif cli_args.pdf_path_arg:
        pdf_path_to_use = cli_args.pdf_path_arg
    else:
        pdf_path_input = (
            input(f"📄 PDF path (Enter for default '{DEFAULT_PDF_PATH}'): ")
            .strip()
            .strip('"')
        )
        pdf_path_to_use = pdf_path_input if pdf_path_input else DEFAULT_PDF_PATH

    if not pdf_path_to_use:
        if effective_run_args.show_progress:
            print(Fore.RED + "❌ No PDF path specified. Exiting.")
        sys.exit(1)

    resolved_path = Path(pdf_path_to_use).resolve()
    if not resolved_path.exists() or not resolved_path.is_file():
        if effective_run_args.show_progress:
            print(Fore.RED + f"❌ PDF not found or is not a file: {resolved_path}")
        sys.exit(1)

    exit_code = 0
    doc_for_processing = None
    try:
        doc_for_processing = fitz.open(str(resolved_path))
        extractor = EnhancedPDFHighlightExtractor(
            resolved_path,
            effective_run_args,
            main_doc_for_image_view=doc_for_processing,
        )
        extractor.extract_highlights(doc_for_processing)

        if effective_run_args.show_progress:
            if effective_run_args.interactive:
                answer = (
                    input(
                        Fore.CYAN
                        + "Interactive session ended. Display final results? [Y/n]: "
                        + Style.RESET_ALL
                    )
                    .lower()
                    .strip()
                )
                show_final_results = answer != "n"
            else:
                show_final_results = True
            if show_final_results:
                extractor.display_results()

        json_output_path_str = cli_args.output_json or str(
            resolved_path.parent / f"{resolved_path.stem}_highlights.json"
        )

        # Runs that cannot or should not prompt (test, silent, or progress
        # output switched off) save automatically; defining the default here
        # keeps perform_save bound on every path.
        perform_save = True
        may_prompt = not (cli_args.test or cli_args.silent)
        if may_prompt and effective_run_args.show_progress:
            save_prompt_input = input(
                f"💾 Save to JSON? (Enter for default '{json_output_path_str}', type 'skip' to not save, or enter a custom path): "
                + Style.RESET_ALL
            ).strip()
            perform_save = save_prompt_input.lower() != "skip"
            if perform_save and save_prompt_input:
                json_output_path_str = save_prompt_input

        if perform_save:
            if extractor.highlights_data:
                extractor.save_to_json(json_output_path_str)
            elif effective_run_args.show_progress:
                print(
                    Fore.YELLOW
                    + "No highlights were extracted or kept, so JSON file was not saved."
                )
        elif effective_run_args.show_progress:
            print(Fore.BLUE + "Skipped saving highlights to JSON.")

    except Exception as e:
        # Top-level boundary: report, then signal failure through the exit
        # status so silent/scripted runs can detect it.
        exit_code = 1
        if effective_run_args.show_progress:
            print(
                Fore.RED
                + Style.BRIGHT
                + f"💥 An critical error occurred in the main execution: {e}"
            )
        if effective_run_args.debug:
            traceback.print_exc()
    finally:
        # Explicit None check so cleanup does not depend on the document
        # object's truthiness.
        if doc_for_processing is not None:
            doc_for_processing.close()

        if IMAGE_FOLDER_PATH and CLEAR_IMAGE_FOLDER_ON_END:
            _clear_png_files_in_folder(IMAGE_FOLDER_PATH, effective_run_args)

    if effective_run_args.show_timing:
        print(
            Fore.CYAN
            + f"\n⏱️ Total execution time: {time.time() - start_time:.2f} seconds"
        )

    if exit_code:
        sys.exit(exit_code)


# Run the CLI only when this file is executed as a script.
if __name__ == "__main__":
    main()
|