710 lines
25 KiB
Python
710 lines
25 KiB
Python
"""
|
|
PDF Highlight Extractor
|
|
======================
|
|
|
|
A robust tool for extracting highlighted text from PDF files with intelligent text ordering
|
|
and hyphenation handling.
|
|
|
|
Overview:
|
|
--------
|
|
This tool addresses common PDF text extraction challenges:
|
|
- PDFs store text in creation order, not reading order
|
|
- Multi-line highlights can extract in wrong sequence
|
|
- Hyphenated words across lines need rejoining
|
|
- Boundary words may be partially highlighted
|
|
|
|
Architecture:
|
|
------------
|
|
1. PDFHighlightExtractor: Main class handling extraction logic
|
|
2. Multi-method extraction: Fallback system for maximum compatibility
|
|
3. Smart text ordering: Line detection and geometric sorting
|
|
4. Hyphenation merger: Detects and combines split words
|
|
|
|
Technical Approach:
|
|
-----------------
|
|
METHOD A: PyMuPDF built-in text sorting
|
|
- Uses page.get_text("text", sort=True) for automatic ordering
|
|
- Most reliable for simple layouts
|
|
|
|
METHOD B: Text block extraction
|
|
- Extracts PDF text blocks which maintain better reading order
|
|
- Geometric sorting by block position
|
|
|
|
METHOD C: Enhanced word-level sorting
|
|
- Individual word extraction with custom line detection
|
|
- Groups words by Y-position, sorts by X-position within lines
|
|
- Handles complex multi-line highlights
|
|
|
|
Hyphenation Algorithm:
|
|
--------------------
|
|
1. Detects highlights ending with '-'
|
|
2. Checks next highlight for same color and reasonable distance
|
|
3. Merges: "lin-" + "guistics" → "linguistics"
|
|
4. Supports both same-page and cross-page hyphenation
|
|
|
|
Color Detection:
|
|
---------------
|
|
- RGB color space analysis
|
|
- Supports 4 highlight colors: Yellow, Pink, Green, Blue
|
|
- Handles both fill and stroke color properties
|
|
|
|
Precision Control:
|
|
-----------------
|
|
- 40% overlap threshold for word inclusion
|
|
- +2 pixel boundary expansion for edge cases
|
|
- 5-pixel line tolerance for multi-line detection
|
|
|
|
Usage Patterns:
|
|
--------------
|
|
Test Mode: python script.py --test
|
|
- Uses default PDF path
|
|
- Display-only output
|
|
- Quick testing and debugging
|
|
|
|
Full Mode: python script.py
|
|
- Interactive prompts for file paths
|
|
- Optional JSON/CSV export
|
|
- Complete control over options
|
|
"""
|
|
import time
|
|
import pdfplumber
|
|
import fitz # PyMuPDF
|
|
import json
|
|
from colorama import init, Fore, Back, Style
|
|
import pandas as pd
|
|
from pathlib import Path
|
|
import re
|
|
import sys
|
|
|
|
# Initialize colorama for colored terminal output
|
|
init(autoreset=True)
|
|
|
|
class PDFHighlightExtractor:
|
|
"""
|
|
Main extraction class for PDF highlighted text.
|
|
|
|
This class handles the complete extraction pipeline from PDF analysis
|
|
to formatted output with intelligent text ordering and hyphenation.
|
|
|
|
Key Features:
|
|
------------
|
|
- Multi-method text extraction with fallback
|
|
- Geometric text ordering for proper reading sequence
|
|
- Hyphenation detection and merging
|
|
- 4-color highlight support (Yellow, Pink, Green, Blue)
|
|
- Cross-page highlight handling
|
|
|
|
Extraction Pipeline:
|
|
------------------
|
|
1. PDF Loading: Opens PDF with PyMuPDF
|
|
2. Annotation Detection: Finds highlight annotations
|
|
3. Color Classification: Identifies highlight colors
|
|
4. Text Extraction: Uses multi-method approach
|
|
5. Text Ordering: Applies geometric sorting
|
|
6. Hyphenation Merging: Combines split words
|
|
7. Output Formatting: Prepares results for display/export
|
|
|
|
Methods Overview:
|
|
---------------
|
|
extract_all_highlights(): Main entry point
|
|
_extract_text_balanced(): Core text extraction with ordering
|
|
_smart_hyphenation_merge(): Hyphenation detection and merging
|
|
_is_clear_hyphenation(): Hyphenation pattern recognition
|
|
display_results(): Formatted terminal output
|
|
|
|
Usage:
|
|
------
|
|
extractor = PDFHighlightExtractor('path/to/file.pdf')
|
|
annotations, highlights = extractor.extract_all_highlights()
|
|
extractor.display_results()
|
|
"""
|
|
def __init__(self, pdf_path):
|
|
self.pdf_path = Path(pdf_path)
|
|
self.annotations = []
|
|
self.highlights = []
|
|
|
|
def extract_annotation_highlights(self):
|
|
"""Extract annotations with simple processing."""
|
|
annotations = []
|
|
try:
|
|
with pdfplumber.open(self.pdf_path) as pdf:
|
|
print(f"📄 Processing annotations...")
|
|
for page_num, page in enumerate(pdf.pages, 1):
|
|
if hasattr(page, 'annots') and page.annots:
|
|
for annot in page.annots:
|
|
try:
|
|
annot_type = annot.get('subtype', 'Unknown')
|
|
if annot_type in ['Highlight', 'Squiggly', 'StrikeOut', 'Underline', 'FreeText', 'Text']:
|
|
rect = annot.get('rect', [])
|
|
text = self._get_annotation_text(page, annot, rect)
|
|
color = self._get_simple_color(annot.get('color', []))
|
|
|
|
if text and text.strip():
|
|
annotations.append({
|
|
'page': page_num,
|
|
'text': text.strip(),
|
|
'color': color,
|
|
'type': 'annotation',
|
|
'y_position': rect[1] if len(rect) >= 4 else 0
|
|
})
|
|
except:
|
|
continue
|
|
|
|
print(f" ✅ Found {len(annotations)} annotations")
|
|
except Exception as e:
|
|
print(f"❌ Error: {e}")
|
|
|
|
return annotations
|
|
|
|
def extract_background_highlights(self):
|
|
"""Extract highlights with BALANCED precision - capture complete highlights."""
|
|
all_highlights = []
|
|
|
|
try:
|
|
print(f"\n🎨 Processing highlights...")
|
|
doc = fitz.open(str(self.pdf_path))
|
|
|
|
# Collect each individual highlight with BALANCED extraction
|
|
for page_num in range(doc.page_count):
|
|
page = doc[page_num]
|
|
annotations = page.annots()
|
|
|
|
for annot in annotations:
|
|
try:
|
|
if annot.type[1] == 'Highlight':
|
|
colors = annot.colors
|
|
color_name = self._get_highlight_color(colors)
|
|
|
|
if color_name in ['yellow', 'pink', 'green', 'blue']:
|
|
# BALANCED: Extract complete highlighted phrases
|
|
text = self._extract_text_balanced(page, annot)
|
|
|
|
if text and text.strip():
|
|
all_highlights.append({
|
|
'page': page_num + 1,
|
|
'text': text.strip(),
|
|
'color': color_name,
|
|
'type': 'highlight',
|
|
'y_position': annot.rect.y0,
|
|
'x_position': annot.rect.x0,
|
|
'y_end': annot.rect.y1,
|
|
'x_end': annot.rect.x1,
|
|
'rect': annot.rect
|
|
})
|
|
print(f" 🎨 {color_name.upper()}: \"{text[:70]}...\"")
|
|
except Exception as e:
|
|
continue
|
|
|
|
doc.close()
|
|
|
|
# Smart hyphenation merging only
|
|
merged_highlights = self._smart_hyphenation_merge(all_highlights)
|
|
|
|
print(f" 📊 Raw: {len(all_highlights)} → Merged: {len(merged_highlights)}")
|
|
return merged_highlights
|
|
|
|
except Exception as e:
|
|
print(f"❌ Error: {e}")
|
|
return []
|
|
|
|
def _extract_text_balanced(self, page, annot):
|
|
"""BALANCED: Extract text with PROPER READING ORDER."""
|
|
try:
|
|
# Method 1: Use PyMuPDF's built-in text ordering with sorting
|
|
highlight_rect = annot.rect
|
|
|
|
# SMALL EXPANSION for boundary words
|
|
expanded_rect = fitz.Rect(
|
|
highlight_rect.x0 - 2,
|
|
highlight_rect.y0 - 1,
|
|
highlight_rect.x1 + 2,
|
|
highlight_rect.y1 + 1
|
|
)
|
|
|
|
# METHOD A: Use text extraction with BUILT-IN SORTING
|
|
print(f" 🔍 Method A: Text extraction with sorting")
|
|
text_with_sort = page.get_text("text", clip=expanded_rect, sort=True)
|
|
if text_with_sort and text_with_sort.strip():
|
|
cleaned_text = re.sub(r'\s+', ' ', text_with_sort.strip())
|
|
print(f" ✅ Sorted text result: \"{cleaned_text}\"")
|
|
return cleaned_text
|
|
|
|
# METHOD B: Text blocks (better reading order than individual words)
|
|
print(f" 🔍 Method B: Text blocks extraction")
|
|
text_blocks = page.get_text("blocks", clip=expanded_rect)
|
|
if text_blocks:
|
|
# Sort blocks by reading order (top to bottom, left to right)
|
|
text_blocks.sort(key=lambda block: (block[1], block[0])) # y-pos, then x-pos
|
|
|
|
block_texts = []
|
|
for block in text_blocks:
|
|
if len(block) >= 5 and block[4].strip():
|
|
block_text = block[4].strip()
|
|
block_text = re.sub(r'\s+', ' ', block_text)
|
|
block_texts.append(block_text)
|
|
|
|
if block_texts:
|
|
combined_text = " ".join(block_texts)
|
|
print(f" ✅ Block result: \"{combined_text}\"")
|
|
return combined_text
|
|
|
|
# METHOD C: Enhanced word-level with geometric sorting
|
|
print(f" 🔍 Method C: Enhanced word sorting")
|
|
all_words = page.get_text("words")
|
|
highlight_words = []
|
|
|
|
for word in all_words:
|
|
word_rect = fitz.Rect(word[:4])
|
|
word_text = word[4]
|
|
|
|
if expanded_rect.intersects(word_rect):
|
|
intersection = expanded_rect & word_rect
|
|
word_area = word_rect.get_area()
|
|
|
|
if word_area > 0:
|
|
overlap_ratio = intersection.get_area() / word_area
|
|
|
|
if overlap_ratio >= 0.40:
|
|
highlight_words.append({
|
|
'text': word_text,
|
|
'x0': word[0],
|
|
'y0': word[1],
|
|
'x1': word[2],
|
|
'y1': word[3],
|
|
'center_y': (word[1] + word[3]) / 2,
|
|
'center_x': (word[0] + word[2]) / 2
|
|
})
|
|
|
|
if highlight_words:
|
|
# ENHANCED SORTING: Group by lines first, then sort within lines
|
|
# Group words by approximate line (within 5 pixels of each other)
|
|
lines = []
|
|
for word in highlight_words:
|
|
placed = False
|
|
for line in lines:
|
|
# Check if word belongs to existing line
|
|
avg_y = sum(w['center_y'] for w in line) / len(line)
|
|
if abs(word['center_y'] - avg_y) <= 5: # Same line tolerance
|
|
line.append(word)
|
|
placed = True
|
|
break
|
|
|
|
if not placed:
|
|
lines.append([word])
|
|
|
|
# Sort lines by Y position (top to bottom)
|
|
lines.sort(key=lambda line: sum(w['center_y'] for w in line) / len(line))
|
|
|
|
# Sort words within each line by X position (left to right)
|
|
for line in lines:
|
|
line.sort(key=lambda w: w['center_x'])
|
|
|
|
# Combine all words in reading order
|
|
ordered_words = []
|
|
for line in lines:
|
|
ordered_words.extend(line)
|
|
|
|
extracted_text = " ".join([w['text'] for w in ordered_words])
|
|
print(f" ✅ Enhanced word sorting ({len(ordered_words)} words): \"{extracted_text}\"")
|
|
return extracted_text
|
|
|
|
print(f" ❌ No text found in highlight area")
|
|
return ""
|
|
|
|
except Exception as e:
|
|
print(f" ❌ Extraction error: {e}")
|
|
return ""
|
|
|
|
|
|
def _extract_by_quads_balanced(self, page, annot):
|
|
"""Extract using quad points with BALANCED precision."""
|
|
try:
|
|
quad_points = annot.vertices
|
|
if not quad_points:
|
|
return ""
|
|
|
|
quad_count = int(len(quad_points) / 4)
|
|
all_words = page.get_text("words")
|
|
highlight_words = []
|
|
|
|
print(f" 🔍 Processing {quad_count} quads with balanced precision")
|
|
|
|
for i in range(quad_count):
|
|
points = quad_points[i * 4: i * 4 + 4]
|
|
quad_rect = fitz.Quad(points).rect
|
|
|
|
# SMALL EXPANSION - 2 pixels to catch boundary words
|
|
expanded_quad = fitz.Rect(
|
|
quad_rect.x0 - 2, quad_rect.y0 - 1,
|
|
quad_rect.x1 + 2, quad_rect.y1 + 1
|
|
)
|
|
|
|
for word in all_words:
|
|
word_rect = fitz.Rect(word[:4])
|
|
word_text = word[4]
|
|
|
|
if expanded_quad.intersects(word_rect):
|
|
intersection = expanded_quad & word_rect
|
|
word_area = word_rect.get_area()
|
|
|
|
if word_area > 0:
|
|
overlap_ratio = intersection.get_area() / word_area
|
|
|
|
# RELAXED: 40% overlap required (was 75%)
|
|
if overlap_ratio >= 0.40:
|
|
highlight_words.append({
|
|
'text': word_text,
|
|
'x0': word[0],
|
|
'y0': word[1],
|
|
'line': self._estimate_line_number(word[1])
|
|
})
|
|
print(f" ✓ Quad '{word_text}' (overlap: {overlap_ratio:.2f})")
|
|
|
|
if highlight_words:
|
|
# Remove duplicates while preserving order
|
|
seen = set()
|
|
unique_words = []
|
|
for word in highlight_words:
|
|
word_key = (word['text'], word['x0'], word['y0'])
|
|
if word_key not in seen:
|
|
seen.add(word_key)
|
|
unique_words.append(word)
|
|
|
|
# Sort by reading order
|
|
unique_words.sort(key=lambda w: (w['line'], w['x0']))
|
|
extracted_text = " ".join([w['text'] for w in unique_words])
|
|
print(f" ✅ Quad balanced ({len(unique_words)} words): \"{extracted_text}\"")
|
|
return extracted_text
|
|
|
|
return ""
|
|
|
|
except Exception as e:
|
|
print(f" ❌ Quad extraction error: {e}")
|
|
return ""
|
|
|
|
def _estimate_line_number(self, y_position, avg_line_height=14):
|
|
"""Estimate line number based on y-position."""
|
|
return round(y_position / avg_line_height)
|
|
|
|
def _smart_hyphenation_merge(self, highlights):
|
|
"""Smart merging - ONLY for clear hyphenation patterns."""
|
|
if not highlights:
|
|
return highlights
|
|
|
|
# Sort by page, color, then position
|
|
highlights.sort(key=lambda x: (x['page'], x['color'], x['y_position'], x['x_position']))
|
|
|
|
merged = []
|
|
i = 0
|
|
|
|
while i < len(highlights):
|
|
current = highlights[i]
|
|
|
|
# Look for hyphenation continuation
|
|
if (i + 1 < len(highlights) and
|
|
self._is_clear_hyphenation(current, highlights[i + 1])):
|
|
|
|
next_hl = highlights[i + 1]
|
|
merged_text = self._join_hyphenated_text(current['text'], next_hl['text'])
|
|
|
|
merged_highlight = current.copy()
|
|
merged_highlight['text'] = merged_text
|
|
|
|
if current['page'] != next_hl['page']:
|
|
merged_highlight['pages_spanned'] = f"Pages {current['page']}-{next_hl['page']}"
|
|
print(f" 🔗 Cross-page hyphen: \"{merged_text[:80]}\"")
|
|
else:
|
|
merged_highlight['hyphen_merged'] = True
|
|
print(f" 🔗 Same-page hyphen: \"{merged_text[:80]}\"")
|
|
|
|
merged.append(merged_highlight)
|
|
i += 2 # Skip both highlights
|
|
else:
|
|
merged.append(current)
|
|
i += 1
|
|
|
|
return merged
|
|
|
|
def _is_clear_hyphenation(self, hl1, hl2):
|
|
"""Detect ONLY clear hyphenation patterns."""
|
|
# Must be same color
|
|
if hl1['color'] != hl2['color']:
|
|
return False
|
|
|
|
text1 = hl1['text'].strip()
|
|
text2 = hl2['text'].strip()
|
|
|
|
# MUST end with hyphen for hyphenation
|
|
if not text1.endswith('-'):
|
|
return False
|
|
|
|
# Same page: check reasonable line spacing
|
|
if hl1['page'] == hl2['page']:
|
|
y_diff = abs(hl1['y_position'] - hl2['y_position'])
|
|
# Reasonable line height (8-30 pixels) - slightly more lenient
|
|
if 8 <= y_diff <= 30 and hl2['y_position'] > hl1['y_position']:
|
|
print(f" 🔍 Same-page hyphen detected: '{text1}' + '{text2[:15]}'")
|
|
return True
|
|
|
|
# Cross-page: second highlight should be near top
|
|
elif hl2['page'] == hl1['page'] + 1 and hl2['y_position'] < 150:
|
|
print(f" 🔍 Cross-page hyphen detected: '{text1}' + '{text2[:15]}'")
|
|
return True
|
|
|
|
return False
|
|
|
|
def _join_hyphenated_text(self, text1, text2):
|
|
"""Join hyphenated text correctly."""
|
|
text1 = text1.strip()
|
|
text2 = text2.strip()
|
|
|
|
if text1.endswith('-'):
|
|
# Remove hyphen and join
|
|
return text1[:-1] + text2
|
|
else:
|
|
return text1 + " " + text2
|
|
|
|
def _get_highlight_color(self, colors):
|
|
"""Get highlight color - only 4 colors."""
|
|
if not colors:
|
|
return 'unknown'
|
|
|
|
if 'fill' in colors and colors['fill']:
|
|
rgb = colors['fill']
|
|
elif 'stroke' in colors and colors['stroke']:
|
|
rgb = colors['stroke']
|
|
else:
|
|
return 'unknown'
|
|
|
|
return self._rgb_to_simple_color(rgb)
|
|
def _rgb_to_simple_color(self, rgb):
|
|
"""Convert RGB to one of 4 colors."""
|
|
if not rgb or len(rgb) < 3:
|
|
return 'unknown'
|
|
|
|
r, g, b = rgb[:3]
|
|
|
|
if r <= 1:
|
|
r, g, b = r*255, g*255, b*255
|
|
|
|
if r > 220 and g > 220 and b < 120:
|
|
return 'yellow'
|
|
elif r < 120 and g > 180 and b < 120:
|
|
return 'green'
|
|
elif r < 120 and g < 180 and b > 180:
|
|
return 'blue'
|
|
elif r > 180 and g < 180 and b > 180:
|
|
return 'pink'
|
|
else:
|
|
max_val = max(r, g, b)
|
|
if max_val == r and r > 150:
|
|
return 'pink'
|
|
elif max_val == g and g > 150:
|
|
return 'green'
|
|
elif max_val == b and b > 150:
|
|
return 'blue'
|
|
elif r > 180 and g > 180:
|
|
return 'yellow'
|
|
return 'unknown'
|
|
|
|
def _get_simple_color(self, color_rgb):
|
|
"""Get simple color from annotation."""
|
|
if color_rgb:
|
|
return self._rgb_to_simple_color(color_rgb)
|
|
return 'unknown'
|
|
|
|
def _get_annotation_text(self, page, annot, rect):
|
|
"""Extract annotation text."""
|
|
text = annot.get('contents', '').strip()
|
|
if text:
|
|
return text
|
|
|
|
if rect and len(rect) == 4:
|
|
try:
|
|
x0, y0, x1, y1 = rect
|
|
cropped = page.crop((x0-1, y0-1, x1+1, y1+1))
|
|
text = cropped.extract_text()
|
|
if text and text.strip():
|
|
return text.strip()
|
|
except:
|
|
pass
|
|
|
|
return ""
|
|
|
|
def extract_all_highlights(self):
|
|
"""Main extraction method."""
|
|
print("🔍 PDF Highlight Extractor - BALANCED PRECISION")
|
|
print("🎯 Colors: Yellow, Pink, Green, Blue only")
|
|
print("🎯 BALANCED extraction - complete highlights without over-capture")
|
|
print("📏 Small expansion (+2 pixels) for boundary words")
|
|
print("🔍 40% overlap requirement (was 75% - more inclusive)")
|
|
print("🔗 Smart hyphenation merging")
|
|
print("=" * 70)
|
|
|
|
self.annotations = self.extract_annotation_highlights()
|
|
self.highlights = self.extract_background_highlights()
|
|
|
|
print(f"\n✨ Total: {len(self.annotations)} annotations, {len(self.highlights)} highlights")
|
|
return self.annotations, self.highlights
|
|
|
|
def display_results(self):
|
|
"""Display results cleanly."""
|
|
print("\n" + "="*70)
|
|
print("📋 EXTRACTION RESULTS")
|
|
print("="*70)
|
|
|
|
all_items = []
|
|
for item in self.annotations:
|
|
item['category'] = 'annotation'
|
|
all_items.append(item)
|
|
for item in self.highlights:
|
|
item['category'] = 'highlight'
|
|
all_items.append(item)
|
|
|
|
if not all_items:
|
|
print("\n❌ No highlights found")
|
|
return
|
|
|
|
all_items.sort(key=lambda x: (x['page'], x['y_position']))
|
|
|
|
current_page = None
|
|
for item in all_items:
|
|
if item['page'] != current_page:
|
|
current_page = item['page']
|
|
print(f"\n📄 Page {current_page}")
|
|
print("-" * 25)
|
|
|
|
color_code = self._get_color_display(item['color'])
|
|
icon = "📝" if item['category'] == 'annotation' else "🎨"
|
|
|
|
merge_info = ""
|
|
if item.get('pages_spanned'):
|
|
merge_info = f" ({item['pages_spanned']})"
|
|
elif item.get('hyphen_merged'):
|
|
merge_info = " (hyphen-merged)"
|
|
|
|
print(f"{icon} {color_code}{item['color'].upper()}{Style.RESET_ALL}{merge_info}")
|
|
print(f" \"{item['text']}\"")
|
|
|
|
def _get_color_display(self, color_name):
|
|
"""Terminal color codes."""
|
|
colors = {
|
|
'yellow': Back.YELLOW + Fore.BLACK,
|
|
'green': Back.GREEN + Fore.BLACK,
|
|
'blue': Back.BLUE + Fore.WHITE,
|
|
'pink': Back.MAGENTA + Fore.WHITE,
|
|
}
|
|
return colors.get(color_name, Back.WHITE + Fore.BLACK)
|
|
|
|
def save_to_json(self, annotations, highlights, output_path):
|
|
"""Save to JSON."""
|
|
data = {
|
|
'annotations': annotations,
|
|
'highlights': highlights,
|
|
'summary': {
|
|
'total_annotations': len(annotations),
|
|
'total_highlights': len(highlights)
|
|
}
|
|
}
|
|
with open(output_path, 'w', encoding='utf-8') as f:
|
|
json.dump(data, f, indent=2, ensure_ascii=False)
|
|
print(f"💾 Saved to {output_path}")
|
|
|
|
def save_to_csv(self, annotations, highlights, output_path):
|
|
"""Save to CSV."""
|
|
all_items = []
|
|
for item in annotations:
|
|
item_copy = item.copy()
|
|
item_copy['category'] = 'annotation'
|
|
all_items.append(item_copy)
|
|
for item in highlights:
|
|
item_copy = item.copy()
|
|
item_copy['category'] = 'highlight'
|
|
all_items.append(item_copy)
|
|
|
|
df = pd.DataFrame(all_items)
|
|
df.to_csv(output_path, index=False, encoding='utf-8')
|
|
print(f"📊 Saved to {output_path}")
|
|
|
|
|
|
def is_test_mode():
|
|
"""Check if script is run in test mode."""
|
|
test_flags = ['--test', '-t', 'test']
|
|
return any(flag in sys.argv for flag in test_flags)
|
|
|
|
|
|
def main():
|
|
start_time = time.time()
|
|
|
|
test_mode = is_test_mode()
|
|
|
|
print("🎨 PDF Highlight Extractor - BALANCED PRECISION")
|
|
print("✅ More inclusive extraction (40% overlap vs 75%)")
|
|
print("✅ Small boundary expansion (+2 pixels)")
|
|
print("✅ Better word capture at highlight edges")
|
|
print("✅ Detailed extraction logging")
|
|
print("✅ Smart hyphenation merging")
|
|
|
|
if test_mode:
|
|
print("🧪 TEST MODE: Using defaults")
|
|
print("✅ Default file: /mnt/c/Users/admin/Downloads/test2.pdf")
|
|
print("✅ Skipping JSON/CSV output")
|
|
else:
|
|
print("🔧 FULL MODE: Interactive prompts")
|
|
|
|
print()
|
|
|
|
if test_mode:
|
|
default_pdf = "/mnt/c/Users/admin/Downloads/test2.pdf"
|
|
pdf_path = default_pdf
|
|
print(f"📄 Using default: {pdf_path}")
|
|
else:
|
|
pdf_input = input("📄 PDF file path: ").strip('"')
|
|
if not pdf_input:
|
|
print("❌ No file specified!")
|
|
return
|
|
pdf_path = pdf_input
|
|
|
|
if not Path(pdf_path).exists():
|
|
print("❌ File not found!")
|
|
return
|
|
|
|
output_json = ""
|
|
output_csv = ""
|
|
|
|
if test_mode:
|
|
print("📋 Test mode: Display only (no file output)")
|
|
else:
|
|
print("\n📤 Output options:")
|
|
output_json = input("💾 JSON file (Enter to skip): ").strip('"')
|
|
output_csv = input("📊 CSV file (Enter to skip): ").strip('"')
|
|
|
|
# Process
|
|
extractor = PDFHighlightExtractor(pdf_path)
|
|
annotations, highlights = extractor.extract_all_highlights()
|
|
|
|
# Display results
|
|
extractor.display_results()
|
|
|
|
# Save files (only in full mode and if specified)
|
|
if not test_mode:
|
|
if output_json:
|
|
extractor.save_to_json(annotations, highlights, output_json)
|
|
if output_csv:
|
|
extractor.save_to_csv(annotations, highlights, output_csv)
|
|
|
|
if not output_json and not output_csv:
|
|
print("\n📋 Display only - no files saved")
|
|
|
|
end_time = time.time()
|
|
elapsed_time = end_time - start_time
|
|
|
|
print(f"\n⏱️ Processing completed in {elapsed_time:.2f} seconds")
|
|
|
|
if test_mode:
|
|
print("\n🧪 Test mode completed. Use without --test flag for full options.")
|
|
|
|
|
|
if __name__ == '__main__':
|
|
main()
|