hilitehero/main.py
ilia 70e3b66c95 add documentation and update readme.
fix extraction issues.
Add test param.
2025-05-27 11:55:20 -04:00

710 lines
25 KiB
Python

"""
PDF Highlight Extractor
======================
A robust tool for extracting highlighted text from PDF files with intelligent text ordering
and hyphenation handling.
Overview:
--------
This tool addresses common PDF text extraction challenges:
- PDFs store text in creation order, not reading order
- Multi-line highlights can extract in wrong sequence
- Hyphenated words across lines need rejoining
- Boundary words may be partially highlighted
Architecture:
------------
1. PDFHighlightExtractor: Main class handling extraction logic
2. Multi-method extraction: Fallback system for maximum compatibility
3. Smart text ordering: Line detection and geometric sorting
4. Hyphenation merger: Detects and combines split words
Technical Approach:
-----------------
METHOD A: PyMuPDF built-in text sorting
- Uses page.get_text("text", sort=True) for automatic ordering
- Most reliable for simple layouts
METHOD B: Text block extraction
- Extracts PDF text blocks which maintain better reading order
- Geometric sorting by block position
METHOD C: Enhanced word-level sorting
- Individual word extraction with custom line detection
- Groups words by Y-position, sorts by X-position within lines
- Handles complex multi-line highlights
Hyphenation Algorithm:
--------------------
1. Detects highlights ending with '-'
2. Checks next highlight for same color and reasonable distance
3. Merges: "lin-" + "guistics" → "linguistics"
4. Supports both same-page and cross-page hyphenation
Color Detection:
---------------
- RGB color space analysis
- Supports 4 highlight colors: Yellow, Pink, Green, Blue
- Handles both fill and stroke color properties
Precision Control:
-----------------
- 40% overlap threshold for word inclusion
- +2 pixel boundary expansion for edge cases
- 5-pixel line tolerance for multi-line detection
Usage Patterns:
--------------
Test Mode: python script.py --test
- Uses default PDF path
- Display-only output
- Quick testing and debugging
Full Mode: python script.py
- Interactive prompts for file paths
- Optional JSON/CSV export
- Complete control over options
"""
import time
import pdfplumber
import fitz # PyMuPDF
import json
from colorama import init, Fore, Back, Style
import pandas as pd
from pathlib import Path
import re
import sys
# Initialize colorama for colored terminal output
init(autoreset=True)
class PDFHighlightExtractor:
    """Extract annotations and colored highlights from a single PDF file.

    Two passes are made over the document:

    * ``extract_annotation_highlights`` reads annotation dictionaries
      through pdfplumber.
    * ``extract_background_highlights`` reads ``Highlight`` annotations
      through PyMuPDF, recovers the covered text in reading order and merges
      highlights that were split by an end-of-line hyphen.

    ``extract_all_highlights`` runs both passes and stores the results on
    ``self.annotations`` / ``self.highlights`` so ``display_results`` can
    print them afterwards.

    Usage:
        extractor = PDFHighlightExtractor('path/to/file.pdf')
        annotations, highlights = extractor.extract_all_highlights()
        extractor.display_results()
    """

    # Colors kept by extract_background_highlights; any other color is dropped.
    SUPPORTED_COLORS = ('yellow', 'pink', 'green', 'blue')
    # Annotation subtypes collected by extract_annotation_highlights.
    ANNOTATION_SUBTYPES = (
        'Highlight', 'Squiggly', 'StrikeOut', 'Underline', 'FreeText', 'Text',
    )
    # Minimum fraction of a word's area that must fall inside the expanded
    # highlight rectangle for the word to count as highlighted.
    MIN_WORD_OVERLAP = 0.40
    # Words whose vertical centers differ by at most this much share a line.
    LINE_TOLERANCE = 5

    def __init__(self, pdf_path):
        """Remember the PDF location; nothing is opened until extraction.

        Args:
            pdf_path: Path (str or path-like) of the PDF to process.
        """
        self.pdf_path = Path(pdf_path)
        self.annotations = []
        self.highlights = []

    def extract_annotation_highlights(self):
        """Collect text-bearing annotations using pdfplumber.

        Returns:
            A list of dicts with keys ``page`` (1-based), ``text``, ``color``,
            ``type`` (always ``'annotation'``) and ``y_position``. Any error
            while opening or walking the PDF is printed and whatever was
            collected so far is returned.
        """
        annotations = []
        try:
            with pdfplumber.open(self.pdf_path) as pdf:
                print(f"📄 Processing annotations...")
                for page_num, page in enumerate(pdf.pages, 1):
                    page_annots = getattr(page, 'annots', None)
                    if not page_annots:
                        continue
                    for annot in page_annots:
                        # Best effort: one malformed annotation must not
                        # abort the whole page, but report that it was skipped.
                        try:
                            annot_type = annot.get('subtype', 'Unknown')
                            if annot_type not in self.ANNOTATION_SUBTYPES:
                                continue
                            rect = annot.get('rect', [])
                            text = self._get_annotation_text(page, annot, rect)
                            color = self._get_simple_color(annot.get('color', []))
                            if text and text.strip():
                                annotations.append({
                                    'page': page_num,
                                    'text': text.strip(),
                                    'color': color,
                                    'type': 'annotation',
                                    'y_position': rect[1] if len(rect) >= 4 else 0
                                })
                        except Exception as e:
                            print(f" ⚠️ Skipped annotation on page {page_num}: {e}")
                            continue
            print(f" ✅ Found {len(annotations)} annotations")
        except Exception as e:
            print(f"❌ Error: {e}")
        return annotations

    def extract_background_highlights(self):
        """Collect ``Highlight`` annotations using PyMuPDF.

        Only highlights whose color maps to one of ``SUPPORTED_COLORS`` and
        whose extracted text is non-empty are kept. The raw list is then
        passed through ``_smart_hyphenation_merge``.

        Returns:
            A list of highlight dicts (``page``, ``text``, ``color``,
            ``type``, ``y_position``, ``x_position``, ``y_end``, ``x_end``,
            ``rect``), or ``[]`` if the document could not be processed.
        """
        all_highlights = []
        try:
            print(f"\n🎨 Processing highlights...")
            doc = fitz.open(str(self.pdf_path))
            try:
                for page_num in range(doc.page_count):
                    page = doc[page_num]
                    for annot in page.annots():
                        try:
                            if annot.type[1] != 'Highlight':
                                continue
                            color_name = self._get_highlight_color(annot.colors)
                            if color_name not in self.SUPPORTED_COLORS:
                                continue
                            text = self._extract_text_balanced(page, annot)
                            if text and text.strip():
                                all_highlights.append({
                                    'page': page_num + 1,
                                    'text': text.strip(),
                                    'color': color_name,
                                    'type': 'highlight',
                                    'y_position': annot.rect.y0,
                                    'x_position': annot.rect.x0,
                                    'y_end': annot.rect.y1,
                                    'x_end': annot.rect.x1,
                                    'rect': annot.rect
                                })
                                print(f" 🎨 {color_name.upper()}: \"{text[:70]}...\"")
                        except Exception as e:
                            print(f" ⚠️ Skipped highlight on page {page_num + 1}: {e}")
                            continue
            finally:
                # Always release the document, even if a page blew up.
                doc.close()
            merged_highlights = self._smart_hyphenation_merge(all_highlights)
            print(f" 📊 Raw: {len(all_highlights)} → Merged: {len(merged_highlights)}")
            return merged_highlights
        except Exception as e:
            print(f"❌ Error: {e}")
            return []

    @staticmethod
    def _expand_rect(rect):
        """Return ``rect`` grown by 2 units horizontally and 1 vertically."""
        return fitz.Rect(rect.x0 - 2, rect.y0 - 1, rect.x1 + 2, rect.y1 + 1)

    @staticmethod
    def _overlap_ratio(region, word_rect):
        """Return the fraction of ``word_rect``'s area that lies in ``region``.

        Returns 0.0 when the rectangles do not intersect or the word has no
        area, so callers can compare the result against a threshold directly.
        """
        if not region.intersects(word_rect):
            return 0.0
        word_area = word_rect.get_area()
        if word_area <= 0:
            return 0.0
        return (region & word_rect).get_area() / word_area

    def _order_words_by_lines(self, words):
        """Arrange word dicts top-to-bottom, then left-to-right per line.

        A word joins the first existing line whose mean ``center_y`` is within
        ``LINE_TOLERANCE`` of its own; otherwise it starts a new line.

        Args:
            words: Dicts carrying at least ``center_x`` and ``center_y``.

        Returns:
            A flat list of the same dicts in reading order.
        """
        lines = []
        for word in words:
            for line in lines:
                avg_y = sum(w['center_y'] for w in line) / len(line)
                if abs(word['center_y'] - avg_y) <= self.LINE_TOLERANCE:
                    line.append(word)
                    break
            else:
                lines.append([word])
        lines.sort(key=lambda line: sum(w['center_y'] for w in line) / len(line))
        ordered = []
        for line in lines:
            ordered.extend(sorted(line, key=lambda w: w['center_x']))
        return ordered

    def _extract_text_balanced(self, page, annot):
        """Return the text under a highlight annotation in reading order.

        Three strategies are tried in turn and the first non-empty result is
        returned:

        A. ``page.get_text("text", clip=..., sort=True)``.
        B. ``page.get_text("blocks", clip=...)`` sorted by (y, x).
        C. Word-level extraction: keep words overlapping the expanded
           rectangle by at least ``MIN_WORD_OVERLAP`` and order them by line.

        Args:
            page: The PyMuPDF page holding the annotation.
            annot: The PyMuPDF annotation; only ``annot.rect`` is used.

        Returns:
            The extracted text, or ``""`` if nothing was found or an error
            occurred (the error is printed).
        """
        try:
            # Slightly enlarge the highlight so boundary words are not clipped.
            expanded_rect = self._expand_rect(annot.rect)

            # METHOD A: built-in sorted text extraction.
            print(f" 🔍 Method A: Text extraction with sorting")
            text_with_sort = page.get_text("text", clip=expanded_rect, sort=True)
            if text_with_sort and text_with_sort.strip():
                cleaned_text = re.sub(r'\s+', ' ', text_with_sort.strip())
                print(f" ✅ Sorted text result: \"{cleaned_text}\"")
                return cleaned_text

            # METHOD B: text blocks, ordered top-to-bottom then left-to-right.
            print(f" 🔍 Method B: Text blocks extraction")
            text_blocks = page.get_text("blocks", clip=expanded_rect)
            if text_blocks:
                ordered_blocks = sorted(text_blocks, key=lambda block: (block[1], block[0]))
                block_texts = [
                    re.sub(r'\s+', ' ', block[4].strip())
                    for block in ordered_blocks
                    if len(block) >= 5 and block[4].strip()
                ]
                if block_texts:
                    combined_text = " ".join(block_texts)
                    print(f" ✅ Block result: \"{combined_text}\"")
                    return combined_text

            # METHOD C: individual words with geometric line grouping.
            print(f" 🔍 Method C: Enhanced word sorting")
            highlight_words = []
            for word in page.get_text("words"):
                ratio = self._overlap_ratio(expanded_rect, fitz.Rect(word[:4]))
                if ratio >= self.MIN_WORD_OVERLAP:
                    highlight_words.append({
                        'text': word[4],
                        'x0': word[0],
                        'y0': word[1],
                        'x1': word[2],
                        'y1': word[3],
                        'center_y': (word[1] + word[3]) / 2,
                        'center_x': (word[0] + word[2]) / 2
                    })
            if highlight_words:
                ordered_words = self._order_words_by_lines(highlight_words)
                extracted_text = " ".join([w['text'] for w in ordered_words])
                print(f" ✅ Enhanced word sorting ({len(ordered_words)} words): \"{extracted_text}\"")
                return extracted_text

            print(f" ❌ No text found in highlight area")
            return ""
        except Exception as e:
            print(f" ❌ Extraction error: {e}")
            return ""

    def _extract_by_quads_balanced(self, page, annot):
        """Return the text under a highlight using its quad points.

        ``annot.vertices`` is consumed four points at a time; each group is
        turned into a rectangle, expanded, and matched against the page words
        with the ``MIN_WORD_OVERLAP`` threshold. Duplicate hits (same text and
        origin) are dropped and the rest is ordered by estimated line number,
        then ``x0``.

        Returns:
            The extracted text, or ``""`` when there are no vertices, no
            matching words, or an error occurred (the error is printed).
        """
        try:
            quad_points = annot.vertices
            if not quad_points:
                return ""
            quad_count = len(quad_points) // 4
            all_words = page.get_text("words")
            highlight_words = []
            print(f" 🔍 Processing {quad_count} quads with balanced precision")
            for i in range(quad_count):
                points = quad_points[i * 4: i * 4 + 4]
                expanded_quad = self._expand_rect(fitz.Quad(points).rect)
                for word in all_words:
                    word_text = word[4]
                    overlap_ratio = self._overlap_ratio(expanded_quad, fitz.Rect(word[:4]))
                    if overlap_ratio >= self.MIN_WORD_OVERLAP:
                        highlight_words.append({
                            'text': word_text,
                            'x0': word[0],
                            'y0': word[1],
                            'line': self._estimate_line_number(word[1])
                        })
                        print(f" ✓ Quad '{word_text}' (overlap: {overlap_ratio:.2f})")
            if highlight_words:
                # A word straddling two quads is reported twice; keep the first.
                seen = set()
                unique_words = []
                for word in highlight_words:
                    word_key = (word['text'], word['x0'], word['y0'])
                    if word_key not in seen:
                        seen.add(word_key)
                        unique_words.append(word)
                unique_words.sort(key=lambda w: (w['line'], w['x0']))
                extracted_text = " ".join([w['text'] for w in unique_words])
                print(f" ✅ Quad balanced ({len(unique_words)} words): \"{extracted_text}\"")
                return extracted_text
            return ""
        except Exception as e:
            print(f" ❌ Quad extraction error: {e}")
            return ""

    def _estimate_line_number(self, y_position, avg_line_height=14):
        """Return ``y_position / avg_line_height`` rounded to an int."""
        return round(y_position / avg_line_height)

    def _smart_hyphenation_merge(self, highlights):
        """Merge adjacent highlights that form one hyphenated phrase.

        Highlights are ordered by (page, color, y, x) and each one is compared
        with its immediate successor via ``_is_clear_hyphenation``. A matching
        pair becomes one entry (a copy of the first with the joined text) that
        is tagged with ``hyphen_merged`` or, across pages, ``pages_spanned``.

        Args:
            highlights: Highlight dicts as built by
                ``extract_background_highlights``. The list is not modified.

        Returns:
            A new list with merged entries; the input itself if it is empty.
        """
        if not highlights:
            return highlights
        # Work on a sorted copy so the caller's list keeps its order.
        ordered = sorted(
            highlights,
            key=lambda x: (x['page'], x['color'], x['y_position'], x['x_position']),
        )
        merged = []
        i = 0
        while i < len(ordered):
            current = ordered[i]
            if (i + 1 < len(ordered) and
                    self._is_clear_hyphenation(current, ordered[i + 1])):
                next_hl = ordered[i + 1]
                merged_text = self._join_hyphenated_text(current['text'], next_hl['text'])
                merged_highlight = current.copy()
                merged_highlight['text'] = merged_text
                if current['page'] != next_hl['page']:
                    merged_highlight['pages_spanned'] = f"Pages {current['page']}-{next_hl['page']}"
                    print(f" 🔗 Cross-page hyphen: \"{merged_text[:80]}\"")
                else:
                    merged_highlight['hyphen_merged'] = True
                    print(f" 🔗 Same-page hyphen: \"{merged_text[:80]}\"")
                merged.append(merged_highlight)
                i += 2  # both halves are consumed
            else:
                merged.append(current)
                i += 1
        return merged

    def _is_clear_hyphenation(self, hl1, hl2):
        """Return True when ``hl2`` continues a word that ``hl1`` ends with '-'.

        Both highlights must share a color and ``hl1``'s text must end with a
        hyphen. On the same page ``hl2`` must start 8-30 units below ``hl1``;
        on the following page ``hl2`` must start above y = 150.
        """
        if hl1['color'] != hl2['color']:
            return False
        text1 = hl1['text'].strip()
        text2 = hl2['text'].strip()
        if not text1.endswith('-'):
            return False
        if hl1['page'] == hl2['page']:
            y_diff = abs(hl1['y_position'] - hl2['y_position'])
            if 8 <= y_diff <= 30 and hl2['y_position'] > hl1['y_position']:
                print(f" 🔍 Same-page hyphen detected: '{text1}' + '{text2[:15]}'")
                return True
        elif hl2['page'] == hl1['page'] + 1 and hl2['y_position'] < 150:
            print(f" 🔍 Cross-page hyphen detected: '{text1}' + '{text2[:15]}'")
            return True
        return False

    def _join_hyphenated_text(self, text1, text2):
        """Join two fragments, dropping a trailing hyphen on the first.

        Without a trailing hyphen the fragments are joined with one space.
        """
        text1 = text1.strip()
        text2 = text2.strip()
        if text1.endswith('-'):
            return text1[:-1] + text2
        return text1 + " " + text2

    def _get_highlight_color(self, colors):
        """Map an annotation color dict to a simple color name.

        Args:
            colors: Mapping that may contain ``fill`` and/or ``stroke`` RGB
                sequences; ``fill`` wins when both are set.

        Returns:
            A color name from ``_rgb_to_simple_color`` or ``'unknown'``.
        """
        if not colors:
            return 'unknown'
        rgb = colors.get('fill') or colors.get('stroke')
        if not rgb:
            return 'unknown'
        return self._rgb_to_simple_color(rgb)

    def _rgb_to_simple_color(self, rgb):
        """Classify an RGB triple as yellow, green, blue, pink or unknown.

        Args:
            rgb: Sequence whose first three items are the red, green and blue
                channels, either all in 0-1 or in 0-255.

        Returns:
            ``'yellow'``, ``'green'``, ``'blue'``, ``'pink'`` or ``'unknown'``.
        """
        if not rgb or len(rgb) < 3:
            return 'unknown'
        r, g, b = rgb[:3]
        # Treat the triple as normalised only if *every* channel is <= 1;
        # looking at red alone mis-scales 0-255 colors with a tiny red value.
        if max(r, g, b) <= 1:
            r, g, b = r * 255, g * 255, b * 255
        if r > 220 and g > 220 and b < 120:
            return 'yellow'
        if r < 120 and g > 180 and b < 120:
            return 'green'
        if r < 120 and g < 180 and b > 180:
            return 'blue'
        if r > 180 and g < 180 and b > 180:
            return 'pink'
        # Fallbacks. The yellow test has to run before the dominant-channel
        # tests: with strong red AND green the maximum is always red or green,
        # so testing it afterwards would never be reached. Requiring blue to
        # be the weakest channel keeps neutral (white/grey) colors out.
        if r > 180 and g > 180 and b < min(r, g):
            return 'yellow'
        max_val = max(r, g, b)
        if max_val == r and r > 150:
            return 'pink'
        if max_val == g and g > 150:
            return 'green'
        if max_val == b and b > 150:
            return 'blue'
        return 'unknown'

    def _get_simple_color(self, color_rgb):
        """Return the simple color name for ``color_rgb`` or ``'unknown'``."""
        if color_rgb:
            return self._rgb_to_simple_color(color_rgb)
        return 'unknown'

    def _get_annotation_text(self, page, annot, rect):
        """Return an annotation's text, preferring its ``contents`` entry.

        When ``contents`` is missing, ``None`` or blank and ``rect`` has four
        values, the page is cropped to the rectangle (padded by 1) and the
        text inside is extracted instead.

        Args:
            page: pdfplumber page the annotation belongs to.
            annot: Annotation mapping.
            rect: ``[x0, y0, x1, y1]`` or an empty/short sequence.

        Returns:
            The stripped text, or ``""`` when nothing could be extracted.
        """
        # ``contents`` may be present but None; guard before stripping so the
        # crop fallback below still gets a chance to run.
        text = (annot.get('contents') or '').strip()
        if text:
            return text
        if rect and len(rect) == 4:
            try:
                x0, y0, x1, y1 = rect
                # NOTE(review): assumes rect is already expressed in the
                # coordinate system page.crop() expects — confirm.
                cropped = page.crop((x0 - 1, y0 - 1, x1 + 1, y1 + 1))
                text = cropped.extract_text()
                if text and text.strip():
                    return text.strip()
            except Exception:
                # Best effort: an invalid crop box simply yields no text.
                pass
        return ""

    def extract_all_highlights(self):
        """Run both extraction passes and cache their results.

        Returns:
            Tuple ``(annotations, highlights)``; the same lists are stored on
            ``self.annotations`` and ``self.highlights``.
        """
        print("🔍 PDF Highlight Extractor - BALANCED PRECISION")
        print("🎯 Colors: Yellow, Pink, Green, Blue only")
        print("🎯 BALANCED extraction - complete highlights without over-capture")
        print("📏 Small expansion (+2 pixels) for boundary words")
        print("🔍 40% overlap requirement (was 75% - more inclusive)")
        print("🔗 Smart hyphenation merging")
        print("=" * 70)
        self.annotations = self.extract_annotation_highlights()
        self.highlights = self.extract_background_highlights()
        print(f"\n✨ Total: {len(self.annotations)} annotations, {len(self.highlights)} highlights")
        return self.annotations, self.highlights

    def display_results(self):
        """Print cached annotations and highlights grouped by page.

        Note: each cached item is tagged in place with a ``category`` key
        (``'annotation'`` or ``'highlight'``), so the tag is visible to any
        later consumer of ``self.annotations`` / ``self.highlights``.
        """
        print("\n" + "="*70)
        print("📋 EXTRACTION RESULTS")
        print("="*70)
        all_items = []
        for item in self.annotations:
            item['category'] = 'annotation'
            all_items.append(item)
        for item in self.highlights:
            item['category'] = 'highlight'
            all_items.append(item)
        if not all_items:
            print("\n❌ No highlights found")
            return
        all_items.sort(key=lambda x: (x['page'], x['y_position']))
        current_page = None
        for item in all_items:
            if item['page'] != current_page:
                current_page = item['page']
                print(f"\n📄 Page {current_page}")
                print("-" * 25)
            color_code = self._get_color_display(item['color'])
            icon = "📝" if item['category'] == 'annotation' else "🎨"
            merge_info = ""
            if item.get('pages_spanned'):
                merge_info = f" ({item['pages_spanned']})"
            elif item.get('hyphen_merged'):
                merge_info = " (hyphen-merged)"
            print(f"{icon} {color_code}{item['color'].upper()}{Style.RESET_ALL}{merge_info}")
            print(f" \"{item['text']}\"")

    def _get_color_display(self, color_name):
        """Return the colorama escape prefix used to print ``color_name``.

        Unrecognised names fall back to black text on a white background.
        """
        colors = {
            'yellow': Back.YELLOW + Fore.BLACK,
            'green': Back.GREEN + Fore.BLACK,
            'blue': Back.BLUE + Fore.WHITE,
            'pink': Back.MAGENTA + Fore.WHITE,
        }
        return colors.get(color_name, Back.WHITE + Fore.BLACK)

    @staticmethod
    def _json_default(value):
        """Serialise sequence-like values (e.g. a highlight's rect) as lists.

        Raises:
            TypeError: If ``value`` cannot be iterated, mirroring the error
                ``json`` itself raises for unsupported objects.
        """
        try:
            return list(value)
        except TypeError:
            raise TypeError(
                f"Object of type {type(value).__name__} is not JSON serializable"
            ) from None

    def save_to_json(self, annotations, highlights, output_path):
        """Write annotations, highlights and a count summary as UTF-8 JSON.

        Highlight dicts carry their rectangle object under ``'rect'``; it is
        written as a plain list of coordinates because ``json`` cannot
        serialise the rectangle type directly.

        Args:
            annotations: List of annotation dicts.
            highlights: List of highlight dicts.
            output_path: Destination file path; overwritten if it exists.
        """
        data = {
            'annotations': annotations,
            'highlights': highlights,
            'summary': {
                'total_annotations': len(annotations),
                'total_highlights': len(highlights)
            }
        }
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False, default=self._json_default)
        print(f"💾 Saved to {output_path}")

    def save_to_csv(self, annotations, highlights, output_path):
        """Write annotations and highlights as one UTF-8 CSV table.

        Each row is a copy of the source dict with an added ``category``
        column; the input dicts are left untouched.

        Args:
            annotations: List of annotation dicts.
            highlights: List of highlight dicts.
            output_path: Destination file path; overwritten if it exists.
        """
        all_items = []
        for item in annotations:
            item_copy = item.copy()
            item_copy['category'] = 'annotation'
            all_items.append(item_copy)
        for item in highlights:
            item_copy = item.copy()
            item_copy['category'] = 'highlight'
            all_items.append(item_copy)
        df = pd.DataFrame(all_items)
        df.to_csv(output_path, index=False, encoding='utf-8')
        print(f"📊 Saved to {output_path}")
def is_test_mode():
    """Report whether a test flag was passed on the command line.

    Returns:
        True if any element of ``sys.argv`` is exactly ``--test``, ``-t`` or
        ``test``; False otherwise.
    """
    recognised_flags = {'--test', '-t', 'test'}
    # Non-disjoint sets means at least one argv entry is a recognised flag.
    return not recognised_flags.isdisjoint(sys.argv)
def main():
    """Command-line entry point.

    In test mode (see ``is_test_mode``) a fixed default PDF is processed and
    results are only displayed. Otherwise the user is prompted for the PDF
    path and for optional JSON / CSV output paths. The function returns early
    when no path is given or the file does not exist, and always reports the
    elapsed wall-clock time after a completed run.
    """
    start_time = time.time()
    test_mode = is_test_mode()

    # Banner.
    print("🎨 PDF Highlight Extractor - BALANCED PRECISION")
    print("✅ More inclusive extraction (40% overlap vs 75%)")
    print("✅ Small boundary expansion (+2 pixels)")
    print("✅ Better word capture at highlight edges")
    print("✅ Detailed extraction logging")
    print("✅ Smart hyphenation merging")
    if not test_mode:
        print("🔧 FULL MODE: Interactive prompts")
    else:
        print("🧪 TEST MODE: Using defaults")
        print("✅ Default file: /mnt/c/Users/admin/Downloads/test2.pdf")
        print("✅ Skipping JSON/CSV output")
    print()

    # Resolve the input file.
    if not test_mode:
        pdf_path = input("📄 PDF file path: ").strip('"')
        if not pdf_path:
            print("❌ No file specified!")
            return
    else:
        pdf_path = "/mnt/c/Users/admin/Downloads/test2.pdf"
        print(f"📄 Using default: {pdf_path}")
    if not Path(pdf_path).exists():
        print("❌ File not found!")
        return

    # Resolve the optional output files (never written in test mode).
    output_json = output_csv = ""
    if not test_mode:
        print("\n📤 Output options:")
        output_json = input("💾 JSON file (Enter to skip): ").strip('"')
        output_csv = input("📊 CSV file (Enter to skip): ").strip('"')
    else:
        print("📋 Test mode: Display only (no file output)")

    # Extract and show.
    extractor = PDFHighlightExtractor(pdf_path)
    annotations, highlights = extractor.extract_all_highlights()
    extractor.display_results()

    # Persist results when running interactively and a target was given.
    if not test_mode:
        if output_json:
            extractor.save_to_json(annotations, highlights, output_json)
        if output_csv:
            extractor.save_to_csv(annotations, highlights, output_csv)
        if not (output_json or output_csv):
            print("\n📋 Display only - no files saved")

    elapsed_time = time.time() - start_time
    print(f"\n⏱️ Processing completed in {elapsed_time:.2f} seconds")
    if test_mode:
        print("\n🧪 Test mode completed. Use without --test flag for full options.")


if __name__ == '__main__':
    main()