# hilitehero/main.py
#
# 541 lines
# 21 KiB
# Python

import pdfplumber
import fitz # PyMuPDF
import json
from colorama import init, Fore, Back, Style
import pandas as pd
from pathlib import Path
import re
# Initialize colorama for colored terminal output
init(autoreset=True)
class PDFHighlightExtractor:
    """Extract highlights and annotations from a single PDF document."""

    def __init__(self, pdf_path):
        """Store the PDF path and initialise empty result containers.

        Args:
            pdf_path: path (str or Path) to the PDF file to process.
        """
        self.pdf_path = Path(pdf_path)
        # Populated by extract_all_highlights().
        self.annotations = []
        self.highlights = []
def extract_annotation_highlights(self):
    """Collect every supported annotation from the PDF via pdfplumber.

    Supported subtypes: Highlight, Squiggly, StrikeOut, Underline,
    FreeText and Text. Annotations whose text cannot be recovered are
    skipped silently.

    Returns:
        list[dict]: one entry per annotation with 'page', 'text',
        'color', 'type', 'coordinates' and 'y_position' keys.
    """
    collected = []
    supported = ('Highlight', 'Squiggly', 'StrikeOut', 'Underline', 'FreeText', 'Text')
    try:
        with pdfplumber.open(self.pdf_path) as pdf:
            print("📄 Processing annotations...")
            for page_num, page in enumerate(pdf.pages, 1):
                if not (hasattr(page, 'annots') and page.annots):
                    continue
                found_on_page = 0
                for annot in page.annots:
                    try:
                        subtype = annot.get('subtype', 'Unknown')
                        if subtype not in supported:
                            continue
                        bbox = annot.get('rect', [])
                        # Several extraction fallbacks are tried in order.
                        raw_text = self._get_annotation_text(page, annot, bbox)
                        if raw_text and raw_text.strip():
                            collected.append({
                                'page': page_num,
                                'text': self._clean_text(raw_text),
                                'color': self._get_color_from_annot(annot),
                                'type': f'annotation_{subtype.lower()}',
                                'coordinates': bbox,
                                'y_position': bbox[1] if len(bbox) >= 4 else 0,
                            })
                            found_on_page += 1
                    except Exception:
                        # A malformed annotation should not abort the page.
                        continue
                if found_on_page > 0:
                    print(f" ✅ Page {page_num}: Found {found_on_page} annotations")
            print(f" 📊 Total annotations: {len(collected)}")
    except Exception as e:
        print(f"❌ Error reading annotations: {e}")
    return collected
def _get_annotation_text(self, page, annot, rect):
"""Try multiple methods to extract annotation text."""
# Method 1: From annotation contents
text = annot.get('contents', '').strip()
if text:
return text
# Method 2: From rect area
if rect and len(rect) == 4:
try:
x0, y0, x1, y1 = rect
cropped = page.crop((x0-1, y0-1, x1+1, y1+1))
text = cropped.extract_text()
if text and text.strip():
return text.strip()
except:
pass
# Method 3: From annotation object properties
for prop in ['label', 'title', 'subject']:
text = annot.get(prop, '').strip()
if text:
return text
return ""
def extract_background_highlights(self):
    """Collect background highlight annotations via PyMuPDF.

    Highlights with an unrecognisable colour or fewer than three
    characters of recoverable text are skipped. Clipped words at the
    rectangle edges are repaired against the page word list.

    Returns:
        list[dict]: one entry per highlight with 'page', 'text',
        'color', 'type', 'coordinates' and 'y_position' keys.
    """
    results = []
    try:
        print("\n🎨 Processing highlights...")
        doc = fitz.open(str(self.pdf_path))
        for index in range(doc.page_count):
            page = doc[index]
            found = 0
            # Full word list of the page, used to complete words that
            # the highlight rectangle cuts in half.
            words_on_page = page.get_text("words")
            for annot in page.annots():
                try:
                    if annot.type[1] != 'Highlight':
                        continue
                    shade = self._analyze_highlight_color(annot.colors)
                    if shade == 'unknown':
                        continue
                    bounds = annot.rect
                    raw = self._extract_text_from_rect_pymupdf(page, bounds)
                    if raw and len(raw.strip()) > 2:
                        repaired = self._complete_partial_words(raw, bounds, words_on_page)
                        results.append({
                            'page': index + 1,
                            'text': self._clean_text(repaired),
                            'color': shade,
                            'type': 'highlight',
                            'coordinates': list(bounds),
                            'y_position': bounds.y0,
                        })
                        found += 1
                except Exception:
                    # Skip broken annotations, keep processing the page.
                    continue
            if found > 0:
                print(f" ✅ Page {index + 1}: Found {found} highlights")
        doc.close()
        print(f" 📊 Total highlights: {len(results)}")
    except Exception as e:
        print(f"❌ Error reading highlights: {e}")
    return results
def _complete_partial_words(self, highlight_text, rect, all_words):
    """Repair words clipped at the edges of a highlight rectangle.

    Args:
        highlight_text: raw text pulled from the highlight area.
        rect: highlight bounds (anything fitz.Rect accepts).
        all_words: page word list as returned by page.get_text("words"),
            i.e. (x0, y0, x1, y1, "word", block, line, word) tuples.

    Returns:
        str: the text with first/last words replaced by their full
        forms when a better candidate is found nearby; otherwise the
        input unchanged.
    """
    if not highlight_text or not all_words:
        return highlight_text
    tokens = highlight_text.split()
    if not tokens:
        return highlight_text
    # Both edge words are captured before any replacement happens.
    head, tail = tokens[0], tokens[-1]
    area = fitz.Rect(rect)
    # Look a little beyond the highlight so clipped neighbours qualify.
    search_area = fitz.Rect(area.x0 - 50, area.y0 - 5, area.x1 + 50, area.y1 + 5)
    neighbours = [
        (fitz.Rect(info[:4]), info[4])
        for info in all_words
        if fitz.Rect(info[:4]).intersects(search_area)
    ]
    # Reading order: top to bottom, then left to right.
    neighbours.sort(key=lambda entry: (entry[0].y0, entry[0].x0))
    if len(head) >= 3 and self._is_likely_partial(head):
        full = self._find_complete_word(head, neighbours, 'start')
        if full and full != head:
            tokens[0] = full
            print(f" 🔧 Completed first word: '{head}''{full}'")
    if len(tail) >= 3 and self._is_likely_partial(tail):
        full = self._find_complete_word(tail, neighbours, 'end')
        if full and full != tail:
            tokens[-1] = full
            print(f" 🔧 Completed last word: '{tail}''{full}'")
    return ' '.join(tokens)
def _is_likely_partial(self, word):
"""Check if a word is likely partial/incomplete."""
if not word:
return False
# Common indicators of partial words
partial_indicators = [
len(word) < 3, # Very short
word.endswith('-'), # Hyphenated break
not word.isalpha() and not word[-1].isalpha(), # Ends with punctuation
word.lower() in ['the', 'and', 'of', 'to', 'in', 'for', 'with'], # Complete common words
]
# If it's a common complete word, it's not partial
if word.lower() in ['the', 'and', 'of', 'to', 'in', 'for', 'with', 'a', 'an', 'is', 'are', 'was', 'were']:
return False
# Check for incomplete endings (consonant clusters that suggest more letters)
if len(word) >= 4:
ending = word[-2:].lower()
incomplete_endings = ['th', 'st', 'nd', 'rd', 'ch', 'sh', 'nt', 'mp', 'ck', 'ng']
if any(word.lower().endswith(end) for end in incomplete_endings):
return True
# Check if it doesn't end with typical word endings
common_endings = ['ed', 'ing', 'er', 'est', 'ly', 'ion', 'tion', 'ment', 'ness', 'ful', 'less', 'able', 'ible']
if len(word) >= 4 and not any(word.lower().endswith(end) for end in common_endings):
return True
return False
def _find_complete_word(self, partial_word, nearby_words, position):
"""Find the complete word that contains the partial word."""
partial_lower = partial_word.lower()
candidates = []
for word_rect, full_word in nearby_words:
full_word_lower = full_word.lower()
if position == 'start':
# For start position, the partial word should be at the end of the complete word
if full_word_lower.endswith(partial_lower) and len(full_word) > len(partial_word):
candidates.append((full_word, len(full_word)))
elif position == 'end':
# For end position, the partial word should be at the start of the complete word
if full_word_lower.startswith(partial_lower) and len(full_word) > len(partial_word):
candidates.append((full_word, len(full_word)))
# Return the longest candidate (most likely to be the complete word)
if candidates:
candidates.sort(key=lambda x: x[1], reverse=True)
return candidates[0][0]
return partial_word
def _extract_text_from_rect_pymupdf(self, page, rect):
    """Pull text from *rect* on *page*, trying three PyMuPDF approaches.

    Order: clipped plain-text extraction, the textbox helper, then a
    span walk over a 2pt-enlarged clip.

    Args:
        page: fitz page object.
        rect: fitz.Rect-like bounds of the highlight.

    Returns:
        str: the first non-empty extraction (stripped), the joined span
        text of the enlarged clip, or "" on failure.
    """
    try:
        # Attempt 1: plain clipped text extraction.
        found = page.get_text("text", clip=rect)
        if found and found.strip():
            return found.strip()
        # Attempt 2: the textbox helper.
        found = page.get_textbox(rect)
        if found and found.strip():
            return found.strip()
        # Attempt 3: walk the span structure of a slightly larger clip
        # to catch glyphs straddling the rectangle boundary.
        grown = fitz.Rect(rect.x0 - 2, rect.y0 - 2, rect.x1 + 2, rect.y1 + 2)
        layout = page.get_text("dict", clip=grown)
        pieces = []
        for block in layout.get("blocks", []):
            for line in block.get("lines", []):
                for span in line["spans"]:
                    if span["text"].strip():
                        pieces.append(span["text"])
        return " ".join(pieces)
    except Exception:
        # Narrowed from a bare except: extraction failures yield "",
        # but SystemExit/KeyboardInterrupt still propagate.
        return ""
def _analyze_highlight_color(self, colors):
    """Name the colour of a highlight from its fitz colour dict.

    The fill colour (the highlight's background) is preferred; the
    stroke colour is the fallback.

    Args:
        colors: mapping with optional 'fill'/'stroke' RGB sequences.

    Returns:
        str: a colour name, or 'unknown' when no colour is present.
    """
    if not colors:
        return 'unknown'
    if colors.get('fill'):
        return self._rgb_to_color_name(colors['fill'])
    if colors.get('stroke'):
        return self._rgb_to_color_name(colors['stroke'])
    return 'unknown'
def _get_color_from_annot(self, annot):
    """Name the colour of a pdfplumber annotation.

    Args:
        annot: annotation mapping; its optional 'color' entry is an
            RGB sequence.

    Returns:
        str: colour name, or 'unknown' when absent or unreadable.
    """
    try:
        color = annot.get('color', [])
        if color:
            return self._rgb_to_color_name(color)
    except Exception:
        # Narrowed from a bare except: malformed colour entries are
        # treated as unknown rather than fatal, but system exits
        # still propagate.
        pass
    return 'unknown'
def _rgb_to_color_name(self, rgb):
"""Convert RGB values to color names with improved precision."""
if not rgb or len(rgb) < 3:
return 'unknown'
r, g, b = rgb[:3]
# Precise color detection
if r > 0.7 and g > 0.7 and b < 0.6:
return 'yellow'
elif r < 0.6 and g > 0.7 and b < 0.6:
return 'green'
elif r < 0.6 and g < 0.8 and b > 0.7:
return 'blue'
elif r > 0.7 and g < 0.6 and b > 0.7:
return 'pink'
elif r > 0.8 and g > 0.5 and b < 0.5:
return 'orange'
elif r > 0.7 and g < 0.5 and b < 0.5:
return 'red'
elif r < 0.5 and g > 0.7 and b > 0.7:
return 'cyan'
else:
return f'rgb({r:.2f},{g:.2f},{b:.2f})'
def _clean_text(self, text):
"""Clean and normalize text."""
if not text:
return ""
try:
# Remove extra whitespace and normalize
text = re.sub(r'\s+', ' ', text.strip())
# Remove line break hyphens
text = re.sub(r'-\s+', '', text)
# Fix punctuation spacing
text = re.sub(r'\s+([.,;:!?])', r'\1', text)
return text
except:
return str(text) if text else ""
def _smart_deduplicate(self, items):
    """Drop duplicate/subset highlights sharing page, colour and row.

    Items are first sorted in place by page, vertical position and text
    length. Two entries on the same page with the same colour whose
    vertical positions differ by less than 10 units are compared: if
    one text contains the other, the longer text survives; texts with
    more than 90% word overlap are treated as duplicates.

    Args:
        items: list of highlight dicts. Mutated: the list is sorted in
            place, and a kept entry's 'text' may be upgraded to a
            longer duplicate's text.

    Returns:
        list: the surviving entries.
    """
    if not items:
        return items
    items.sort(key=lambda entry: (entry['page'], entry['y_position'], len(entry['text'])))
    kept = []
    for candidate in items:
        duplicate = False
        for survivor in kept:
            same_slot = (candidate['page'] == survivor['page']
                         and candidate['color'] == survivor['color']
                         and abs(candidate['y_position'] - survivor['y_position']) < 10)
            if not same_slot:
                continue
            cand_text = candidate['text'].lower().strip()
            surv_text = survivor['text'].lower().strip()
            if cand_text in surv_text:
                # Candidate is contained in an existing entry: drop it.
                duplicate = True
            elif surv_text in cand_text:
                # Existing entry is contained in the candidate: keep
                # the longer text on the surviving entry.
                survivor['text'] = candidate['text']
                duplicate = True
            elif self._text_similarity(cand_text, surv_text) > 0.9:
                # Near-identical wording counts as a duplicate.
                duplicate = True
            if duplicate:
                break
        if not duplicate:
            kept.append(candidate)
    return kept
def _text_similarity(self, text1, text2):
"""Calculate text similarity ratio."""
if not text1 or not text2:
return 0
# Simple word-based similarity
words1 = set(text1.split())
words2 = set(text2.split())
if not words1 or not words2:
return 0
intersection = len(words1.intersection(words2))
union = len(words1.union(words2))
return intersection / union if union > 0 else 0
def extract_all_highlights(self):
    """Run the full pipeline: annotations, highlights, deduplication.

    Results are stored on self.annotations / self.highlights and also
    returned to the caller.

    Returns:
        tuple[list, list]: (annotations, deduplicated highlights).
    """
    print("🔍 PDF Highlight & Annotation Extractor")
    print("=" * 50)
    self.annotations = self.extract_annotation_highlights()
    self.highlights = self.extract_background_highlights()
    # Collapse overlapping/duplicate highlight rectangles.
    self.highlights = self._smart_deduplicate(self.highlights)
    print("\n✨ Processing complete!")
    print(f" 📝 Annotations: {len(self.annotations)}")
    print(f" 🎨 Highlights: {len(self.highlights)}")
    return self.annotations, self.highlights
def sort_by_position(self, items):
    """Order items by page number, then top-to-bottom on the page.

    Args:
        items: iterable of dicts with 'page' and 'y_position' keys.

    Returns:
        list: a new sorted list; the input is left untouched.
    """
    def reading_order(entry):
        return (entry['page'], entry['y_position'])

    return sorted(items, key=reading_order)
def save_to_json(self, annotations, highlights, output_path):
    """Write annotations, highlights and a summary block to JSON.

    Args:
        annotations: list of annotation dicts.
        highlights: list of highlight dicts.
        output_path: destination file path.
    """
    payload = {
        'annotations': annotations,
        'highlights': highlights,
        'summary': {
            'total_annotations': len(annotations),
            'total_highlights': len(highlights),
            # Distinct colour names seen in each group.
            'annotation_colors': list({entry['color'] for entry in annotations}),
            'highlight_colors': list({entry['color'] for entry in highlights}),
        },
    }
    with open(output_path, 'w', encoding='utf-8') as handle:
        json.dump(payload, handle, indent=2, ensure_ascii=False)
    print(f"💾 Saved to {output_path}")
def save_to_csv(self, annotations, highlights, output_path):
    """Write annotations and highlights to a single CSV file.

    A 'category' column ('annotation' or 'highlight') distinguishes the
    two groups; the input dicts are copied, not mutated.

    Args:
        annotations: list of annotation dicts.
        highlights: list of highlight dicts.
        output_path: destination file path.
    """
    rows = []
    for category, group in (('annotation', annotations), ('highlight', highlights)):
        for entry in group:
            row = dict(entry)
            row['category'] = category
            rows.append(row)
    frame = pd.DataFrame(rows)
    frame.to_csv(output_path, index=False, encoding='utf-8')
    print(f"📊 Saved to {output_path}")
def display_results(self):
    """Pretty-print the extracted annotations and highlights.

    Reads self.annotations and self.highlights, sorts each group into
    reading order and renders colour-coded terminal output.
    """
    print("\n" + "=" * 60)
    print("📋 EXTRACTION RESULTS")
    print("=" * 60)
    # Annotations section.
    if self.annotations:
        ordered = self.sort_by_position(self.annotations)
        print(f"\n📝 ANNOTATIONS ({len(ordered)} items)")
        print("-" * 40)
        for idx, entry in enumerate(ordered, 1):
            swatch = self._get_color_code(entry['color'])
            print(f"\n{idx:2d}. Page {entry['page']} | {swatch}{entry['color'].upper()}{Style.RESET_ALL}")
            print(f" Type: {entry['type']}")
            print(f" Text: \"{entry['text']}\"")
    else:
        print("\n📝 ANNOTATIONS: None found")
    # Highlights section.
    if self.highlights:
        ordered = self.sort_by_position(self.highlights)
        print(f"\n🎨 BACKGROUND HIGHLIGHTS ({len(ordered)} items)")
        print("-" * 40)
        for idx, entry in enumerate(ordered, 1):
            swatch = self._get_color_code(entry['color'])
            print(f"\n{idx:2d}. Page {entry['page']} | {swatch}{entry['color'].upper()}{Style.RESET_ALL}")
            print(f" Text: \"{entry['text']}\"")
    else:
        print("\n🎨 BACKGROUND HIGHLIGHTS: None found")
    print("\n" + "=" * 60)
def _get_color_code(self, color_name):
    """Return the colorama background/foreground pair for a colour name.

    Args:
        color_name: colour name produced by _rgb_to_color_name().

    Returns:
        str: ANSI escape prefix; unrecognised names fall back to
        black-on-white.
    """
    palette = {
        'yellow': Back.YELLOW + Fore.BLACK,
        'green': Back.GREEN + Fore.BLACK,
        'blue': Back.BLUE + Fore.WHITE,
        'red': Back.RED + Fore.WHITE,
        'pink': Back.MAGENTA + Fore.WHITE,
        'orange': Back.YELLOW + Fore.RED,
        'cyan': Back.CYAN + Fore.BLACK,
        'unknown': Back.WHITE + Fore.BLACK,
    }
    return palette.get(color_name, Back.WHITE + Fore.BLACK)
def main():
    """Interactive entry point: prompt for paths, extract, display, save."""
    print("🎨 PDF Highlight & Annotation Extractor")
    print("🚀 Enhanced with smart word completion and deduplication")
    print()
    # Strip surrounding quotes that drag-and-drop shells often add.
    pdf_path = input("📄 Enter PDF file path: ").strip('"')
    if not Path(pdf_path).exists():
        print("❌ File not found!")
        return
    print("\n📤 Output Options:")
    output_json = input("💾 JSON file (or Enter to skip): ").strip('"')
    output_csv = input("📊 CSV file (or Enter to skip): ").strip('"')
    extractor = PDFHighlightExtractor(pdf_path)
    annotations, highlights = extractor.extract_all_highlights()
    extractor.display_results()
    # Optional exports: an empty answer skips the corresponding file.
    if output_json:
        extractor.save_to_json(annotations, highlights, output_json)
    if output_csv:
        extractor.save_to_csv(annotations, highlights, output_csv)


if __name__ == '__main__':
    main()