# Source listing: 541 lines, 21 KiB, Python
# Standard library
import json
import re
from pathlib import Path

# Third-party
import fitz  # PyMuPDF
import pandas as pd
import pdfplumber
from colorama import init, Fore, Back, Style

# Initialize colorama for colored terminal output.
# autoreset=True restores the default terminal style after every print call.
init(autoreset=True)


class PDFHighlightExtractor:
    """Extract annotations and coloured highlights from a PDF file.

    Two independent passes are made over the document:

    * ``extract_annotation_highlights`` reads annotation dictionaries via
      pdfplumber.
    * ``extract_background_highlights`` reads ``Highlight`` annotations via
      PyMuPDF (``fitz``), recovers the covered text and tries to complete
      words that were cut at the edges of the highlight rectangle.

    Every extracted item is a plain ``dict`` with the keys ``page``, ``text``,
    ``color``, ``type``, ``coordinates`` and ``y_position``.
    """

    # Annotation subtypes collected by extract_annotation_highlights().
    _ANNOTATION_SUBTYPES = (
        'Highlight', 'Squiggly', 'StrikeOut', 'Underline', 'FreeText', 'Text',
    )

    # Short function words that are never treated as truncated.
    _COMMON_WORDS = frozenset([
        'the', 'and', 'of', 'to', 'in', 'for', 'with',
        'a', 'an', 'is', 'are', 'was', 'were',
    ])

    # Two-letter endings that the heuristic treats as "more letters follow".
    _INCOMPLETE_ENDINGS = (
        'th', 'st', 'nd', 'rd', 'ch', 'sh', 'nt', 'mp', 'ck', 'ng',
    )

    # Suffixes that the heuristic treats as a finished word.
    _COMMON_ENDINGS = (
        'ed', 'ing', 'er', 'est', 'ly', 'ion', 'tion', 'ment', 'ness',
        'ful', 'less', 'able', 'ible',
    )

    def __init__(self, pdf_path):
        """Remember the PDF location; no file access happens here.

        Args:
            pdf_path: Path (``str`` or ``Path``) of the PDF to process.
        """
        self.pdf_path = Path(pdf_path)
        self.annotations = []
        self.highlights = []

    # ------------------------------------------------------------------
    # pdfplumber pass
    # ------------------------------------------------------------------
    def extract_annotation_highlights(self):
        """Extract ALL supported annotation types using pdfplumber.

        Returns:
            A list of item dicts (see the class docstring). Errors opening or
            reading the document are reported on stdout and yield whatever
            was collected so far (possibly an empty list).
        """
        annotations = []
        try:
            with pdfplumber.open(self.pdf_path) as pdf:
                print(f"📄 Processing annotations...")
                for page_num, page in enumerate(pdf.pages, 1):
                    page_annots = getattr(page, 'annots', None)
                    if not page_annots:
                        continue

                    page_annotations = 0
                    for annot in page_annots:
                        try:
                            entry = self._build_annotation_entry(page, page_num, annot)
                        except Exception as e:
                            # Best effort: one malformed annotation must not
                            # abort the page, but it should not vanish silently.
                            print(f" ⚠️ Page {page_num}: skipped annotation ({e})")
                            continue
                        if entry is not None:
                            annotations.append(entry)
                            page_annotations += 1

                    if page_annotations > 0:
                        print(f" ✅ Page {page_num}: Found {page_annotations} annotations")

                print(f" 📊 Total annotations: {len(annotations)}")
        except Exception as e:
            print(f"❌ Error reading annotations: {e}")

        return annotations

    def _build_annotation_entry(self, page, page_num, annot):
        """Turn one pdfplumber annotation dict into an item dict, or ``None``.

        ``None`` is returned when the subtype is not one of
        ``_ANNOTATION_SUBTYPES`` or when no text could be recovered.
        """
        # NOTE(review): recent pdfplumber versions appear to expose the raw PDF
        # entries (Subtype, Rect, C) under annot['data'] rather than as the
        # top-level 'subtype' / 'rect' / 'color' keys read here - confirm
        # against the installed pdfplumber version.
        annot_type = annot.get('subtype') or 'Unknown'
        if annot_type not in self._ANNOTATION_SUBTYPES:
            return None

        # A key that is present but None must behave like a missing key.
        rect = annot.get('rect') or []

        # Try multiple text extraction methods
        text = self._get_annotation_text(page, annot, rect)
        if not text or not text.strip():
            return None

        return {
            'page': page_num,
            'text': self._clean_text(text),
            'color': self._get_color_from_annot(annot),
            'type': f'annotation_{annot_type.lower()}',
            'coordinates': rect,
            'y_position': rect[1] if len(rect) >= 4 else 0,
        }

    @staticmethod
    def _annot_field(annot, key):
        """Return ``annot[key]`` as a stripped string; ``''`` if missing/None."""
        value = annot.get(key)
        if value is None:
            return ''
        if isinstance(value, bytes):
            value = value.decode('utf-8', errors='replace')
        return str(value).strip()

    def _get_annotation_text(self, page, annot, rect):
        """Try multiple methods to extract annotation text.

        Order of attempts: the annotation's ``contents`` entry, the page text
        inside ``rect`` (grown by one unit on each side), then the ``label``,
        ``title`` and ``subject`` entries.

        Args:
            page: pdfplumber page; only used when ``rect`` has four values.
            annot: Annotation dict.
            rect: Sequence ``(x0, y0, x1, y1)`` or an empty sequence.

        Returns:
            The first non-empty text found, stripped, or ``""``.
        """
        # Method 1: From annotation contents
        text = self._annot_field(annot, 'contents')
        if text:
            return text

        # Method 2: From rect area
        if rect and len(rect) == 4:
            try:
                x0, y0, x1, y1 = rect
                cropped = page.crop((x0 - 1, y0 - 1, x1 + 1, y1 + 1))
                text = cropped.extract_text()
                if text and text.strip():
                    return text.strip()
            except Exception:
                # Cropping can fail (e.g. a box outside the page); fall
                # through to the remaining methods.
                pass

        # Method 3: From annotation object properties
        for prop in ('label', 'title', 'subject'):
            text = self._annot_field(annot, prop)
            if text:
                return text

        return ""

    # ------------------------------------------------------------------
    # PyMuPDF pass
    # ------------------------------------------------------------------
    def extract_background_highlights(self):
        """Extract ``Highlight`` annotations with word completion via PyMuPDF.

        Returns:
            A list of item dicts with ``type == 'highlight'``. Errors opening
            or reading the document are reported on stdout.
        """
        highlights = []
        try:
            print(f"\n🎨 Processing highlights...")
            # The context manager closes the document even if a page fails.
            with fitz.open(str(self.pdf_path)) as doc:
                for page_index in range(doc.page_count):
                    page = doc[page_index]
                    page_num = page_index + 1
                    page_highlights = 0

                    # Get all text words on the page for word completion
                    # [(x0, y0, x1, y1, "word", block_no, line_no, word_no)]
                    all_words = page.get_text("words")

                    for annot in page.annots():
                        try:
                            entry = self._build_highlight_entry(
                                page, page_num, annot, all_words)
                        except Exception as e:
                            print(f" ⚠️ Page {page_num}: skipped highlight ({e})")
                            continue
                        if entry is not None:
                            highlights.append(entry)
                            page_highlights += 1

                    if page_highlights > 0:
                        print(f" ✅ Page {page_num}: Found {page_highlights} highlights")

            print(f" 📊 Total highlights: {len(highlights)}")
        except Exception as e:
            print(f"❌ Error reading highlights: {e}")

        return highlights

    def _build_highlight_entry(self, page, page_num, annot, all_words):
        """Turn one PyMuPDF annotation into a highlight dict, or ``None``.

        ``None`` is returned for non-highlight annotations, for highlights
        whose colour cannot be determined, and for highlights covering two
        characters of text or fewer.
        """
        if annot.type[1] != 'Highlight':
            return None

        # Get color information
        color_name = self._analyze_highlight_color(annot.colors)
        if color_name == 'unknown':
            return None

        # Extract text from highlighted area
        rect = annot.rect
        highlight_text = self._extract_text_from_rect_pymupdf(page, rect)
        if not highlight_text or len(highlight_text.strip()) <= 2:
            return None

        # Complete partial words at start and end
        completed_text = self._complete_partial_words(highlight_text, rect, all_words)

        return {
            'page': page_num,
            'text': self._clean_text(completed_text),
            'color': color_name,
            'type': 'highlight',
            'coordinates': list(rect),
            'y_position': rect.y0,
        }

    def _complete_partial_words(self, highlight_text, rect, all_words):
        """Complete partial words at the beginning and end of a highlight.

        Args:
            highlight_text: Text extracted from the highlight rectangle.
            rect: Highlight rectangle (anything ``fitz.Rect`` accepts).
            all_words: Page words as returned by ``page.get_text("words")``.

        Returns:
            ``highlight_text`` re-joined with single spaces, with the first
            and/or last word replaced when a longer matching word is found
            near the rectangle. The input is returned unchanged when it or
            ``all_words`` is empty.
        """
        if not highlight_text or not all_words:
            return highlight_text

        words = highlight_text.split()
        if not words:
            return highlight_text

        first_word = words[0]
        last_word = words[-1]

        # Search window: the highlight grown by 50 units horizontally and
        # 5 vertically. It does not depend on the word, so build it once.
        highlight_rect = fitz.Rect(rect)
        expanded_rect = fitz.Rect(
            highlight_rect.x0 - 50,  # Expand left
            highlight_rect.y0 - 5,   # Expand up
            highlight_rect.x1 + 50,  # Expand right
            highlight_rect.y1 + 5,   # Expand down
        )

        # Find words that intersect the expanded rectangle
        nearby_words = []
        for word_info in all_words:
            word_rect = fitz.Rect(word_info[:4])
            if word_rect.intersects(expanded_rect):
                nearby_words.append((word_rect, word_info[4]))

        # Sort by position (top to bottom, then left to right)
        nearby_words.sort(key=lambda x: (x[0].y0, x[0].x0))

        # Complete first word if it seems partial
        if len(first_word) >= 3 and self._is_likely_partial(first_word):
            completed_first = self._find_complete_word(first_word, nearby_words, 'start')
            if completed_first and completed_first != first_word:
                words[0] = completed_first
                print(f" 🔧 Completed first word: '{first_word}' → '{completed_first}'")

        # Complete last word if it seems partial
        if len(last_word) >= 3 and self._is_likely_partial(last_word):
            completed_last = self._find_complete_word(last_word, nearby_words, 'end')
            if completed_last and completed_last != last_word:
                words[-1] = completed_last
                print(f" 🔧 Completed last word: '{last_word}' → '{completed_last}'")

        return ' '.join(words)

    def _is_likely_partial(self, word):
        """Heuristically decide whether ``word`` looks truncated.

        A word is reported as partial when it has at least four characters,
        is not in ``_COMMON_WORDS`` and either ends with one of
        ``_INCOMPLETE_ENDINGS`` or does not end with one of
        ``_COMMON_ENDINGS``.

        NOTE: the incomplete endings are tested first, so words ending in
        'ing', 'est' or 'ment' (which also end in 'ng', 'st', 'nt') are
        reported as partial. This matches the long-standing behaviour.
        """
        if not word:
            return False

        lowered = word.lower()

        # If it's a common complete word, it's not partial
        if lowered in self._COMMON_WORDS:
            return False

        # Words shorter than four characters are never flagged.
        if len(word) < 4:
            return False

        # Consonant clusters that suggest more letters follow
        if lowered.endswith(self._INCOMPLETE_ENDINGS):
            return True

        # Otherwise partial unless it ends with a typical word ending
        return not lowered.endswith(self._COMMON_ENDINGS)

    def _find_complete_word(self, partial_word, nearby_words, position):
        """Find the complete word that contains the partial word.

        Args:
            partial_word: Possibly truncated word from the highlight text.
            nearby_words: Iterable of ``(rect, word)`` pairs.
            position: ``'start'`` if the fragment opens the highlight (it must
                be the tail of the full word) or ``'end'`` if it closes the
                highlight (it must be the head of the full word).

        Returns:
            The longest strictly longer matching word (case-insensitive
            match; first one wins on ties), or ``partial_word`` if none.
        """
        partial_lower = partial_word.lower()

        if position == 'start':
            matches = str.endswith
        elif position == 'end':
            matches = str.startswith
        else:
            return partial_word

        candidates = [
            full_word
            for _word_rect, full_word in nearby_words
            if len(full_word) > len(partial_word)
            and matches(full_word.lower(), partial_lower)
        ]

        # Return the longest candidate (most likely to be the complete word)
        if candidates:
            return max(candidates, key=len)

        return partial_word

    def _extract_text_from_rect_pymupdf(self, page, rect):
        """Extract text from a rectangle using several PyMuPDF methods.

        Tries clipped plain-text extraction, then ``get_textbox``, then the
        span texts found in a rectangle grown by two units per side.

        Returns:
            The extracted text, or ``""`` if any PyMuPDF call raises.
        """
        try:
            # Method 1: Direct text extraction
            text = page.get_text("text", clip=rect)
            if text and text.strip():
                return text.strip()

            # Method 2: Textbox method
            text = page.get_textbox(rect)
            if text and text.strip():
                return text.strip()

            # Method 3: Expanded rectangle
            expanded_rect = fitz.Rect(rect.x0 - 2, rect.y0 - 2, rect.x1 + 2, rect.y1 + 2)
            text_dict = page.get_text("dict", clip=expanded_rect)

            text_parts = []
            for block in text_dict.get("blocks", []):
                if "lines" not in block:
                    continue
                for line in block["lines"]:
                    for span in line["spans"]:
                        if span["text"].strip():
                            text_parts.append(span["text"])

            return " ".join(text_parts)
        except Exception:
            return ""

    # ------------------------------------------------------------------
    # Colour helpers
    # ------------------------------------------------------------------
    def _analyze_highlight_color(self, colors):
        """Name the colour of a PyMuPDF ``annot.colors`` mapping.

        The ``fill`` entry is preferred; ``stroke`` is used when ``fill`` is
        missing or empty. Returns ``'unknown'`` when neither is available.
        """
        if not colors:
            return 'unknown'

        # Check fill color first (highlight background)
        fill = colors.get('fill')
        if fill:
            return self._rgb_to_color_name(fill)

        stroke = colors.get('stroke')
        if stroke:
            return self._rgb_to_color_name(stroke)

        return 'unknown'

    def _get_color_from_annot(self, annot):
        """Get a colour name from a pdfplumber annotation dict.

        Returns ``'unknown'`` when the ``color`` entry is missing, empty or
        cannot be interpreted.
        """
        try:
            color = annot.get('color', [])
            if color:
                return self._rgb_to_color_name(color)
        except Exception:
            pass
        return 'unknown'

    def _rgb_to_color_name(self, rgb):
        """Convert RGB components to a colour name.

        Args:
            rgb: Sequence whose first three values are the red, green and
                blue components. The thresholds below assume a 0..1 range.

        Returns:
            One of ``'yellow'``, ``'green'``, ``'blue'``, ``'pink'``,
            ``'orange'``, ``'red'``, ``'cyan'``; ``'unknown'`` for fewer than
            three components; otherwise an ``'rgb(r,g,b)'`` string with two
            decimals. The tests run in order and the first match wins.
        """
        if not rgb or len(rgb) < 3:
            return 'unknown'

        r, g, b = rgb[:3]

        if r > 0.7 and g > 0.7 and b < 0.6:
            return 'yellow'
        if r < 0.6 and g > 0.7 and b < 0.6:
            return 'green'
        if r < 0.6 and g < 0.8 and b > 0.7:
            return 'blue'
        if r > 0.7 and g < 0.6 and b > 0.7:
            return 'pink'
        if r > 0.8 and g > 0.5 and b < 0.5:
            return 'orange'
        if r > 0.7 and g < 0.5 and b < 0.5:
            return 'red'
        if r < 0.5 and g > 0.7 and b > 0.7:
            return 'cyan'
        return f'rgb({r:.2f},{g:.2f},{b:.2f})'

    # ------------------------------------------------------------------
    # Text helpers
    # ------------------------------------------------------------------
    def _clean_text(self, text):
        """Clean and normalize text.

        Collapses whitespace runs to one space, re-joins words split by a
        line-break hyphen (``"exam- ple"`` -> ``"example"``) and removes
        whitespace before ``. , ; : ! ?``. A dash surrounded by spaces
        (``"A - B"``) is kept.

        Returns:
            The cleaned string; ``""`` for falsy input; ``str(text)`` when
            ``text`` is not a string.
        """
        if not text:
            return ""

        try:
            # Remove extra whitespace and normalize
            text = re.sub(r'\s+', ' ', text.strip())
            # Remove line break hyphens: only a hyphen that sits between two
            # word characters, so free-standing dashes survive.
            text = re.sub(r'(?<=\w)-\s+(?=\w)', '', text)
            # Fix punctuation spacing
            text = re.sub(r'\s+([.,;:!?])', r'\1', text)
            return text
        except (TypeError, AttributeError):
            # Non-string input: fall back to its string form.
            return str(text) if text else ""

    def _smart_deduplicate(self, items):
        """Drop highlights that duplicate or are contained in another one.

        Two items are compared only when they share page and colour and
        their ``y_position`` values differ by less than 10. Within such a
        pair the item is a duplicate if either text contains the other (the
        longer text is kept) or the word-set similarity exceeds 0.9.

        The input list and its dicts are left untouched; the result holds
        shallow copies, ordered by page, ``y_position`` and text length.

        Args:
            items: List of item dicts.

        Returns:
            A new list of de-duplicated item dicts (``items`` itself when it
            is empty or ``None``).
        """
        if not items:
            return items

        # Sort by page and position (a sorted copy; the caller's list keeps
        # its order).
        ordered = sorted(
            items, key=lambda x: (x['page'], x['y_position'], len(x['text'])))

        unique_items = []
        for item in ordered:
            item_text = item['text'].lower().strip()
            is_duplicate = False

            for existing in unique_items:
                same_spot = (
                    item['page'] == existing['page']
                    and item['color'] == existing['color']
                    and abs(item['y_position'] - existing['y_position']) < 10
                )
                if not same_spot:
                    continue

                existing_text = existing['text'].lower().strip()

                # If one is substring of another, keep the longer one
                if item_text in existing_text:
                    is_duplicate = True
                elif existing_text in item_text:
                    # Only the text is replaced; position fields stay those
                    # of the first item seen.
                    existing['text'] = item['text']
                    is_duplicate = True
                # If very similar (90% overlap), it's a duplicate
                elif self._text_similarity(item_text, existing_text) > 0.9:
                    is_duplicate = True

                if is_duplicate:
                    break

            if not is_duplicate:
                # Copy so that later text replacement cannot alter the input.
                unique_items.append(dict(item))

        return unique_items

    def _text_similarity(self, text1, text2):
        """Return the Jaccard similarity of the two texts' word sets.

        Returns 0 when either text is empty or has no words.
        """
        if not text1 or not text2:
            return 0

        # Simple word-based similarity
        words1 = set(text1.split())
        words2 = set(text2.split())
        if not words1 or not words2:
            return 0

        union = len(words1 | words2)
        return len(words1 & words2) / union if union > 0 else 0

    # ------------------------------------------------------------------
    # Orchestration and output
    # ------------------------------------------------------------------
    def extract_all_highlights(self):
        """Run both extraction passes and de-duplicate the highlights.

        Stores the results on ``self.annotations`` / ``self.highlights``.

        Returns:
            ``(annotations, highlights)``.
        """
        print("🔍 PDF Highlight & Annotation Extractor")
        print("=" * 50)

        # Extract annotations
        self.annotations = self.extract_annotation_highlights()

        # Extract highlights
        self.highlights = self.extract_background_highlights()

        # Smart deduplication
        self.highlights = self._smart_deduplicate(self.highlights)

        print(f"\n✨ Processing complete!")
        print(f" 📝 Annotations: {len(self.annotations)}")
        print(f" 🎨 Highlights: {len(self.highlights)}")

        return self.annotations, self.highlights

    def sort_by_position(self, items):
        """Return ``items`` sorted by page, then by ``y_position``."""
        return sorted(items, key=lambda x: (x['page'], x['y_position']))

    def save_to_json(self, annotations, highlights, output_path):
        """Write both item lists plus a summary to ``output_path`` as JSON.

        The summary holds the item counts and the distinct colours of each
        list (sorted so the output is reproducible).
        """
        data = {
            'annotations': annotations,
            'highlights': highlights,
            'summary': {
                'total_annotations': len(annotations),
                'total_highlights': len(highlights),
                'annotation_colors': sorted({a['color'] for a in annotations}),
                'highlight_colors': sorted({h['color'] for h in highlights}),
            },
        }
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        print(f"💾 Saved to {output_path}")

    def save_to_csv(self, annotations, highlights, output_path):
        """Write both item lists to one CSV file.

        Each row is a copy of an item dict with an extra ``category`` column
        set to ``'annotation'`` or ``'highlight'``.
        """
        all_items = [{**item, 'category': 'annotation'} for item in annotations]
        all_items.extend({**item, 'category': 'highlight'} for item in highlights)

        df = pd.DataFrame(all_items)
        df.to_csv(output_path, index=False, encoding='utf-8')
        print(f"📊 Saved to {output_path}")

    def display_results(self):
        """Print ``self.annotations`` and ``self.highlights`` to the terminal."""
        print("\n" + "=" * 60)
        print("📋 EXTRACTION RESULTS")
        print("=" * 60)

        # Display Annotations
        self._print_section("📝 ANNOTATIONS", self.annotations, show_type=True)

        # Display Highlights
        self._print_section("🎨 BACKGROUND HIGHLIGHTS", self.highlights, show_type=False)

        print("\n" + "=" * 60)

    def _print_section(self, title, items, show_type):
        """Print one result section, sorted by position on the page."""
        if not items:
            print(f"\n{title}: None found")
            return

        sorted_items = self.sort_by_position(items)
        print(f"\n{title} ({len(sorted_items)} items)")
        print("-" * 40)

        for i, item in enumerate(sorted_items, 1):
            color_code = self._get_color_code(item['color'])
            print(f"\n{i:2d}. Page {item['page']} | {color_code}{item['color'].upper()}{Style.RESET_ALL}")
            if show_type:
                print(f" Type: {item['type']}")
            print(f" Text: \"{item['text']}\"")

    def _get_color_code(self, color_name):
        """Return the colorama escape sequence used to print ``color_name``.

        Names without an entry (including ``'rgb(...)'`` strings) fall back
        to black on white.
        """
        color_map = {
            'yellow': Back.YELLOW + Fore.BLACK,
            'green': Back.GREEN + Fore.BLACK,
            'blue': Back.BLUE + Fore.WHITE,
            'red': Back.RED + Fore.WHITE,
            'pink': Back.MAGENTA + Fore.WHITE,
            'orange': Back.YELLOW + Fore.RED,
            'cyan': Back.CYAN + Fore.BLACK,
            'unknown': Back.WHITE + Fore.BLACK,
        }
        return color_map.get(color_name, Back.WHITE + Fore.BLACK)


def main():
    """Interactive entry point: prompt for paths, extract, display and save.

    Prompts for the PDF path and for optional JSON / CSV output paths.
    Surrounding whitespace and quotes are removed from every answer, since
    paths that are dragged into a terminal or pasted from a file manager
    often carry them. Returns early when the PDF path is not an existing
    file.
    """
    print("🎨 PDF Highlight & Annotation Extractor")
    print("🚀 Enhanced with smart word completion and deduplication")
    print()

    def _ask_path(prompt):
        # Strip whitespace first so quotes at the very ends are reachable.
        return input(prompt).strip().strip('"\'').strip()

    # Get PDF file path
    pdf_path = _ask_path("📄 Enter PDF file path: ")

    # An empty answer would resolve to the current directory, which exists;
    # require a regular file instead.
    if not pdf_path or not Path(pdf_path).is_file():
        print("❌ File not found!")
        return

    # Get output options
    print("\n📤 Output Options:")
    output_json = _ask_path("💾 JSON file (or Enter to skip): ")
    output_csv = _ask_path("📊 CSV file (or Enter to skip): ")

    # Process PDF
    extractor = PDFHighlightExtractor(pdf_path)
    annotations, highlights = extractor.extract_all_highlights()

    # Display results
    extractor.display_results()

    # Save results
    if output_json:
        extractor.save_to_json(annotations, highlights, output_json)
    if output_csv:
        extractor.save_to_csv(annotations, highlights, output_csv)


# Run the interactive extractor only when executed as a script.
if __name__ == '__main__':
    main()