# hilitehero/main.py

import pdfplumber
import fitz  # PyMuPDF
import json
from colorama import init, Fore, Back, Style
import pandas as pd
from pathlib import Path
import re

# Initialize colorama for colored terminal output. autoreset=True is passed so
# that styling is reset after each print instead of leaking into later output.
init(autoreset=True)

class PDFHighlightExtractor:
    def __init__(self, pdf_path):
        self.pdf_path = Path(pdf_path)
        self.annotations = []
        self.highlights = []

    def extract_annotation_highlights(self):
        """Collect text-bearing annotations from every page using pdfplumber.

        Only the subtypes Highlight, Squiggly, StrikeOut, Underline, FreeText
        and Text are kept, and only when some text could be recovered for
        them. A failure on a single annotation is reported and that
        annotation is skipped; a failure opening or reading the PDF is
        reported and ends the scan with whatever was collected so far.

        Returns:
            list[dict]: One dict per annotation with the keys ``page``
            (1-based), ``text``, ``color``, ``type``
            (``annotation_<subtype>``), ``coordinates`` and ``y_position``.
        """
        wanted_types = {'Highlight', 'Squiggly', 'StrikeOut', 'Underline', 'FreeText', 'Text'}
        annotations = []
        try:
            with pdfplumber.open(self.pdf_path) as pdf:
                print("📄 Processing annotations...")
                for page_num, page in enumerate(pdf.pages, 1):
                    page_annotations = 0
                    for annot in getattr(page, 'annots', None) or []:
                        try:
                            entry = self._build_annotation_entry(page, annot, page_num, wanted_types)
                        except Exception as e:
                            # Best effort: one malformed annotation must not
                            # abort the rest of the page, but say that it happened.
                            print(f"  ⚠️ Page {page_num}: skipped an annotation ({e})")
                            continue

                        if entry is not None:
                            annotations.append(entry)
                            page_annotations += 1

                    if page_annotations > 0:
                        print(f"  ✅ Page {page_num}: Found {page_annotations} annotations")

            print(f"  📊 Total annotations: {len(annotations)}")
        except Exception as e:
            print(f"❌ Error reading annotations: {e}")

        return annotations

    def _build_annotation_entry(self, page, annot, page_num, wanted_types):
        """Turn one annotation dict into a result dict, or None to skip it."""
        annot_type = self._annotation_subtype(annot)
        if annot_type not in wanted_types:
            return None

        rect = self._annotation_rect(annot)
        text = self._get_annotation_text(page, annot, rect)
        if not text or not text.strip():
            return None

        return {
            'page': page_num,
            'text': self._clean_text(text),
            'color': self._get_color_from_annot(annot),
            'type': f'annotation_{annot_type.lower()}',
            'coordinates': rect,
            'y_position': rect[1] if len(rect) >= 4 else 0,
        }

    @staticmethod
    def _annotation_subtype(annot):
        """Return the annotation subtype as a plain string ('Unknown' if absent).

        A flat ``subtype`` key is honoured when present. pdfplumber itself
        keeps the subtype in the raw PDF dictionary under ``data['Subtype']``
        as a name object, so that is used as the fallback.
        """
        subtype = annot.get('subtype')
        if subtype is None:
            subtype = (annot.get('data') or {}).get('Subtype')
        if subtype is None:
            return 'Unknown'

        # PDF name objects carry their text in a ``name`` attribute.
        name = getattr(subtype, 'name', subtype)
        if isinstance(name, bytes):
            name = name.decode('latin-1')
        return str(name)

    @staticmethod
    def _annotation_rect(annot):
        """Return the annotation box as [x0, top, x1, bottom], or [] if unknown.

        A flat ``rect`` key is honoured when present; otherwise the box is
        assembled from pdfplumber's ``x0``/``top``/``x1``/``bottom`` keys,
        which is the same order ``page.crop`` expects.
        """
        rect = annot.get('rect')
        if rect:
            return rect

        keys = ('x0', 'top', 'x1', 'bottom')
        if all(annot.get(key) is not None for key in keys):
            return [annot[key] for key in keys]
        return []

    def _get_annotation_text(self, page, annot, rect):
        """Return the best available text for an annotation, or "" if none.

        Sources are tried in this order and the first non-empty one wins:
        the annotation's ``contents``, the page text under ``rect``
        (expanded by one unit on every side), then the ``label``, ``title``
        and ``subject`` properties.

        Args:
            page: Page object offering ``crop(bbox).extract_text()``.
            annot: Mapping describing the annotation.
            rect: Four-element box used for the crop; ignored otherwise.

        Returns:
            str: Stripped text, or an empty string.
        """
        def as_text(value):
            # Keys may be present with a None value, so a .get() default is
            # not enough to guarantee a string here.
            if value is None:
                return ""
            if isinstance(value, bytes):
                value = value.decode('utf-8', errors='replace')
            return str(value).strip()

        # Method 1: From annotation contents
        text = as_text(annot.get('contents'))
        if text:
            return text

        # Method 2: From rect area
        if rect and len(rect) == 4:
            try:
                x0, y0, x1, y1 = rect
                cropped = page.crop((x0 - 1, y0 - 1, x1 + 1, y1 + 1))
                text = as_text(cropped.extract_text())
                if text:
                    return text
            except Exception:
                # Best effort: cropping can fail (e.g. box outside the page);
                # fall through to the remaining sources.
                pass

        # Method 3: From annotation object properties
        for prop in ('label', 'title', 'subject'):
            text = as_text(annot.get(prop))
            if text:
                return text

        return ""

    def extract_background_highlights(self):
        """Collect highlight annotations and the text under them using PyMuPDF.

        For every ``Highlight`` annotation whose colour can be named, the
        text inside its rectangle is extracted, partial first/last words are
        completed from the page's word list, and the text is normalised.
        Highlights with two or fewer characters of text are ignored. A
        failure on one annotation is reported and skipped; a failure opening
        or reading the document is reported and ends the scan.

        Returns:
            list[dict]: One dict per highlight with the keys ``page``
            (1-based), ``text``, ``color``, ``type`` (always ``highlight``),
            ``coordinates`` and ``y_position``.
        """
        highlights = []
        try:
            print("\n🎨 Processing highlights...")
            doc = fitz.open(str(self.pdf_path))
            try:
                for page_index in range(doc.page_count):
                    page = doc[page_index]
                    page_num = page_index + 1
                    page_highlights = 0

                    # All words on the page, used for word completion:
                    # [(x0, y0, x1, y1, "word", block_no, line_no, word_no)]
                    all_words = page.get_text("words")

                    for annot in page.annots():
                        try:
                            entry = self._build_highlight_entry(page, annot, page_num, all_words)
                        except Exception as e:
                            # Best effort: keep going, but do not hide the failure.
                            print(f"  ⚠️ Page {page_num}: skipped a highlight ({e})")
                            continue

                        if entry is not None:
                            highlights.append(entry)
                            page_highlights += 1

                    if page_highlights > 0:
                        print(f"  ✅ Page {page_num}: Found {page_highlights} highlights")
            finally:
                # Release the document even when a page fails mid-scan.
                doc.close()

            print(f"  📊 Total highlights: {len(highlights)}")
        except Exception as e:
            print(f"❌ Error reading highlights: {e}")

        return highlights

    def _build_highlight_entry(self, page, annot, page_num, all_words):
        """Turn one PyMuPDF annotation into a highlight dict, or None to skip it."""
        if annot.type[1] != 'Highlight':
            return None

        color_name = self._analyze_highlight_color(annot.colors)
        if color_name == 'unknown':
            return None

        rect = annot.rect
        highlight_text = self._extract_text_from_rect_pymupdf(page, rect)
        if not highlight_text or len(highlight_text.strip()) <= 2:
            return None

        # Complete partial words at start and end before normalising
        completed_text = self._complete_partial_words(highlight_text, rect, all_words)
        return {
            'page': page_num,
            'text': self._clean_text(completed_text),
            'color': color_name,
            'type': 'highlight',
            'coordinates': list(rect),
            'y_position': rect.y0,
        }

    def _complete_partial_words(self, highlight_text, rect, all_words):
        """Complete partial words at the beginning and end of a highlight.

        Words from ``all_words`` that intersect the highlight rectangle
        grown by 50 units horizontally and 5 units vertically are treated as
        candidates. The first and last word of the highlight are replaced by
        a longer candidate when they are at least three characters long and
        look truncated.

        Args:
            highlight_text: Text extracted from the highlighted area.
            rect: Rectangle of the highlight (anything ``fitz.Rect`` accepts).
            all_words: Page words as tuples whose first four items are the
                word box and whose fifth item is the word text.

        Returns:
            str: The words re-joined with single spaces, or the input
            unchanged when there is nothing to work with.
        """
        if not highlight_text or not all_words:
            return highlight_text

        words = highlight_text.split()
        if not words:
            return highlight_text

        first_word = words[0]
        last_word = words[-1]

        # The search window is the same for every word, so build it once.
        highlight_rect = fitz.Rect(rect)
        expanded_rect = fitz.Rect(
            highlight_rect.x0 - 50,  # Expand left
            highlight_rect.y0 - 5,   # Expand up
            highlight_rect.x1 + 50,  # Expand right
            highlight_rect.y1 + 5    # Expand down
        )

        nearby_words = []
        for word_info in all_words:
            word_rect = fitz.Rect(word_info[:4])
            if word_rect.intersects(expanded_rect):
                nearby_words.append((word_rect, word_info[4]))

        # Sort by position (top to bottom, then left to right)
        nearby_words.sort(key=lambda x: (x[0].y0, x[0].x0))

        # Complete first word if it seems partial
        if len(first_word) >= 3 and self._is_likely_partial(first_word):
            completed_first = self._find_complete_word(first_word, nearby_words, 'start')
            if completed_first and completed_first != first_word:
                words[0] = completed_first
                print(f"    🔧 Completed first word: '{first_word}' → '{completed_first}'")

        # Complete last word if it seems partial
        if len(last_word) >= 3 and self._is_likely_partial(last_word):
            completed_last = self._find_complete_word(last_word, nearby_words, 'end')
            if completed_last and completed_last != last_word:
                words[-1] = completed_last
                print(f"    🔧 Completed last word: '{last_word}' → '{completed_last}'")

        return ' '.join(words)

    def _is_likely_partial(self, word):
        """Heuristically decide whether *word* looks like a truncated fragment.

        A word is considered partial when it is at least four characters
        long, is not one of a few very common short words, and does not end
        in a typical complete-word suffix (``ed``, ``ing``, ``ment``, ...).

        The complete-word suffixes are checked before anything else so that
        endings such as ``ing``, ``est`` and ``ment`` are not overridden by
        the shorter clusters ``ng``, ``st`` and ``nt`` they contain.

        Args:
            word: Single word taken from the edge of a highlight.

        Returns:
            bool: True when the word is probably incomplete.
        """
        if not word:
            return False

        lowered = word.lower()

        # Common short words are complete even though they are short.
        complete_words = {
            'the', 'and', 'of', 'to', 'in', 'for', 'with',
            'a', 'an', 'is', 'are', 'was', 'were',
        }
        if lowered in complete_words:
            return False

        # Anything shorter than four characters is too short to judge.
        if len(word) < 4:
            return False

        complete_suffixes = (
            'ed', 'ing', 'er', 'est', 'ly', 'ion', 'tion',
            'ment', 'ness', 'ful', 'less', 'able', 'ible',
        )
        return not lowered.endswith(complete_suffixes)

    def _find_complete_word(self, partial_word, nearby_words, position):
        """Pick the longest nearby word that extends *partial_word*.

        Matching is case-insensitive. For ``position == 'start'`` the
        fragment must be the tail of the candidate; for ``'end'`` it must be
        the head. Any other position matches nothing.

        Args:
            partial_word: Fragment to complete.
            nearby_words: Iterable of ``(rect, word)`` pairs.
            position: ``'start'`` or ``'end'``.

        Returns:
            str: The longest strictly longer match (the earliest one on a
            tie), or ``partial_word`` itself when nothing matches.
        """
        fragment = partial_word.lower()
        fragment_len = len(partial_word)

        if position == 'start':
            extends = str.endswith
        elif position == 'end':
            extends = str.startswith
        else:
            extends = None

        best = None
        for _rect, candidate in nearby_words:
            lowered = candidate.lower()
            if extends is None or not extends(lowered, fragment):
                continue
            if len(candidate) <= fragment_len:
                continue
            # Strict comparison keeps the earliest candidate among equals.
            if best is None or len(candidate) > len(best):
                best = candidate

        return partial_word if best is None else best

    def _extract_text_from_rect_pymupdf(self, page, rect):
        """Extract the text inside *rect*, trying progressively looser methods.

        The methods are tried in order and the first non-empty result wins:
        clipped plain-text extraction, ``get_textbox``, then a span-by-span
        walk over the rectangle grown by two units on every side.

        Args:
            page: PyMuPDF page (or any object with the same methods).
            rect: Rectangle to read; needs ``x0``/``y0``/``x1``/``y1`` for
                the last method.

        Returns:
            str: The extracted text, or "" when extraction fails.
        """
        try:
            # Method 1: Direct text extraction
            text = page.get_text("text", clip=rect)
            if text and text.strip():
                return text.strip()

            # Method 2: Textbox method
            text = page.get_textbox(rect)
            if text and text.strip():
                return text.strip()

            # Method 3: Expanded rectangle
            expanded_rect = fitz.Rect(rect.x0 - 2, rect.y0 - 2, rect.x1 + 2, rect.y1 + 2)
            text_dict = page.get_text("dict", clip=expanded_rect)

            text_parts = [
                span["text"]
                for block in text_dict.get("blocks", [])
                if "lines" in block
                for line in block["lines"]
                for span in line["spans"]
                if span["text"].strip()
            ]
            return " ".join(text_parts)
        except Exception:
            # Best effort: extraction problems yield no text, but
            # KeyboardInterrupt / SystemExit are allowed to propagate.
            return ""

    def _analyze_highlight_color(self, colors):
        """Name the colour of a highlight from its colour mapping.

        The ``fill`` entry (highlight background) takes precedence over
        ``stroke``; the first of the two that is present and non-empty is
        converted to a colour name.

        Args:
            colors: Mapping that may contain ``fill`` and/or ``stroke``.

        Returns:
            str: A colour name, or ``'unknown'`` when no colour is available.
        """
        if not colors:
            return 'unknown'

        for channel in ('fill', 'stroke'):
            if channel in colors and colors[channel]:
                return self._rgb_to_color_name(colors[channel])

        return 'unknown'

    def _get_color_from_annot(self, annot):
        """Name the colour of a pdfplumber annotation.

        A flat ``color`` key is used when present. Otherwise the colour is
        read from the ``C`` entry of the raw annotation dictionary that
        pdfplumber stores under ``data``.

        Args:
            annot: Mapping describing the annotation.

        Returns:
            str: A colour name, or ``'unknown'`` when no usable colour is
            found.
        """
        try:
            color = annot.get('color')
            if not color:
                color = (annot.get('data') or {}).get('C')
            if color:
                return self._rgb_to_color_name(color)
        except (AttributeError, TypeError, ValueError):
            # Not a mapping, or colour components that are not numbers.
            pass
        return 'unknown'

    def _rgb_to_color_name(self, rgb):
        """Map an RGB triple (components compared against 0..1 thresholds) to a name.

        Only the first three components are looked at. The rules are tried
        top to bottom and the first match wins; colours matching no rule are
        reported as an ``rgb(r,g,b)`` string with two decimals.

        Args:
            rgb: Sequence with at least three numeric components.

        Returns:
            str: Colour name, ``rgb(...)`` string, or ``'unknown'`` when
            fewer than three components are given.
        """
        if not rgb or len(rgb) < 3:
            return 'unknown'

        r, g, b = rgb[:3]

        # Ordered rule table: earlier entries shadow later ones.
        rules = (
            ('yellow', r > 0.7 and g > 0.7 and b < 0.6),
            ('green', r < 0.6 and g > 0.7 and b < 0.6),
            ('blue', r < 0.6 and g < 0.8 and b > 0.7),
            ('pink', r > 0.7 and g < 0.6 and b > 0.7),
            ('orange', r > 0.8 and g > 0.5 and b < 0.5),
            ('red', r > 0.7 and g < 0.5 and b < 0.5),
            ('cyan', r < 0.5 and g > 0.7 and b > 0.7),
        )
        for name, matched in rules:
            if matched:
                return name

        return f'rgb({r:.2f},{g:.2f},{b:.2f})'

    def _clean_text(self, text):
        """Clean and normalize extracted text.

        Runs of whitespace collapse to one space, words split by a hyphen
        at a line break are re-joined, and whitespace before ``. , ; : ! ?``
        is removed. A hyphen is only treated as a line-break hyphen when it
        sits directly after a word character and is followed by whitespace
        and another word character, so free-standing dashes such as
        ``3 - 5`` are preserved.

        Args:
            text: Text to clean; falsy values give "".

        Returns:
            str: The cleaned text; for non-string input, ``str(text)``.
        """
        if not text:
            return ""

        try:
            # Remove extra whitespace and normalize
            text = re.sub(r'\s+', ' ', text.strip())
            # Re-join words broken by a line-break hyphen ("exam- ple")
            text = re.sub(r'(?<=\w)-\s+(?=\w)', '', text)
            # Fix punctuation spacing
            text = re.sub(r'\s+([.,;:!?])', r'\1', text)
            return text
        except (AttributeError, TypeError):
            # Non-string input: fall back to its string form.
            return str(text)

    def _smart_deduplicate(self, items):
        """Merge highlights that cover the same text at the same place.

        Two items are compared only when they share page and colour and
        their ``y_position`` values differ by less than 10. Within such a
        pair, an item whose text is contained in the other's is dropped (the
        longer text is kept), and items whose word sets overlap by more than
        90% are treated as duplicates.

        The input list and its dicts are left untouched: sorting works on a
        copy and kept items are shallow copies.

        Args:
            items: List of dicts with ``page``, ``y_position``, ``text`` and
                ``color`` keys.

        Returns:
            list[dict]: De-duplicated items ordered by page, vertical
            position and text length. An empty input is returned as is.
        """
        if not items:
            return items

        # Sort by page and position (on a copy, not the caller's list)
        ordered = sorted(items, key=lambda x: (x['page'], x['y_position'], len(x['text'])))

        unique_items = []
        for item in ordered:
            item_text = item['text'].lower().strip()

            for existing in unique_items:
                same_spot = (
                    item['page'] == existing['page']
                    and item['color'] == existing['color']
                    and abs(item['y_position'] - existing['y_position']) < 10
                )
                if not same_spot:
                    continue

                existing_text = existing['text'].lower().strip()

                # If one is a substring of the other, keep the longer one
                if item_text in existing_text:
                    break
                if existing_text in item_text:
                    existing['text'] = item['text']
                    break
                # If very similar (90% overlap), it's a duplicate
                if self._text_similarity(item_text, existing_text) > 0.9:
                    break
            else:
                # No existing entry absorbed this item. Copy it so a later
                # text replacement cannot alter the caller's dict.
                unique_items.append(dict(item))

        return unique_items

    def _text_similarity(self, text1, text2):
        """Return the word-level overlap of two texts.

        The score is the number of distinct whitespace-separated words the
        texts share divided by the number of distinct words in either.

        Args:
            text1: First text.
            text2: Second text.

        Returns:
            A ratio between 0 and 1; 0 when either text is empty or has no
            words.
        """
        if not (text1 and text2):
            return 0

        left = set(text1.split())
        right = set(text2.split())
        if not (left and right):
            return 0

        shared = left & right
        combined = left | right
        return len(shared) / len(combined)

    def extract_all_highlights(self):
        """Extract and process all highlights and annotations."""
        print("🔍 PDF Highlight & Annotation Extractor")
        print("=" * 50)

        # Extract annotations
        self.annotations = self.extract_annotation_highlights()

        # Extract highlights
        self.highlights = self.extract_background_highlights()

        # Smart deduplication
        self.highlights = self._smart_deduplicate(self.highlights)

        print(f"\n✨ Processing complete!")
        print(f"   📝 Annotations: {len(self.annotations)}")
        print(f"   🎨 Highlights: {len(self.highlights)}")

        return self.annotations, self.highlights

    def sort_by_position(self, items):
        """Return a new list of *items* in reading order.

        Items are ordered by ``page`` first and ``y_position`` second; the
        input itself is not modified.
        """
        def reading_order(entry):
            return entry['page'], entry['y_position']

        ordered = list(items)
        ordered.sort(key=reading_order)
        return ordered

    def save_to_json(self, annotations, highlights, output_path):
        """Write the results plus a small summary to a UTF-8 JSON file.

        The summary holds the item counts and the distinct colours of each
        group. Colours are sorted so the file content is the same on every
        run for the same input.

        Args:
            annotations: List of annotation dicts (each with a ``color`` key).
            highlights: List of highlight dicts (each with a ``color`` key).
            output_path: Destination file; overwritten if it exists.
        """
        data = {
            'annotations': annotations,
            'highlights': highlights,
            'summary': {
                'total_annotations': len(annotations),
                'total_highlights': len(highlights),
                # sorted(): plain set iteration order varies between runs
                'annotation_colors': sorted({a['color'] for a in annotations}),
                'highlight_colors': sorted({h['color'] for h in highlights}),
            },
        }
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        print(f"💾 Saved to {output_path}")

    def save_to_csv(self, annotations, highlights, output_path):
        """Write annotations and highlights to one UTF-8 CSV file.

        Every row is a copy of the item with an extra ``category`` column
        set to ``annotation`` or ``highlight``; annotations come first. The
        input dicts are not modified.

        Args:
            annotations: List of annotation dicts.
            highlights: List of highlight dicts.
            output_path: Destination file; overwritten if it exists.
        """
        rows = [{**entry, 'category': 'annotation'} for entry in annotations]
        rows.extend({**entry, 'category': 'highlight'} for entry in highlights)

        table = pd.DataFrame(rows)
        table.to_csv(output_path, index=False, encoding='utf-8')
        print(f"📊 Saved to {output_path}")

    def display_results(self):
        """Print the stored annotations and highlights to the terminal.

        Each group is listed in reading order (page, then vertical
        position) with its colour name shown in a matching terminal colour.
        Annotations additionally show their type. An empty group is
        reported as "None found".
        """
        divider = "=" * 60

        print("\n" + divider)
        print("📋 EXTRACTION RESULTS")
        print(divider)

        # Annotations section
        if not self.annotations:
            print(f"\n📝 ANNOTATIONS: None found")
        else:
            ordered_notes = self.sort_by_position(self.annotations)
            print(f"\n📝 ANNOTATIONS ({len(ordered_notes)} items)")
            print("-" * 40)

            for number, note in enumerate(ordered_notes, start=1):
                tint = self._get_color_code(note['color'])
                print(f"\n{number:2d}. Page {note['page']} | {tint}{note['color'].upper()}{Style.RESET_ALL}")
                print(f"    Type: {note['type']}")
                print(f"    Text: \"{note['text']}\"")

        # Highlights section
        if not self.highlights:
            print(f"\n🎨 BACKGROUND HIGHLIGHTS: None found")
        else:
            ordered_marks = self.sort_by_position(self.highlights)
            print(f"\n🎨 BACKGROUND HIGHLIGHTS ({len(ordered_marks)} items)")
            print("-" * 40)

            for number, mark in enumerate(ordered_marks, start=1):
                tint = self._get_color_code(mark['color'])
                print(f"\n{number:2d}. Page {mark['page']} | {tint}{mark['color'].upper()}{Style.RESET_ALL}")
                print(f"    Text: \"{mark['text']}\"")

        print("\n" + divider)

    def _get_color_code(self, color_name):
        """Return the colorama background/foreground prefix for a colour name.

        Names without an entry of their own (including ``rgb(...)``
        strings) get black text on a white background.
        """
        dark_text = Fore.BLACK
        light_text = Fore.WHITE
        fallback = Back.WHITE + dark_text

        palette = {
            'yellow': Back.YELLOW + dark_text,
            'green': Back.GREEN + dark_text,
            'blue': Back.BLUE + light_text,
            'red': Back.RED + light_text,
            'pink': Back.MAGENTA + light_text,
            'orange': Back.YELLOW + Fore.RED,
            'cyan': Back.CYAN + dark_text,
            'unknown': fallback,
        }
        return palette.get(color_name, fallback)


def main():
    """Run the interactive command-line workflow.

    Asks for the PDF path and optional JSON/CSV output paths, extracts
    annotations and highlights, prints them, and writes the requested
    files. Stops with a message when the PDF path is empty or does not
    point to a file.
    """
    def ask_path(prompt):
        # Pasted or dragged-in paths often come wrapped in spaces or quotes.
        return input(prompt).strip().strip('"\'').strip()

    print("🎨 PDF Highlight & Annotation Extractor")
    print("🚀 Enhanced with smart word completion and deduplication")
    print()

    # Get PDF file path
    pdf_path = ask_path("📄 Enter PDF file path: ")

    # An empty answer has to be rejected explicitly: Path("") is the current
    # directory, which exists. is_file() also rules out directories.
    if not pdf_path or not Path(pdf_path).is_file():
        print("❌ File not found!")
        return

    # Get output options
    print("\n📤 Output Options:")
    output_json = ask_path("💾 JSON file (or Enter to skip): ")
    output_csv = ask_path("📊 CSV file (or Enter to skip): ")

    # Process PDF
    extractor = PDFHighlightExtractor(pdf_path)
    annotations, highlights = extractor.extract_all_highlights()

    # Display results
    extractor.display_results()

    # Save results
    if output_json:
        extractor.save_to_json(annotations, highlights, output_json)
    if output_csv:
        extractor.save_to_csv(annotations, highlights, output_csv)


# Start the interactive CLI only when this file is run as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    main()