Refactor main.py for improved readability and functionality; update README for installation instructions and usage examples; modify requirements.txt for dependency versioning; update test PDFs.

This commit is contained in:
ilia 2025-09-26 11:32:33 -04:00
parent d28005e541
commit 41e63da3d4
6 changed files with 1898 additions and 769 deletions

261
Makefile Normal file
View File

@ -0,0 +1,261 @@
# HiLiteHero - PDF Highlight Extractor Makefile
# Description: Makefile for easy development, testing, and deployment
# Variables
# --- System tools ----------------------------------------------------------
# Interpreter and installer used when no virtual environment is involved.
PYTHON       := python3
PIP          := pip3

# --- Virtual environment layout --------------------------------------------
# All venv paths are derived from the single VENV directory name.
VENV         := venv
VENV_BIN     := $(VENV)/bin
VENV_PYTHON  := $(VENV_BIN)/python
VENV_PIP     := $(VENV_BIN)/pip

# --- Project files ---------------------------------------------------------
MAIN_SCRIPT  := main.py
TEST_PDF     := test/test2.pdf
REQUIREMENTS := requirements.txt
# Colors for output
# ANSI color sequences used in the recipes' status messages.
#
# ESC holds a real escape byte, produced once at parse time by printf.
# Storing the literal text "\033" would only turn into a color when the
# recipe shell's `echo` interprets backslash escapes, which is
# implementation-defined (bash running as /bin/sh prints it verbatim).
ESC := $(shell printf '\033')
RED := $(ESC)[0;31m
GREEN := $(ESC)[0;32m
YELLOW := $(ESC)[0;33m
BLUE := $(ESC)[0;34m
PURPLE := $(ESC)[0;35m
CYAN := $(ESC)[0;36m
WHITE := $(ESC)[0;37m
# Reset / "no color". The comment sits on its own line on purpose: a trailing
# "# ..." on the assignment line would leave the space before it in the value.
NC := $(ESC)[0m
# Helper function to get the right Python executable
# Expands to the venv interpreter when that file exists on disk, otherwise to
# the system interpreter. Recursively expanded, so the check is made each time
# $(call get_python) is expanded.
get_python = $(or $(wildcard $(VENV_PYTHON)),$(PYTHON))
# Default target
# First rule in the file, so a bare `make` runs it.
# The awk script scans the parsed makefile(s) for "target: ... ## text" lines
# and prints the target name next to its description.
.PHONY: help
help: ## Show this help message
	@echo "$(CYAN)HiLiteHero - PDF Highlight Extractor$(NC)"
	@echo "$(YELLOW)Available targets:$(NC)"
	@awk 'BEGIN {FS = ":.*?## "} /^[a-zA-Z_-]+:.*?## / {printf " $(GREEN)%-15s$(NC) %s\n", $$1, $$2}' \
		$(MAKEFILE_LIST)
# Installation targets
# Thin alias: the actual work is done by the venv-install prerequisite.
.PHONY: install
install: venv-install ## Install dependencies (creates venv if needed)
	@echo "$(GREEN)Dependencies installed successfully!$(NC)"
# Installs the pinned requirements with the system pip ($(PIP)), bypassing
# the virtual environment. Always passes --break-system-packages.
.PHONY: install-system
install-system: ## Install dependencies system-wide (may require --break-system-packages)
	@echo "$(YELLOW)Warning: Installing system-wide packages$(NC)"
	@echo "$(BLUE)Installing dependencies...$(NC)"
	$(PIP) install -r $(REQUIREMENTS) --break-system-packages
	@echo "$(GREEN)Dependencies installed successfully!$(NC)"
# Adds the formatter, linter and test tooling on top of the runtime
# dependencies, using the venv's pip.
.PHONY: install-dev
install-dev: venv-install ## Install development dependencies in virtual environment
	@echo "$(BLUE)Installing development dependencies...$(NC)"
	$(VENV_PIP) install black flake8 pytest pytest-cov
	@echo "$(GREEN)Development dependencies installed!$(NC)"
# Declared phony, so `python -m venv` runs on every invocation even when the
# $(VENV) directory already exists.
.PHONY: venv
venv: ## Create virtual environment
	@echo "$(BLUE)Creating virtual environment...$(NC)"
	$(PYTHON) -m venv $(VENV)
	@echo "$(GREEN)Virtual environment created!$(NC)"
	@echo "$(YELLOW)To activate: source $(VENV)/bin/activate$(NC)"
# Creates the venv (via the `venv` prerequisite), upgrades pip inside it, then
# installs everything listed in $(REQUIREMENTS).
.PHONY: venv-install
venv-install: venv ## Create venv and install dependencies
	@echo "$(BLUE)Installing dependencies in virtual environment...$(NC)"
	$(VENV_PIP) install --upgrade pip
	$(VENV_PIP) install -r $(REQUIREMENTS)
	@echo "$(GREEN)Virtual environment setup complete!$(NC)"
	@echo "$(YELLOW)To activate: source $(VENV)/bin/activate$(NC)"
# Testing targets
# Runs the main script with --test, using the interpreter chosen by get_python.
.PHONY: test
test: ## Run test mode with default PDF
	@echo "$(BLUE)Running test mode...$(NC)"
	$(call get_python) $(MAIN_SCRIPT) --test
	@echo "$(GREEN)Test completed!$(NC)"
# Same as `test`, with --interactive added.
.PHONY: test-interactive
test-interactive: ## Run test mode with interactive review
	@echo "$(BLUE)Running test mode with interactive review...$(NC)"
	$(call get_python) $(MAIN_SCRIPT) --test --interactive
# Same as `test`, with --debug added.
.PHONY: test-debug
test-debug: ## Run test mode with debug output
	@echo "$(BLUE)Running test mode with debug output...$(NC)"
	$(call get_python) $(MAIN_SCRIPT) --test --debug
# Same as `test`, with --silent added.
.PHONY: test-silent
test-silent: ## Run test mode silently (minimal output)
	@echo "$(BLUE)Running test mode silently...$(NC)"
	$(call get_python) $(MAIN_SCRIPT) --test --silent
	@echo "$(GREEN)Silent test completed!$(NC)"
# Test mode with the JSON output path set to test_results.json.
.PHONY: test-custom
test-custom: ## Run test with custom output file
	@echo "$(BLUE)Running test with custom output...$(NC)"
	$(call get_python) $(MAIN_SCRIPT) --test --output-json test_results.json
	@echo "$(GREEN)Test results saved to test_results.json$(NC)"
# Development targets
# Launches the main script with both --debug and --interactive.
.PHONY: dev
dev: ## Run in development mode (interactive with debug)
	@echo "$(BLUE)Starting development mode...$(NC)"
	$(call get_python) $(MAIN_SCRIPT) --debug --interactive
# Launches the main script with no arguments.
.PHONY: run
run: ## Run the script interactively
	@echo "$(BLUE)Starting interactive mode...$(NC)"
	$(call get_python) $(MAIN_SCRIPT)
# Requires FILE on the command line; aborts with a usage hint when it is empty.
.PHONY: run-file
run-file: ## Run with a specific PDF file (usage: make run-file FILE=path/to/file.pdf)
	@[ -n "$(FILE)" ] || { \
		echo "$(RED)Error: Please specify FILE=path/to/file.pdf$(NC)"; \
		echo "$(YELLOW)Example: make run-file FILE=document.pdf$(NC)"; \
		exit 1; \
	}
	@echo "$(BLUE)Processing $(FILE)...$(NC)"
	$(call get_python) $(MAIN_SCRIPT) "$(FILE)"
# Requires both FILE and PAGES; aborts with a usage hint when either is empty.
.PHONY: run-pages
run-pages: ## Run with specific pages (usage: make run-pages FILE=doc.pdf PAGES="1,3-5")
	@if ! { [ -n "$(FILE)" ] && [ -n "$(PAGES)" ]; }; then \
		echo "$(RED)Error: Please specify FILE and PAGES$(NC)"; \
		echo "$(YELLOW)Example: make run-pages FILE=document.pdf PAGES=\"1,3-5\"$(NC)"; \
		exit 1; \
	fi
	@echo "$(BLUE)Processing pages $(PAGES) of $(FILE)...$(NC)"
	$(call get_python) $(MAIN_SCRIPT) "$(FILE)" --pages "$(PAGES)"
# Code quality targets
# Runs black as a module of the selected interpreter, on the main script only.
.PHONY: format
format: ## Format code with black
	@echo "$(BLUE)Formatting code with black...$(NC)"
	$(call get_python) -m black $(MAIN_SCRIPT)
	@echo "$(GREEN)Code formatted!$(NC)"
# Runs flake8 on the main script with a 120-column limit; E203 and W503 are
# ignored.
.PHONY: lint
lint: ## Lint code with flake8
	@echo "$(BLUE)Linting code with flake8...$(NC)"
	$(call get_python) -m flake8 $(MAIN_SCRIPT) --max-line-length=120 --ignore=E203,W503
	@echo "$(GREEN)Linting completed!$(NC)"
# Aggregate quality gate; its only prerequisite is `lint`.
.PHONY: check
check: lint ## Run all code quality checks
	@echo "$(GREEN)All checks passed!$(NC)"
# Utility targets
# Removes generated artifacts from the project root.
# NOTE: the *.json glob deletes every JSON file in the current directory, not
# only files produced by this Makefile.
.PHONY: clean
clean: ## Clean up generated files
	@echo "$(BLUE)Cleaning up generated files...$(NC)"
	rm -f *.json
	rm -f test_results.json
	rm -rf pdf_page_images/
	rm -rf __pycache__/
	rm -rf .pytest_cache/
	rm -rf *.pyc
	@echo "$(GREEN)Cleanup completed!$(NC)"
# Deletes the whole $(VENV) directory.
.PHONY: clean-venv
clean-venv: ## Remove virtual environment
	@echo "$(BLUE)Removing virtual environment...$(NC)"
	rm -rf $(VENV)
	@echo "$(GREEN)Virtual environment removed!$(NC)"
# Runs both cleanup targets as prerequisites, then reports completion.
.PHONY: clean-all
clean-all: clean clean-venv ## Clean everything including virtual environment
	@echo "$(GREEN)Complete cleanup finished!$(NC)"
# Prints a summary of the environment the other targets will actually use.
#
# The Python/pip/dependency checks go through $(call get_python), i.e. the
# venv interpreter when it exists and $(PYTHON) otherwise. Querying the system
# python3/pip3 directly would report "No" for dependencies that `make install`
# placed in the venv. stderr of the pip listing is discarded so a missing pip
# or a closed pipe (grep -q exits early) does not clutter the report.
.PHONY: status
status: ## Show project status
	@echo "$(CYAN)=== HiLiteHero Project Status ===$(NC)"
	@echo "$(YELLOW)Python version:$(NC) $$($(call get_python) --version 2>/dev/null || echo 'Not found')"
	@echo "$(YELLOW)Pip version:$(NC) $$($(call get_python) -m pip --version 2>/dev/null || echo 'Not found')"
	@echo "$(YELLOW)Virtual environment:$(NC) $$(if [ -d "$(VENV)" ]; then echo 'Exists'; else echo 'Not created'; fi)"
	@echo "$(YELLOW)Dependencies installed:$(NC) $$(if $(call get_python) -m pip list 2>/dev/null | grep -q PyMuPDF; then echo 'Yes'; else echo 'No'; fi)"
	@echo "$(YELLOW)Test PDF exists:$(NC) $$(if [ -f "$(TEST_PDF)" ]; then echo 'Yes'; else echo 'No'; fi)"
	@echo "$(YELLOW)Generated files:$(NC) $$(ls -1 *.json 2>/dev/null | wc -l) JSON files"
# Documentation targets
# Prints the key project paths from the variables above plus a short
# quick-start cheat sheet.
.PHONY: docs
docs: ## Show documentation
	@echo "$(CYAN)=== HiLiteHero Documentation ===$(NC)"
	@echo "$(YELLOW)Main script:$(NC) $(MAIN_SCRIPT)"
	@echo "$(YELLOW)Test PDF:$(NC) $(TEST_PDF)"
	@echo "$(YELLOW)Requirements:$(NC) $(REQUIREMENTS)"
	@echo ""
	@echo "$(YELLOW)Quick start:$(NC)"
	@echo " make test # Run test mode"
	@echo " make run # Interactive mode"
	@echo " make dev # Development mode"
	@echo ""
	@echo "$(YELLOW)For more help:$(NC) make help"
# Batch processing targets
# Runs the main script with --silent and writes JSON to a file named
# batch_results_<timestamp>.json; the timestamp comes from `date` when the
# recipe is expanded.
.PHONY: batch
batch: ## Run in batch mode (silent with auto-save)
	@echo "$(BLUE)Running in batch mode...$(NC)"
	$(call get_python) $(MAIN_SCRIPT) --silent --output-json batch_results_$(shell date +%Y%m%d_%H%M%S).json
	@echo "$(GREEN)Batch processing completed!$(NC)"
# Requires FILE; the output name is the PDF's base name (extension stripped)
# followed by _batch_<timestamp>.json.
.PHONY: batch-file
batch-file: ## Batch process specific file (usage: make batch-file FILE=doc.pdf)
	@[ -n "$(FILE)" ] || { \
		echo "$(RED)Error: Please specify FILE=path/to/file.pdf$(NC)"; \
		exit 1; \
	}
	@echo "$(BLUE)Batch processing $(FILE)...$(NC)"
	$(call get_python) $(MAIN_SCRIPT) "$(FILE)" --silent --output-json "$(shell basename "$(FILE)" .pdf)_batch_$(shell date +%Y%m%d_%H%M%S).json"
	@echo "$(GREEN)Batch processing completed!$(NC)"
# Runs the main script in silent mode over every test/*.pdf, writing one
# <basename>_batch_<timestamp>.json per input.
#
# Every file is attempted even if an earlier one fails, but failures are
# counted: the recipe reports how many files failed and exits non-zero, so it
# no longer prints "successfully" after the script returned an error.
# The timestamp is taken once, before the loop, so all outputs of one run
# share it.
.PHONY: batch-all
batch-all: ## Process all PDFs in test folder
	@echo "$(BLUE)Processing all PDFs in test folder...$(NC)"
	@if [ ! -d "test" ]; then \
		echo "$(RED)Error: test folder not found$(NC)"; \
		exit 1; \
	fi
	@pdf_count=0; fail_count=0; \
	stamp=$$(date +%Y%m%d_%H%M%S); \
	for pdf in test/*.pdf; do \
		[ -f "$$pdf" ] || continue; \
		pdf_count=$$((pdf_count + 1)); \
		echo "$(CYAN)Processing $$pdf...$(NC)"; \
		if ! $(call get_python) $(MAIN_SCRIPT) "$$pdf" --silent --output-json "$$(basename "$$pdf" .pdf)_batch_$${stamp}.json"; then \
			fail_count=$$((fail_count + 1)); \
			echo "$(RED)Failed to process $$pdf$(NC)"; \
		fi; \
	done; \
	if [ $$pdf_count -eq 0 ]; then \
		echo "$(YELLOW)No PDF files found in test folder$(NC)"; \
	elif [ $$fail_count -gt 0 ]; then \
		echo "$(RED)$$fail_count of $$pdf_count PDF file(s) failed$(NC)"; \
		exit 1; \
	else \
		echo "$(GREEN)Processed $$pdf_count PDF file(s) successfully!$(NC)"; \
	fi
# Installation verification
# Checks that `fitz` and `colorama` import cleanly, using the venv interpreter
# when it exists and $(PYTHON) otherwise, and that the main script is present.
# Any failed check ends the recipe with exit status 1.
.PHONY: verify
verify: ## Verify installation
	@echo "$(BLUE)Verifying installation...$(NC)"
	@if [ -f $(VENV_PYTHON) ]; then \
		echo "$(CYAN)Checking virtual environment...$(NC)"; \
		$(VENV_PYTHON) -c "import fitz, colorama; print('$(GREEN)Virtual env dependencies OK$(NC)')" \
			|| { echo "$(RED)Virtual env dependencies missing$(NC)"; exit 1; }; \
	else \
		echo "$(YELLOW)Checking system Python...$(NC)"; \
		$(PYTHON) -c "import fitz, colorama; print('$(GREEN)System dependencies OK$(NC)')" \
			|| { echo "$(RED)System dependencies missing$(NC)"; exit 1; }; \
	fi
	@if [ ! -f $(MAIN_SCRIPT) ]; then \
		echo "$(RED)Main script missing$(NC)"; \
		exit 1; \
	fi; \
	echo "$(GREEN)Main script found$(NC)"
	@echo "$(GREEN)Installation verified!$(NC)"
# Quick development workflow
# Clean, then test, strictly in that order.
#
# The two steps are run as sequential sub-makes rather than listed as
# prerequisites: sibling prerequisites have no ordering guarantee under
# `make -j`, so `clean` could delete JSON output while `test` is writing it.
# $(MAKE) (not a literal `make`) keeps flags and the jobserver propagated.
.PHONY: quick-dev
quick-dev: ## Quick development workflow (clean + test)
	@$(MAKE) --no-print-directory clean
	@$(MAKE) --no-print-directory test
	@echo "$(GREEN)Quick development cycle completed!$(NC)"
# Show available PDF files
# Lists up to ten PDF files found under the current directory.
#
# The listing is captured first and tested for emptiness. A pipeline's exit
# status is that of its last command, and `head` succeeds even on empty input,
# so a `find | head || echo ...` fallback would never fire.
.PHONY: list-pdfs
list-pdfs: ## List available PDF files in project
	@echo "$(CYAN)Available PDF files:$(NC)"
	@pdfs=$$(find . -name "*.pdf" -type f 2>/dev/null | head -10); \
	if [ -n "$$pdfs" ]; then \
		printf '%s\n' "$$pdfs"; \
	else \
		echo "$(YELLOW)No PDF files found$(NC)"; \
	fi
# Show recent JSON outputs
# Shows the five most recently modified JSON files in the current directory.
#
# The listing is captured first and tested for emptiness. A pipeline's exit
# status is that of its last command, and `head` succeeds even when `ls`
# fails, so an `ls | head || echo ...` fallback would never fire.
.PHONY: list-outputs
list-outputs: ## List recent JSON output files
	@echo "$(CYAN)Recent JSON outputs:$(NC)"
	@outputs=$$(ls -lt *.json 2>/dev/null | head -5); \
	if [ -n "$$outputs" ]; then \
		printf '%s\n' "$$outputs"; \
	else \
		echo "$(YELLOW)No JSON output files found$(NC)"; \
	fi

224
README.md
View File

@ -15,12 +15,124 @@ A Python tool for extracting highlighted text from PDF files with precise text o
## Installation
Clone the repository:
git clone <repository-url>
cd pdf-highlight-extractor
### Prerequisites
- Python 3.7 or higher
- pip package manager
Install required packages:
pip install PyMuPDF pdfplumber colorama pandas
### Quick Installation
1. **Clone the repository:**
```bash
git clone <repository-url>
cd HiLiteHero
```
2. **Install dependencies:**
```bash
pip install -r requirements.txt
```
Or install manually:
```bash
pip install PyMuPDF colorama
```
### Alternative Installation Methods
**Using virtual environment (recommended):**
```bash
python -m venv venv
source venv/bin/activate # On Windows: venv\Scripts\activate
pip install -r requirements.txt
```
**Using conda:**
```bash
conda create -n hilitehero python=3.9
conda activate hilitehero
pip install -r requirements.txt
```
### Verify Installation
```bash
python main.py --test
```
This should process the default test file and create a JSON output file.
## Quick Start with Makefile
The project includes a comprehensive Makefile for easy development and testing:
### Essential Commands
```bash
# Show all available commands
make help
# Quick test (recommended first run)
make test
# Interactive mode
make run
# Development mode (debug + interactive)
make dev
# Install dependencies
make install
# Clean up generated files
make clean
```
### Common Workflows
**First-time setup:**
```bash
make install # Install dependencies
make test # Verify everything works
```
**Development workflow:**
```bash
make dev # Start development mode
make clean # Clean up when done
```
**Batch processing:**
```bash
make batch # Process default file silently
make batch-file FILE=document.pdf # Process specific file
```
**Code quality:**
```bash
make format # Format code
make lint # Check code quality
make check # Run all checks
```
### Advanced Makefile Usage
**Process specific pages:**
```bash
make run-pages FILE=document.pdf PAGES="1,3-5"
```
**Test different modes:**
```bash
make test-interactive # Test with interactive review
make test-debug # Test with debug output
make test-silent # Test silently
```
**Project management:**
```bash
make status # Show project status
make docs # Show documentation
make list-pdfs # List available PDF files
make list-outputs # Show recent outputs
```
## Dependencies
@ -32,19 +144,76 @@ pip install PyMuPDF pdfplumber colorama pandas
## Usage
### Quick Test Mode
python highlight_extractor.py --test
### Quick Start
Uses default file: `/mnt/c/Users/admin/Downloads/test2.pdf` and displays results only.
**Test Mode (Recommended for first-time users):**
```bash
python main.py --test
```
Uses default test file and automatically saves results to JSON.
### Interactive Mode
python highlight_extractor.py
**Interactive Mode:**
```bash
python main.py
```
Prompts for PDF file path and provides interactive review options.
Prompts for PDF file path and output options.
**Process Specific PDF:**
```bash
python main.py path/to/your/document.pdf
```
### Command Line Flags
- `--test`, `-t`, or `test` - Enable test mode with defaults
- No flags - Full interactive mode
### Command Line Options
| Flag | Description | Example |
|------|-------------|---------|
| `--test`, `-t` | Test mode with default settings | `python main.py -t` |
| `--interactive`, `-i` | Enable interactive review mode | `python main.py -i document.pdf` |
| `--pages`, `-p` | Process specific pages | `python main.py -p "1,3-5" doc.pdf` |
| `--silent`, `-s` | Minimal output, auto-save JSON | `python main.py -s` |
| `--debug`, `-d` | Enable detailed debug output | `python main.py -d document.pdf` |
| `--output-json` | Custom JSON output path | `python main.py --output-json results.json` |
### Usage Examples
**Basic extraction:**
```bash
python main.py document.pdf
```
**Process specific pages with interactive review:**
```bash
python main.py document.pdf -p "1,5-7" -i
```
**Silent mode for batch processing:**
```bash
python main.py document.pdf -s --output-json batch_results.json
```
**Debug mode for troubleshooting:**
```bash
python main.py document.pdf -d
```
**Test with custom output:**
```bash
python main.py -t --output-json test_results.json
```
### Interactive Review Mode
When using the `-i` flag, you can:
- **[N]ext** - Move to next highlight
- **[P]rev** - Move to previous highlight
- **[U]p** - Move highlight up in order
- **[M]ove Down** - Move highlight down in order
- **[C]olor** - Change highlight color classification
- **[E]dit** - Edit highlight text
- **[D]elete** - Remove highlight
- **[O]pen Img** - View page image
- **[S]ave&Exit** - Save changes and exit
- **[Q]uit** - Quit without saving
## Output Formats
@ -97,9 +266,34 @@ Tabular format with columns: page, text, color, type, category
**Over-extraction**: The tool is designed to avoid this, but very close text might be included. Check highlight precision in your PDF.
**Installation Issues**:
- Ensure Python 3.7+ is installed
- Try using virtual environment: `make venv-install`
- Check dependencies: `make verify`
**Permission Errors**:
- On Linux/Mac: Ensure PDF files are readable
- On Windows: Run as administrator if needed
### Debug Output
Run with detailed logging to see extraction decisions:
python highlight_extractor.py --test
```bash
python main.py --test --debug
# or
make test-debug
```
### Getting Help
```bash
# Show all available commands
make help
# Check project status
make status
# Verify installation
make verify
```
## Contributing

2180
main.py

File diff suppressed because it is too large Load Diff

View File

@ -1,2 +1,2 @@
colorama==0.4.6
PyMuPDF==1.23.1
PyMuPDF==1.22.3

Binary file not shown.

BIN
test/test4.pdf Normal file

Binary file not shown.