Refactor main.py for improved readability and functionality; update README for installation instructions and usage examples; modify requirements.txt for dependency versioning; update test PDFs.
This commit is contained in:
parent
d28005e541
commit
41e63da3d4
261
Makefile
Normal file
261
Makefile
Normal file
@ -0,0 +1,261 @@
|
||||
# HiLiteHero - PDF Highlight Extractor Makefile
|
||||
# Description: Makefile for easy development, testing, and deployment
|
||||
|
||||
# Variables
# Tool names and project paths. All can be overridden on the command line,
# e.g. `make PYTHON=python3.11 VENV=.venv`.
PYTHON := python3
PIP := pip3
VENV := venv
VENV_BIN := $(VENV)/bin
VENV_PYTHON := $(VENV_BIN)/python
VENV_PIP := $(VENV_BIN)/pip
MAIN_SCRIPT := main.py
TEST_PDF := test/test2.pdf
REQUIREMENTS := requirements.txt

# Colors for output
# ESC holds a literal escape byte produced once by printf. Storing the real
# byte (instead of the text "\033") means the sequences work even when the
# recipe shell's `echo` does not interpret backslash escapes.
ESC := $(shell printf '\033')
RED := $(ESC)[0;31m
GREEN := $(ESC)[0;32m
YELLOW := $(ESC)[0;33m
BLUE := $(ESC)[0;34m
PURPLE := $(ESC)[0;35m
CYAN := $(ESC)[0;36m
WHITE := $(ESC)[0;37m
# NC = "no color" (reset). The comment lives on its own line because Make keeps
# any whitespace that precedes a trailing `#` comment as part of the value.
NC := $(ESC)[0m
|
||||
|
||||
# Helper: pick the Python interpreter to run.
# Expands to $(VENV_PYTHON) when that file exists on disk (checked with
# $(wildcard) each time the helper is expanded), otherwise to $(PYTHON).
# Usage in a recipe: $(call get_python) script.py
get_python = $(or $(wildcard $(VENV_PYTHON)),$(PYTHON))
|
||||
|
||||
# Default target
|
||||
# Self-documenting help: prints every rule header of the form
# `name: [prereqs] ## description` found in the parsed makefiles.
# The field separator is given with -F, so $$1 is the target name and $$2 is
# the text after "## ".
.PHONY: help
help: ## Show this help message
	@echo "$(CYAN)HiLiteHero - PDF Highlight Extractor$(NC)"
	@echo "$(YELLOW)Available targets:$(NC)"
	@awk -F ':.*?## ' '/^[a-zA-Z_-]+:.*?## / {printf " $(GREEN)%-15s$(NC) %s\n", $$1, $$2}' $(MAKEFILE_LIST)
|
||||
|
||||
# Installation targets
|
||||
# Convenience entry point: all the work happens in the `venv-install`
# prerequisite; this rule only prints the final confirmation.
.PHONY: install
install: venv-install ## Install dependencies (creates venv if needed)
	@echo "$(GREEN)Dependencies installed successfully!$(NC)"
|
||||
|
||||
# Installs $(REQUIREMENTS) with the system-wide $(PIP), bypassing the venv.
# `--break-system-packages` is only understood by newer pip releases; older
# ones abort with "no such option". The flag is therefore passed only when
# `pip install --help` advertises it, so the target works on both.
.PHONY: install-system
install-system: ## Install dependencies system-wide (may require --break-system-packages)
	@echo "$(YELLOW)Warning: Installing system-wide packages$(NC)"
	@echo "$(BLUE)Installing dependencies...$(NC)"
	@extra_flag=""; \
	if $(PIP) install --help 2>/dev/null | grep -e '--break-system-packages' >/dev/null; then \
		extra_flag="--break-system-packages"; \
	fi; \
	echo "$(PIP) install -r $(REQUIREMENTS) $$extra_flag"; \
	$(PIP) install -r $(REQUIREMENTS) $$extra_flag
	@echo "$(GREEN)Dependencies installed successfully!$(NC)"
|
||||
|
||||
# Developer tooling installed on top of the runtime requirements.
# `venv-install` runs first, so the packages land inside $(VENV).
dev_packages := black flake8 pytest pytest-cov

.PHONY: install-dev
install-dev: venv-install ## Install development dependencies in virtual environment
	@echo "$(BLUE)Installing development dependencies...$(NC)"
	$(VENV_PIP) install $(dev_packages)
	@echo "$(GREEN)Development dependencies installed!$(NC)"
|
||||
|
||||
# `venv` stays a phony, user-facing name, but the actual creation is tied to a
# real file target ($(VENV_PYTHON)). Make therefore creates the environment
# only when the interpreter is missing, instead of re-running `python -m venv`
# every time `venv`, `venv-install`, `install` or `install-dev` is invoked.
.PHONY: venv
venv: $(VENV_PYTHON) ## Create virtual environment
	@echo "$(YELLOW)To activate: source $(VENV)/bin/activate$(NC)"

# File target: exists once the virtual environment has been created.
$(VENV_PYTHON):
	@echo "$(BLUE)Creating virtual environment...$(NC)"
	$(PYTHON) -m venv $(VENV)
	@echo "$(GREEN)Virtual environment created!$(NC)"
|
||||
|
||||
# Runs after `venv`: upgrades pip inside the environment, then installs the
# pinned packages from $(REQUIREMENTS) using the environment's own pip.
.PHONY: venv-install
venv-install: venv ## Create venv and install dependencies
	@echo "$(BLUE)Installing dependencies in virtual environment...$(NC)"
	$(VENV_PIP) install --upgrade pip
	$(VENV_PIP) install -r $(REQUIREMENTS)
	@echo "$(GREEN)Virtual environment setup complete!$(NC)"
	@echo "$(YELLOW)To activate: source $(VENV)/bin/activate$(NC)"
|
||||
|
||||
# Testing targets
|
||||
# Every test-* variant runs the main script with --test plus one extra option.
# The shared prefix is a recursively-expanded variable so the interpreter
# lookup in get_python happens when the recipe runs, not at parse time.
test_cmd = $(call get_python) $(MAIN_SCRIPT) --test

.PHONY: test test-interactive test-debug test-silent test-custom

test: ## Run test mode with default PDF
	@echo "$(BLUE)Running test mode...$(NC)"
	$(test_cmd)
	@echo "$(GREEN)Test completed!$(NC)"

test-interactive: ## Run test mode with interactive review
	@echo "$(BLUE)Running test mode with interactive review...$(NC)"
	$(test_cmd) --interactive

test-debug: ## Run test mode with debug output
	@echo "$(BLUE)Running test mode with debug output...$(NC)"
	$(test_cmd) --debug

test-silent: ## Run test mode silently (minimal output)
	@echo "$(BLUE)Running test mode silently...$(NC)"
	$(test_cmd) --silent
	@echo "$(GREEN)Silent test completed!$(NC)"

test-custom: ## Run test with custom output file
	@echo "$(BLUE)Running test with custom output...$(NC)"
	$(test_cmd) --output-json test_results.json
	@echo "$(GREEN)Test results saved to test_results.json$(NC)"
|
||||
|
||||
# Development targets
|
||||
# Two ways to start the script without naming a PDF:
#   dev - passes --debug --interactive
#   run - passes no options
.PHONY: dev run

dev: ## Run in development mode (interactive with debug)
	@echo "$(BLUE)Starting development mode...$(NC)"
	$(call get_python) $(MAIN_SCRIPT) --debug --interactive

run: ## Run the script interactively
	@echo "$(BLUE)Starting interactive mode...$(NC)"
	$(call get_python) $(MAIN_SCRIPT)
|
||||
|
||||
# Requires FILE on the command line. The first recipe line is a guard that
# prints usage and fails the target when FILE is empty.
.PHONY: run-file
run-file: ## Run with a specific PDF file (usage: make run-file FILE=path/to/file.pdf)
	@[ -n "$(FILE)" ] || { \
		echo "$(RED)Error: Please specify FILE=path/to/file.pdf$(NC)"; \
		echo "$(YELLOW)Example: make run-file FILE=document.pdf$(NC)"; \
		exit 1; \
	}
	@echo "$(BLUE)Processing $(FILE)...$(NC)"
	$(call get_python) $(MAIN_SCRIPT) "$(FILE)"
|
||||
|
||||
# Requires both FILE and PAGES. The guard fails the target with a usage hint
# if either is empty; otherwise PAGES is forwarded through --pages.
.PHONY: run-pages
run-pages: ## Run with specific pages (usage: make run-pages FILE=doc.pdf PAGES="1,3-5")
	@{ [ -n "$(FILE)" ] && [ -n "$(PAGES)" ]; } || { \
		echo "$(RED)Error: Please specify FILE and PAGES$(NC)"; \
		echo "$(YELLOW)Example: make run-pages FILE=document.pdf PAGES=\"1,3-5\"$(NC)"; \
		exit 1; \
	}
	@echo "$(BLUE)Processing pages $(PAGES) of $(FILE)...$(NC)"
	$(call get_python) $(MAIN_SCRIPT) "$(FILE)" --pages "$(PAGES)"
|
||||
|
||||
# Code quality targets
|
||||
# black and flake8 are invoked as modules of whichever interpreter get_python
# selects, so they must be installed for that interpreter.
flake8_opts := --max-line-length=120 --ignore=E203,W503

.PHONY: format lint check

format: ## Format code with black
	@echo "$(BLUE)Formatting code with black...$(NC)"
	$(call get_python) -m black $(MAIN_SCRIPT)
	@echo "$(GREEN)Code formatted!$(NC)"

lint: ## Lint code with flake8
	@echo "$(BLUE)Linting code with flake8...$(NC)"
	$(call get_python) -m flake8 $(MAIN_SCRIPT) $(flake8_opts)
	@echo "$(GREEN)Linting completed!$(NC)"

# `check` currently aggregates only `lint`.
check: lint ## Run all code quality checks
	@echo "$(GREEN)All checks passed!$(NC)"
|
||||
|
||||
# Utility targets
|
||||
# Removes generated artefacts from the project root.
# NOTE: `*.json` matches EVERY JSON file in the current directory, not only
# files this Makefile produced (test_results.json is covered by that glob, so
# it no longer needs its own rm line).
.PHONY: clean
clean: ## Clean up generated files
	@echo "$(BLUE)Cleaning up generated files...$(NC)"
	$(RM) *.json *.pyc
	$(RM) -r pdf_page_images/ __pycache__/ .pytest_cache/
	@echo "$(GREEN)Cleanup completed!$(NC)"
|
||||
|
||||
# clean-venv deletes the $(VENV) directory tree.
# clean-all runs both `clean` and `clean-venv` as prerequisites.
.PHONY: clean-venv clean-all

clean-venv: ## Remove virtual environment
	@echo "$(BLUE)Removing virtual environment...$(NC)"
	rm -rf $(VENV)
	@echo "$(GREEN)Virtual environment removed!$(NC)"

clean-all: clean clean-venv ## Clean everything including virtual environment
	@echo "$(GREEN)Complete cleanup finished!$(NC)"
|
||||
|
||||
# Prints a one-screen summary of the local setup.
# - Tool versions are read from $(PYTHON) / $(PIP) so command-line overrides
#   are honoured.
# - "Dependencies installed" imports fitz and colorama with the interpreter
#   chosen by get_python (venv if present, else system), so the answer reflects
#   the environment the other targets actually run in.
.PHONY: status
status: ## Show project status
	@echo "$(CYAN)=== HiLiteHero Project Status ===$(NC)"
	@echo "$(YELLOW)Python version:$(NC) $$($(PYTHON) --version 2>/dev/null || echo 'Not found')"
	@echo "$(YELLOW)Pip version:$(NC) $$($(PIP) --version 2>/dev/null || echo 'Not found')"
	@echo "$(YELLOW)Virtual environment:$(NC) $$(if [ -d "$(VENV)" ]; then echo 'Exists'; else echo 'Not created'; fi)"
	@echo "$(YELLOW)Dependencies installed:$(NC) $$($(call get_python) -c 'import fitz, colorama' >/dev/null 2>&1 && echo 'Yes' || echo 'No')"
	@echo "$(YELLOW)Test PDF exists:$(NC) $$(if [ -f "$(TEST_PDF)" ]; then echo 'Yes'; else echo 'No'; fi)"
	@# tr strips the left padding some `wc` implementations add to the count.
	@echo "$(YELLOW)Generated files:$(NC) $$(ls -1 *.json 2>/dev/null | wc -l | tr -d ' ') JSON files"
|
||||
|
||||
# Documentation targets
|
||||
# Prints the configured paths followed by a short quick-start cheat sheet.
# Purely informational: nothing is executed besides echo.
.PHONY: docs
docs: ## Show documentation
	@echo "$(CYAN)=== HiLiteHero Documentation ===$(NC)"
	@# Configured locations
	@echo "$(YELLOW)Main script:$(NC) $(MAIN_SCRIPT)"
	@echo "$(YELLOW)Test PDF:$(NC) $(TEST_PDF)"
	@echo "$(YELLOW)Requirements:$(NC) $(REQUIREMENTS)"
	@echo ""
	@# Most common entry points
	@echo "$(YELLOW)Quick start:$(NC)"
	@echo " make test # Run test mode"
	@echo " make run # Interactive mode"
	@echo " make dev # Development mode"
	@echo ""
	@echo "$(YELLOW)For more help:$(NC) make help"
|
||||
|
||||
# Batch processing targets
|
||||
# Runs the script with --silent and writes JSON to a timestamped file name.
# The timestamp is a recursively-expanded variable, so `date` runs when the
# recipe is expanded rather than when the makefile is parsed.
batch_stamp = $(shell date +%Y%m%d_%H%M%S)

.PHONY: batch
batch: ## Run in batch mode (silent with auto-save)
	@echo "$(BLUE)Running in batch mode...$(NC)"
	$(call get_python) $(MAIN_SCRIPT) --silent --output-json batch_results_$(batch_stamp).json
	@echo "$(GREEN)Batch processing completed!$(NC)"
|
||||
|
||||
# Processes one PDF silently; output goes to <basename>_batch_<timestamp>.json
# in the current directory.
# The output name is built with shell command substitution ($$(...)) inside
# the recipe rather than $(shell ...): Make expands every recipe line before
# the FILE guard runs, and pasting $(shell) output into a quoted word breaks
# on names containing quotes or `$`.
.PHONY: batch-file
batch-file: ## Batch process specific file (usage: make batch-file FILE=doc.pdf)
	@if [ -z "$(FILE)" ]; then \
		echo "$(RED)Error: Please specify FILE=path/to/file.pdf$(NC)"; \
		exit 1; \
	fi
	@echo "$(BLUE)Batch processing $(FILE)...$(NC)"
	$(call get_python) $(MAIN_SCRIPT) "$(FILE)" --silent --output-json "$$(basename "$(FILE)" .pdf)_batch_$$(date +%Y%m%d_%H%M%S).json"
	@echo "$(GREEN)Batch processing completed!$(NC)"
|
||||
|
||||
# Runs the script over every test/*.pdf, writing one timestamped JSON each.
# A failing run no longer goes unnoticed: failures are counted, reported per
# file, and the target exits non-zero if any PDF could not be processed.
# Remaining PDFs are still attempted after a failure.
.PHONY: batch-all
batch-all: ## Process all PDFs in test folder
	@echo "$(BLUE)Processing all PDFs in test folder...$(NC)"
	@if [ ! -d "test" ]; then \
		echo "$(RED)Error: test folder not found$(NC)"; \
		exit 1; \
	fi
	@pdf_count=0; fail_count=0; \
	for pdf in test/*.pdf; do \
		[ -f "$$pdf" ] || continue; \
		pdf_count=$$((pdf_count + 1)); \
		echo "$(CYAN)Processing $$pdf...$(NC)"; \
		out="$$(basename "$$pdf" .pdf)_batch_$$(date +%Y%m%d_%H%M%S).json"; \
		if ! $(call get_python) $(MAIN_SCRIPT) "$$pdf" --silent --output-json "$$out"; then \
			fail_count=$$((fail_count + 1)); \
			echo "$(RED)Failed to process $$pdf$(NC)"; \
		fi; \
	done; \
	if [ $$pdf_count -eq 0 ]; then \
		echo "$(YELLOW)No PDF files found in test folder$(NC)"; \
	elif [ $$fail_count -gt 0 ]; then \
		echo "$(RED)$$fail_count of $$pdf_count PDF file(s) failed$(NC)"; \
		exit 1; \
	else \
		echo "$(GREEN)Processed $$pdf_count PDF file(s) successfully!$(NC)"; \
	fi
|
||||
|
||||
# Installation verification
|
||||
# Checks that fitz and colorama import cleanly and that the main script is
# present. The venv interpreter is used when $(VENV_PYTHON) exists, otherwise
# $(PYTHON). Any failed check ends the target with exit status 1.
.PHONY: verify
verify: ## Verify installation
	@echo "$(BLUE)Verifying installation...$(NC)"
	@if [ ! -f $(VENV_PYTHON) ]; then \
		echo "$(YELLOW)Checking system Python...$(NC)"; \
		$(PYTHON) -c "import fitz, colorama; print('$(GREEN)System dependencies OK$(NC)')" || { echo "$(RED)System dependencies missing$(NC)"; exit 1; }; \
	else \
		echo "$(CYAN)Checking virtual environment...$(NC)"; \
		$(VENV_PYTHON) -c "import fitz, colorama; print('$(GREEN)Virtual env dependencies OK$(NC)')" || { echo "$(RED)Virtual env dependencies missing$(NC)"; exit 1; }; \
	fi
	@[ -f $(MAIN_SCRIPT) ] || { echo "$(RED)Main script missing$(NC)"; exit 1; }
	@echo "$(GREEN)Main script found$(NC)"
	@echo "$(GREEN)Installation verified!$(NC)"
|
||||
|
||||
# Quick development workflow
|
||||
# clean must finish before test starts, otherwise clean could delete the JSON
# that test has just written. Listing both as prerequisites gives no ordering
# guarantee under `make -j`, so they are run as two sequential sub-makes.
.PHONY: quick-dev
quick-dev: ## Quick development workflow (clean + test)
	@$(MAKE) --no-print-directory clean
	@$(MAKE) --no-print-directory test
	@echo "$(GREEN)Quick development cycle completed!$(NC)"
|
||||
|
||||
# Show available PDF files
|
||||
# Lists up to 10 PDF files found under the current directory.
# The result is captured first and tested for emptiness: in `find | head || echo`
# the pipeline status is head's (always 0), so the fallback message could
# never be printed.
.PHONY: list-pdfs
list-pdfs: ## List available PDF files in project
	@echo "$(CYAN)Available PDF files:$(NC)"
	@pdf_list=$$(find . -name "*.pdf" -type f 2>/dev/null | head -10); \
	if [ -n "$$pdf_list" ]; then \
		printf '%s\n' "$$pdf_list"; \
	else \
		echo "$(YELLOW)No PDF files found$(NC)"; \
	fi
|
||||
|
||||
# Show recent JSON outputs
|
||||
# Shows the five most recently modified JSON files in the current directory.
# The listing is captured and tested for emptiness: with `ls | head || echo`
# the pipeline status is head's (always 0), so the fallback message could
# never be printed.
.PHONY: list-outputs
list-outputs: ## List recent JSON output files
	@echo "$(CYAN)Recent JSON outputs:$(NC)"
	@json_list=$$(ls -lt *.json 2>/dev/null | head -5); \
	if [ -n "$$json_list" ]; then \
		printf '%s\n' "$$json_list"; \
	else \
		echo "$(YELLOW)No JSON output files found$(NC)"; \
	fi
|
||||
224
README.md
224
README.md
@ -15,12 +15,124 @@ A Python tool for extracting highlighted text from PDF files with precise text o
|
||||
|
||||
## Installation
|
||||
|
||||
Clone the repository:
|
||||
git clone <repository-url>
|
||||
cd pdf-highlight-extractor
|
||||
### Prerequisites
|
||||
- Python 3.7 or higher
|
||||
- pip package manager
|
||||
|
||||
Install required packages:
|
||||
pip install PyMuPDF pdfplumber colorama pandas
|
||||
### Quick Installation
|
||||
|
||||
1. **Clone the repository:**
|
||||
```bash
|
||||
git clone <repository-url>
|
||||
cd HiLiteHero
|
||||
```
|
||||
|
||||
2. **Install dependencies:**
|
||||
```bash
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
Or install manually:
|
||||
```bash
|
||||
pip install PyMuPDF colorama
|
||||
```
|
||||
|
||||
### Alternative Installation Methods
|
||||
|
||||
**Using virtual environment (recommended):**
|
||||
```bash
|
||||
python -m venv venv
|
||||
source venv/bin/activate # On Windows: venv\Scripts\activate
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
**Using conda:**
|
||||
```bash
|
||||
conda create -n hilitehero python=3.9
|
||||
conda activate hilitehero
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
### Verify Installation
|
||||
```bash
|
||||
python main.py --test
|
||||
```
|
||||
This should process the default test file and create a JSON output file.
|
||||
|
||||
## Quick Start with Makefile
|
||||
|
||||
The project includes a comprehensive Makefile for easy development and testing:
|
||||
|
||||
### Essential Commands
|
||||
|
||||
```bash
|
||||
# Show all available commands
|
||||
make help
|
||||
|
||||
# Quick test (recommended first run)
|
||||
make test
|
||||
|
||||
# Interactive mode
|
||||
make run
|
||||
|
||||
# Development mode (debug + interactive)
|
||||
make dev
|
||||
|
||||
# Install dependencies
|
||||
make install
|
||||
|
||||
# Clean up generated files
|
||||
make clean
|
||||
```
|
||||
|
||||
### Common Workflows
|
||||
|
||||
**First-time setup:**
|
||||
```bash
|
||||
make install # Install dependencies
|
||||
make test # Verify everything works
|
||||
```
|
||||
|
||||
**Development workflow:**
|
||||
```bash
|
||||
make dev # Start development mode
|
||||
make clean # Clean up when done
|
||||
```
|
||||
|
||||
**Batch processing:**
|
||||
```bash
|
||||
make batch # Process default file silently
|
||||
make batch-file FILE=document.pdf # Process specific file
|
||||
```
|
||||
|
||||
**Code quality:**
|
||||
```bash
|
||||
make format # Format code
|
||||
make lint # Check code quality
|
||||
make check # Run all checks
|
||||
```
|
||||
|
||||
### Advanced Makefile Usage
|
||||
|
||||
**Process specific pages:**
|
||||
```bash
|
||||
make run-pages FILE=document.pdf PAGES="1,3-5"
|
||||
```
|
||||
|
||||
**Test different modes:**
|
||||
```bash
|
||||
make test-interactive # Test with interactive review
|
||||
make test-debug # Test with debug output
|
||||
make test-silent # Test silently
|
||||
```
|
||||
|
||||
**Project management:**
|
||||
```bash
|
||||
make status # Show project status
|
||||
make docs # Show documentation
|
||||
make list-pdfs # List available PDF files
|
||||
make list-outputs # Show recent outputs
|
||||
```
|
||||
|
||||
|
||||
## Dependencies
|
||||
@ -32,19 +144,76 @@ pip install PyMuPDF pdfplumber colorama pandas
|
||||
|
||||
## Usage
|
||||
|
||||
### Quick Test Mode
|
||||
python highlight_extractor.py --test
|
||||
### Quick Start
|
||||
|
||||
Uses default file: `/mnt/c/Users/admin/Downloads/test2.pdf` and displays results only.
|
||||
**Test Mode (Recommended for first-time users):**
|
||||
```bash
|
||||
python main.py --test
|
||||
```
|
||||
Uses default test file and automatically saves results to JSON.
|
||||
|
||||
### Interactive Mode
|
||||
python highlight_extractor.py
|
||||
**Interactive Mode:**
|
||||
```bash
|
||||
python main.py
|
||||
```
|
||||
Prompts for PDF file path and provides interactive review options.
|
||||
|
||||
Prompts for PDF file path and output options.
|
||||
**Process Specific PDF:**
|
||||
```bash
|
||||
python main.py path/to/your/document.pdf
|
||||
```
|
||||
|
||||
### Command Line Flags
|
||||
- `--test`, `-t`, or `test` - Enable test mode with defaults
|
||||
- No flags - Full interactive mode
|
||||
### Command Line Options
|
||||
|
||||
| Flag | Description | Example |
|
||||
|------|-------------|---------|
|
||||
| `--test`, `-t` | Test mode with default settings | `python main.py -t` |
|
||||
| `--interactive`, `-i` | Enable interactive review mode | `python main.py -i document.pdf` |
|
||||
| `--pages`, `-p` | Process specific pages | `python main.py -p "1,3-5" doc.pdf` |
|
||||
| `--silent`, `-s` | Minimal output, auto-save JSON | `python main.py -s` |
|
||||
| `--debug`, `-d` | Enable detailed debug output | `python main.py -d document.pdf` |
|
||||
| `--output-json` | Custom JSON output path | `python main.py --output-json results.json` |
|
||||
|
||||
### Usage Examples
|
||||
|
||||
**Basic extraction:**
|
||||
```bash
|
||||
python main.py document.pdf
|
||||
```
|
||||
|
||||
**Process specific pages with interactive review:**
|
||||
```bash
|
||||
python main.py document.pdf -p "1,5-7" -i
|
||||
```
|
||||
|
||||
**Silent mode for batch processing:**
|
||||
```bash
|
||||
python main.py document.pdf -s --output-json batch_results.json
|
||||
```
|
||||
|
||||
**Debug mode for troubleshooting:**
|
||||
```bash
|
||||
python main.py document.pdf -d
|
||||
```
|
||||
|
||||
**Test with custom output:**
|
||||
```bash
|
||||
python main.py -t --output-json test_results.json
|
||||
```
|
||||
|
||||
### Interactive Review Mode
|
||||
|
||||
When using `-i` flag, you can:
|
||||
- **[N]ext** - Move to next highlight
|
||||
- **[P]rev** - Move to previous highlight
|
||||
- **[U]p** - Move highlight up in order
|
||||
- **[M]ove Down** - Move highlight down in order
|
||||
- **[C]olor** - Change highlight color classification
|
||||
- **[E]dit** - Edit highlight text
|
||||
- **[D]elete** - Remove highlight
|
||||
- **[O]pen Img** - View page image
|
||||
- **[S]ave&Exit** - Save changes and exit
|
||||
- **[Q]uit** - Quit without saving
|
||||
|
||||
## Output Formats
|
||||
|
||||
@ -97,9 +266,34 @@ Tabular format with columns: page, text, color, type, category
|
||||
|
||||
**Over-extraction**: The tool is designed to avoid this, but very close text might be included. Check highlight precision in your PDF.
|
||||
|
||||
**Installation Issues**:
|
||||
- Ensure Python 3.7+ is installed
|
||||
- Try using a virtual environment: `make venv-install`
|
||||
- Check dependencies: `make verify`
|
||||
|
||||
**Permission Errors**:
|
||||
- On Linux/Mac: Ensure PDF files are readable
|
||||
- On Windows: Run as administrator if needed
|
||||
|
||||
### Debug Output
|
||||
Run with detailed logging to see extraction decisions:
|
||||
python highlight_extractor.py --test
|
||||
```bash
|
||||
python main.py --test --debug
|
||||
# or
|
||||
make test-debug
|
||||
```
|
||||
|
||||
### Getting Help
|
||||
```bash
|
||||
# Show all available commands
|
||||
make help
|
||||
|
||||
# Check project status
|
||||
make status
|
||||
|
||||
# Verify installation
|
||||
make verify
|
||||
```
|
||||
|
||||
## Contributing
|
||||
|
||||
|
||||
@ -1,2 +1,2 @@
|
||||
colorama==0.4.6
|
||||
PyMuPDF==1.23.1
|
||||
PyMuPDF==1.22.3
|
||||
|
||||
BIN
test/test2.pdf
BIN
test/test2.pdf
Binary file not shown.
BIN
test/test4.pdf
Normal file
BIN
test/test4.pdf
Normal file
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user