diff --git a/.gitignore b/.gitignore index aa3c660..1da9376 100644 --- a/.gitignore +++ b/.gitignore @@ -22,3 +22,12 @@ wheels/ compose.yaml otel-collector-config.yaml + +# Agents configuration +.agents/skills/* +.claude/skills/* + +# Testing data folders + +workspace/ +data/ diff --git a/docs/howto/configure_landingai.md b/docs/howto/configure_landingai.md new file mode 100644 index 0000000..42d0ab5 --- /dev/null +++ b/docs/howto/configure_landingai.md @@ -0,0 +1,313 @@ +# How to Configure LandingAI ADE + +This guide shows you how to configure the LandingAI ADE (Agentic Document Extraction) driver for document processing, including setting default options and overriding them on a per-document basis. + +## Prerequisites + +- Parxy installed with LandingAI support: `pip install parxy[landingai]` or via UV `uv add parxy[landingai]` +- A LandingAI API key from [LandingAI](https://landing.ai/) + +## Quick Start + +### Step 1: Set Your API Key + +Create a `.env` file in your project directory: + +```bash +PARXY_LANDINGAI_API_KEY=your-api-key-here +``` + +Or set it as an environment variable: + +```bash +export PARXY_LANDINGAI_API_KEY=your-api-key-here +``` + +### Step 2: Parse a Document + +```python +from parxy_core.facade.parxy import Parxy + +doc = Parxy.parse("document.pdf", driver_name="landingai") +print(f"Processed {len(doc.pages)} pages") +``` + +## Configuration Options + +LandingAI ADE supports configuration options that control API connectivity. These can be set via environment variables or programmatic configuration. 
+ +### Environment Variables + +All LandingAI configuration uses environment variables with the `PARXY_LANDINGAI_` prefix: + +| Variable | Type | Default | Description | +|----------|------|---------|-------------| +| `PARXY_LANDINGAI_API_KEY` | string | None | Your LandingAI API key | +| `PARXY_LANDINGAI_ENVIRONMENT` | string | `eu` | API environment (`production` or `eu`) | +| `PARXY_LANDINGAI_BASE_URL` | string | None | Custom API endpoint (overrides environment) | + +### Environment Options + +LandingAI offers two hosted environments: + +| Environment | API Endpoint | Description | +|-------------|--------------|-------------| +| `production` | `https://api.va.landing.ai` | US-based production environment | +| `eu` | `https://api.va.eu-west-1.landing.ai` | EU-based environment (default) | + +To use the US production environment: + +```bash +PARXY_LANDINGAI_ENVIRONMENT=production +``` + +### Custom Base URL + +If you need to use a custom endpoint (e.g., self-hosted or enterprise deployment), set the base URL directly and set environment to `None`: + +```bash +PARXY_LANDINGAI_BASE_URL=https://your-custom-endpoint.example.com +PARXY_LANDINGAI_ENVIRONMENT= +``` + +## Document Structure and Roles + +LandingAI ADE extracts structured content from documents and categorizes each chunk by type. Parxy maps these types to WAI-ARIA document structure roles for semantic understanding. 
+ +### Chunk Type Mappings + +| LandingAI Type | Parxy Role | Description | +|----------------|------------|-------------| +| `text` | `paragraph` | Regular text content | +| `table` | `table` | Tabular data | +| `figure` | `figure` | Images and diagrams | +| `logo` | `figure` | Company logos (DPT-2 model) | +| `card` | `figure` | ID cards, driver licenses (DPT-2 model) | +| `attestation` | `figure` | Signatures, stamps, seals (DPT-2 model) | +| `scan_code` | `figure` | QR codes, barcodes (DPT-2 model) | +| `marginalia` | `generic` | Mixed content in margins | +| `heading` | `heading` | Section headings | +| `title` | `doc-title` | Document title | +| `subtitle` | `doc-subtitle` | Document subtitle | +| `chapter` | `doc-chapter` | Chapter markers | +| `page-header` / `header` | `doc-pageheader` | Page headers | +| `page-footer` / `footer` | `doc-pagefooter` | Page footers | +| `page-number` | `doc-pagefooter` | Page numbers | +| `footnote` / `note` | `doc-footnote` | Footnotes | +| `endnote` | `doc-endnotes` | Endnotes | + +## Programmatic Configuration + +You can configure the driver programmatically: + +```python +from parxy_core.facade.parxy import Parxy +from parxy_core.models.config import LandingAIConfig + +# Create custom configuration for EU environment +config = LandingAIConfig( + api_key="your-api-key", + environment="eu", +) + +# Get driver with custom config +driver = Parxy.driver("landingai", config=config) + +# Parse documents +doc = driver.handle("document.pdf", level="block") +``` + +### Using US Production Environment + +```python +from parxy_core.facade.parxy import Parxy +from parxy_core.models.config import LandingAIConfig + +config = LandingAIConfig( + api_key="your-api-key", + environment="production", # Use US endpoint +) + +driver = Parxy.driver("landingai", config=config) +doc = driver.handle("document.pdf") +``` + +### Using Custom Endpoint + +```python +from parxy_core.facade.parxy import Parxy +from parxy_core.models.config import 
LandingAIConfig + +config = LandingAIConfig( + api_key="your-api-key", + environment=None, # Disable default environment + base_url="https://your-custom-endpoint.example.com", +) + +driver = Parxy.driver("landingai", config=config) +doc = driver.handle("document.pdf") +``` + +## Cost Estimation + +Parxy automatically tracks parsing costs in the document metadata: + +```python +doc = Parxy.parse("document.pdf", driver_name="landingai") + +# Access cost information +metadata = doc.parsing_metadata +print(f"Credit usage: {metadata.get('cost_estimation')} {metadata.get('cost_estimation_unit')}") +``` + +## Document Metadata + +After parsing, the document contains additional metadata from LandingAI ADE: + +```python +doc = Parxy.parse("document.pdf", driver_name="landingai") + +metadata = doc.parsing_metadata + +# ADE-specific details +details = metadata.get('ade_details', {}) +print(f"Processing time: {details.get('duration_ms')} ms") +print(f"Job ID: {details.get('job_id')}") +print(f"Page count: {details.get('page_count')}") +print(f"API version: {details.get('version')}") +print(f"Filename: {details.get('filename')}") + +# Check for any failed pages (partial content) +if 'failed_pages' in details: + print(f"Failed pages: {details.get('failed_pages')}") +``` + +## Working with Extracted Content + +### Accessing Blocks by Role + +```python +doc = Parxy.parse("document.pdf", driver_name="landingai") + +for page in doc.pages: + # Get all tables + tables = [b for b in page.blocks if b.role == 'table'] + + # Get all headings + headings = [b for b in page.blocks if b.role == 'heading'] + + # Get document title + titles = [b for b in page.blocks if b.role == 'doc-title'] + + # Get figures (images, logos, etc.) 
+ figures = [b for b in page.blocks if b.role == 'figure'] + + print(f"Page {page.number}: {len(tables)} tables, {len(headings)} headings") +``` + +### Accessing Bounding Boxes + +LandingAI ADE provides bounding box coordinates for each extracted chunk: + +```python +doc = Parxy.parse("document.pdf", driver_name="landingai") + +for page in doc.pages: + for block in page.blocks: + if block.bbox: + print(f"Block at ({block.bbox.x0}, {block.bbox.y0}) - ({block.bbox.x1}, {block.bbox.y1})") + print(f" Type: {block.category}") + print(f" Role: {block.role}") + print(f" Text: {block.text[:50]}...") +``` + +### Accessing Original Chunk Data + +The original LandingAI chunk data is preserved in `source_data`: + +```python +doc = Parxy.parse("document.pdf", driver_name="landingai") + +for page in doc.pages: + for block in page.blocks: + original = block.source_data + # Access any LandingAI-specific fields + print(f"Original type: {original.get('type')}") + print(f"Markdown: {original.get('markdown')}") +``` + +## Troubleshooting + +### Authentication Errors + +If you see authentication errors: + +1. Verify your API key is correct +2. Check the key has not expired +3. Ensure you're using the correct environment for your account + +```python +# Test authentication +from parxy_core.facade.parxy import Parxy +from parxy_core.models.config import LandingAIConfig + +config = LandingAIConfig(api_key="your-key", environment="eu") +driver = Parxy.driver("landingai", config=config) +# If no error, authentication is working +``` + +### Rate Limiting + +If you encounter 429 errors (rate limiting): + +1. Reduce the frequency of API calls +2. Implement retry logic with exponential backoff +3. Contact LandingAI for higher rate limits if needed + +### Quota Exceeded + +If you see 402 errors (quota exceeded): + +1. Check your account's remaining credits +2. Purchase additional credits from LandingAI + +### Input Validation Errors + +If you see 422 errors (input validation): + +1. 
Ensure the file format is supported (PDF, images) +2. Check the file is not corrupted +3. Verify the file size is within limits + +### Partial Content / Failed Pages + +If some pages fail to process: + +```python +doc = Parxy.parse("document.pdf", driver_name="landingai") + +details = doc.parsing_metadata.get('ade_details', {}) +if 'failed_pages' in details: + failed = details['failed_pages'] + print(f"Warning: Pages {failed} failed to process") +``` + +This can happen with: +- Corrupted pages +- Pages with unsupported content +- Processing timeouts on complex pages + +### Wrong Environment + +If API calls fail with connection errors: + +1. Verify the environment setting matches your account region +2. Try explicitly setting the base URL +3. Check network connectivity to the LandingAI API + +## See Also + +- [LandingAI ADE Documentation](https://docs.landing.ai/ade/) +- [LandingAI ADE JSON Response](https://docs.landing.ai/ade/ade-json-response.md) +- [Document Structure Roles](../explanation/document-roles.md) +- [Getting Started Tutorial](../tutorials/getting_started.md) diff --git a/docs/howto/configure_llmwhisperer.md b/docs/howto/configure_llmwhisperer.md new file mode 100644 index 0000000..6b40b03 --- /dev/null +++ b/docs/howto/configure_llmwhisperer.md @@ -0,0 +1,268 @@ +# How to Configure LLMWhisperer + +This guide shows you how to configure the LLMWhisperer driver for document processing, including setting default options and overriding them on a per-document basis. 
+ +## Prerequisites + +- Parxy installed with LLMWhisperer support: `pip install parxy[llmwhisperer]` or via UV `uv add parxy[llmwhisperer]` +- An LLMWhisperer API key from [Unstract](https://unstract.com/) + +## Quick Start + +### Step 1: Set Your API Key + +Create a `.env` file in your project directory: + +```bash +PARXY_LLMWHISPERER_API_KEY=your-api-key-here +``` + +Or set it as an environment variable: + +```bash +export PARXY_LLMWHISPERER_API_KEY=your-api-key-here +``` + +### Step 2: Parse a Document + +```python +from parxy_core.facade.parxy import Parxy + +doc = Parxy.parse("document.pdf", driver_name="llmwhisperer") +print(f"Processed {len(doc.pages)} pages") +``` + +## Configuration Options + +LLMWhisperer supports configuration options that control parsing behavior. These can be set at two levels: + +1. **Default configuration** - Applied to all documents via environment variables or config +2. **Per-call overrides** - Applied to specific documents via kwargs + +### Environment Variables + +All LLMWhisperer configuration uses environment variables with the `PARXY_LLMWHISPERER_` prefix: + +| Variable | Type | Default | Description | +|----------|------|---------|-------------| +| `PARXY_LLMWHISPERER_API_KEY` | string | None | Your LLMWhisperer API key | +| `PARXY_LLMWHISPERER_BASE_URL` | string | `https://llmwhisperer-api.eu-west.unstract.com/api/v2` | API endpoint | +| `PARXY_LLMWHISPERER_LOGGING_LEVEL` | string | `INFO` | Client logging level (`DEBUG`, `INFO`, `WARNING`, `ERROR`) | +| `PARXY_LLMWHISPERER_MODE` | string | `form` | Default parsing mode (see below) | + +### Parsing Modes + +LLMWhisperer offers several parsing modes with different accuracy and cost trade-offs: + +- `native_text`: Extract native/copyable text only (fastest, cheapest) +- `low_cost`: Basic OCR extraction +- `high_quality`: High quality OCR with better accuracy +- `form`: Optimized for forms and structured documents (default) +- `table`: Optimized for documents with tables + +## 
Per-Call Configuration Overrides + +You can override the default parsing mode for specific documents by passing kwargs to `Parxy.parse()`: + +```python +from parxy_core.facade.parxy import Parxy + +# Use default configuration (form mode) +doc1 = Parxy.parse("simple.pdf", driver_name="llmwhisperer") + +# Override mode for a specific document +doc2 = Parxy.parse( + "scanned-document.pdf", + driver_name="llmwhisperer", + mode="high_quality", # Use high quality OCR for scanned docs +) + +# Use table mode for documents with many tables +doc3 = Parxy.parse( + "financial-report.pdf", + driver_name="llmwhisperer", + mode="table", +) + +# Use native text mode for simple PDFs with selectable text +doc4 = Parxy.parse( + "digital-document.pdf", + driver_name="llmwhisperer", + mode="native_text", # Fastest and cheapest +) +``` + +### Supported Per-Call Options + +The following options can be overridden per-call: + +- `mode` - Parsing mode (`native_text`, `low_cost`, `high_quality`, `form`, `table`) + +## Use Cases + +### Scanned Documents + +For scanned documents that require OCR: + +```python +doc = Parxy.parse( + "scanned.pdf", + driver_name="llmwhisperer", + mode="high_quality", +) +``` + +### Forms and Structured Documents + +For forms, applications, or structured documents: + +```python +doc = Parxy.parse( + "application-form.pdf", + driver_name="llmwhisperer", + mode="form", +) +``` + +### Documents with Tables + +For spreadsheets, financial reports, or documents with complex tables: + +```python +doc = Parxy.parse( + "quarterly-report.pdf", + driver_name="llmwhisperer", + mode="table", +) +``` + +### Digital PDFs with Selectable Text + +For PDFs created digitally (not scanned) where text is already selectable: + +```python +doc = Parxy.parse( + "digital-report.pdf", + driver_name="llmwhisperer", + mode="native_text", # Skip OCR, extract text directly +) +``` + +### Cost-Optimized Processing + +When processing many documents and cost is a concern: + +```python +doc = 
Parxy.parse( + "bulk-document.pdf", + driver_name="llmwhisperer", + mode="low_cost", +) +``` + +## Cost Estimation + +Parxy automatically tracks parsing costs in the document metadata: + +```python +doc = Parxy.parse("document.pdf", driver_name="llmwhisperer") + +# Access cost information +metadata = doc.parsing_metadata +print(f"Estimated cost: {metadata.get('cost_estimation')} {metadata.get('cost_estimation_unit')}") +print(f"Parsing mode: {metadata.get('parsing_mode')}") +print(f"Pages processed: {metadata.get('pages_processed')}") +``` + +### Usage Information + +LLMWhisperer also provides usage information from the API: + +```python +doc = Parxy.parse("document.pdf", driver_name="llmwhisperer") + +# Access usage information +usage_info = doc.parsing_metadata.get('usage_info') +if usage_info: + print(f"Usage details: {usage_info}") +``` + +## Document Metadata + +After parsing, the document contains additional metadata from LLMWhisperer: + +```python +doc = Parxy.parse("document.pdf", driver_name="llmwhisperer") + +metadata = doc.parsing_metadata + +# Whisper-specific metadata +print(f"Whisper hash: {metadata.get('whisper_hash')}") + +# Processing details +details = metadata.get('whisper_details', {}) +print(f"Processing time: {details.get('processing_time_in_seconds')} seconds") +print(f"Total pages: {details.get('total_pages')}") +print(f"Processed pages: {details.get('processed_pages')}") +print(f"File size: {details.get('upload_file_size_in_kb')} KB") +``` + +## Troubleshooting + +### Authentication Errors + +If you see 401/403 errors: + +1. Verify your API key is correct +2. Check the key has not expired +3. Ensure the key has access to the requested features + + +### Rate Limiting + +If you encounter 429 errors (rate limiting): + +1. Reduce the frequency of API calls +2. Implement retry logic with exponential backoff +3. Contact Unstract for higher rate limits if needed + +### Quota Exceeded + +If you see 402 errors (quota exceeded): + +1. 
Check your account's remaining credits +2. Purchase additional credits from Unstract +3. Use lower-cost parsing modes for bulk processing + +### Missing or Poor Quality Text + +If text extraction is incomplete or low quality: + +1. Use `high_quality` mode for better OCR accuracy +2. Try `form` mode for structured documents +3. Try `table` mode for documents with tables +4. Ensure the document is not corrupted + +### Slow Processing + +If parsing is slow: + +1. Use `native_text` mode for digital PDFs (skips OCR) +2. Use `low_cost` mode for faster processing +3. Consider breaking large documents into smaller chunks + +## Supported Extraction Levels + +LLMWhisperer supports the following extraction levels: + +- `page` - Page-level text extraction +- `block` - Block-level extraction (internally uses page-level) + +Note: LLMWhisperer returns page-level text. When `block` level is requested, it internally uses `page` level as that is the native output format. + +## See Also + +- [LLMWhisperer Documentation](https://docs.unstract.com/llmwhisperer/) +- [LLMWhisperer Modes](https://docs.unstract.com/llmwhisperer/llm_whisperer/llm_whisperer_modes/) +- [Unstract Pricing](https://unstract.com/pricing/) +- [Getting Started Tutorial](../tutorials/getting_started.md) diff --git a/docs/howto/configure_pdfact.md b/docs/howto/configure_pdfact.md new file mode 100644 index 0000000..8330c15 --- /dev/null +++ b/docs/howto/configure_pdfact.md @@ -0,0 +1,291 @@ +# How to Configure PdfAct + +This guide shows you how to configure the PdfAct driver for document processing using a self-hosted or remote PdfAct service. + +## Prerequisites + +- Parxy installed (PdfAct is included in the base installation) +- A running PdfAct service instance (see [PdfAct](https://github.com/data-house/pdfact/) for setup) + +## Quick Start + +### Step 1: Start a PdfAct Service + +PdfAct runs as a REST API service. 
You can run it locally using Docker: + +```bash +docker run -p 4567:4567 ghcr.io/data-house/pdfact:main +``` + +### Step 2: Configure the Base URL + +If running locally on the default port, no configuration is needed. Otherwise, create a `.env` file: + +```bash +PARXY_PDFACT_BASE_URL=http://your-pdfact-server:4567/ +``` + +Or set it as an environment variable: + +```bash +export PARXY_PDFACT_BASE_URL=http://your-pdfact-server:4567/ +``` + +### Step 3: Parse a Document + +```python +from parxy_core.facade.parxy import Parxy + +doc = Parxy.parse("document.pdf", driver_name="pdfact") +print(f"Processed {len(doc.pages)} pages") +``` + +## Configuration Options + +PdfAct configuration is minimal compared to cloud-based services since it's typically self-hosted. + +### Environment Variables + +All PdfAct configuration uses environment variables with the `PARXY_PDFACT_` prefix: + +| Variable | Type | Default | Description | +|----------|------|---------|-------------| +| `PARXY_PDFACT_BASE_URL` | string | `http://localhost:4567/` | The base URL of the PdfAct API | +| `PARXY_PDFACT_API_KEY` | string | None | Authentication key (if your PdfAct instance requires it) | + +## Supported Extraction Levels + +PdfAct supports the following extraction levels: + +| Level | Description | +|-------|-------------| +| `page` | Extract text at page level only | +| `paragraph` | Extract text as paragraphs (alias for block) | +| `block` | Extract text as blocks with layout and style information | + +```python +# Page-level extraction +doc = Parxy.parse("document.pdf", driver_name="pdfact", level="page") + +# Block-level extraction (default) +doc = Parxy.parse("document.pdf", driver_name="pdfact", level="block") +``` + +## Input Types + +PdfAct accepts two types of input: + +### Local Files + +```python +doc = Parxy.parse("/path/to/document.pdf", driver_name="pdfact") +``` + +### URLs + +```python +doc = Parxy.parse("https://example.com/document.pdf", driver_name="pdfact") +``` + +**Note**: 
BytesIO and bytes inputs are not currently supported by the PdfAct driver. + +## Document Structure Roles + +PdfAct extracts semantic roles from documents. Parxy maps these to WAI-ARIA document structure roles for standardized output: + +| PdfAct Category | WAI-ARIA Role | Description | +|-----------------|---------------|-------------| +| `body` | `paragraph` | Main body text | +| `heading` | `heading` | Section headings | +| `title` | `doc-title` | Document title | +| `subtitle` | `doc-subtitle` | Document subtitle | +| `abstract` | `doc-abstract` | Document abstract | +| `caption` | `caption` | Figure/table captions | +| `figure` | `figure` | Figures and images | +| `table` | `table` | Tables | +| `formula` | `math` | Mathematical formulas | +| `reference` | `doc-biblioref` | Bibliography references | +| `footnote` | `doc-footnote` | Footnotes | +| `toc` | `doc-toc` | Table of contents | +| `appendix` | `doc-appendix` | Appendix sections | +| `itemize-item` | `listitem` | List items | +| `header` | `doc-pageheader` | Page headers | +| `footer` | `doc-pagefooter` | Page footers | + +Access the role in your code: + +```python +doc = Parxy.parse("document.pdf", driver_name="pdfact") + +for page in doc.pages: + for block in page.blocks: + print(f"Role: {block.role}, Text: {block.text[:50]}...") +``` + +## Style Information + +PdfAct extracts rich style information for each text block: + +```python +doc = Parxy.parse("document.pdf", driver_name="pdfact") + +for page in doc.pages: + for block in page.blocks: + if block.style: + print(f"Font: {block.style.font_name}") + print(f"Size: {block.style.font_size}") + print(f"Color: {block.style.color}") + print(f"Style: {block.style.font_style}") # e.g., 'italic' + print(f"Weight: {block.style.weight}") # e.g., 400 for bold +``` + +## Bounding Box Information + +Each text block includes bounding box coordinates: + +```python +doc = Parxy.parse("document.pdf", driver_name="pdfact") + +for page in doc.pages: + print(f"Page 
dimensions: {page.width} x {page.height}") + for block in page.blocks: + if block.bbox: + print(f"Block at: ({block.bbox.x0}, {block.bbox.y0}) to ({block.bbox.x1}, {block.bbox.y1})") +``` + +## Use Cases + +### Scientific Papers + +PdfAct excels at extracting structured content from scientific papers: + +```python +doc = Parxy.parse("paper.pdf", driver_name="pdfact") + +# Find abstract +for page in doc.pages: + for block in page.blocks: + if block.role == "doc-abstract": + print(f"Abstract: {block.text}") + +# Find references +for page in doc.pages: + for block in page.blocks: + if block.role == "doc-biblioref": + print(f"Reference: {block.text}") +``` + +### Processing Remote PDFs + +When processing PDFs from URLs, PdfAct handles the download internally: + +```python +doc = Parxy.parse( + "https://arxiv.org/pdf/2301.00001.pdf", + driver_name="pdfact" +) +``` + +### Filtering by Content Type + +Filter blocks by their semantic role: + +```python +doc = Parxy.parse("document.pdf", driver_name="pdfact") + +# Get only main body text (no headers, footers, footnotes) +body_text = [] +skip_roles = {"doc-pageheader", "doc-pagefooter", "doc-footnote"} + +for page in doc.pages: + for block in page.blocks: + if block.role not in skip_roles: + body_text.append(block.text) + +clean_text = "\n".join(body_text) +``` + +## Self-Hosting PdfAct + +### Docker Deployment + +The simplest way to run PdfAct is using Docker. + +```bash +# Basic deployment +docker run -d -p 4567:4567 --name pdfact ghcr.io/data-house/pdfact:main + +# With resource limits +docker run -d -p 4567:4567 --name pdfact \ + --memory="4g" \ + --cpus="2" \ + ghcr.io/data-house/pdfact:main +``` + +### Docker Compose + +Parxy offers the `parxy docker` command to generate a Docker Compose file for deploying PdfAct. + +## Troubleshooting + +### Connection Errors + +If you see connection errors: + +1. Verify PdfAct is running: `curl http://localhost:4567/` +2. Check the base URL configuration +3. 
Ensure no firewall blocks the port + +```python +# Test connection +import requests + +try: + response = requests.get("http://localhost:4567/") + print(f"PdfAct status: {response.status_code}") +except requests.ConnectionError: + print("Cannot connect to PdfAct service") +``` + +### Invalid URL Errors + +The driver validates URLs before connecting: + +```python +# This will raise ValueError +try: + config = PdfActConfig(base_url="not-a-valid-url") +except ValueError as e: + print(f"Invalid URL: {e}") +``` + +### Unsupported Input Types + +BytesIO and bytes inputs are not supported: + +```python +# This will raise NotImplementedError +import io + +with open("document.pdf", "rb") as f: + data = io.BytesIO(f.read()) + +try: + doc = Parxy.parse(data, driver_name="pdfact") +except NotImplementedError: + print("Use file path or URL instead") +``` + +### Missing Text + +If text is missing from output: + +1. Verify the PDF contains extractable text (not just images) +2. Check if PdfAct logs show parsing errors +3. Try with a different PDF to isolate the issue + +## See Also + +- [PdfAct GitHub Repository](https://github.com/data-house/pdfact) +- [Document Structure Roles](../explanation/document-roles.md) +- [Getting Started Tutorial](../tutorials/getting_started.md) diff --git a/docs/howto/configure_pymupdf.md b/docs/howto/configure_pymupdf.md new file mode 100644 index 0000000..1a86ff1 --- /dev/null +++ b/docs/howto/configure_pymupdf.md @@ -0,0 +1,367 @@ +# How to Configure PyMuPDF + +This guide shows you how to use the PyMuPDF driver for document processing. PyMuPDF is the default driver in Parxy and requires no external services or API keys. 
+ +## Prerequisites + +- Parxy installed (PyMuPDF is included in the base installation) + +## Quick Start + +PyMuPDF works out of the box with no configuration required: + +```python +from parxy_core.facade.parxy import Parxy + +# PyMuPDF is the default driver +doc = Parxy.parse("document.pdf") +print(f"Processed {len(doc.pages)} pages") + +# Or explicitly specify the driver +doc = Parxy.parse("document.pdf", driver_name="pymupdf") +``` + +## Supported Extraction Levels + +PyMuPDF supports the most comprehensive extraction hierarchy of all Parxy drivers: + +| Level | Description | Includes | +|-------|-------------|----------| +| `page` | Page-level text only | Page text and dimensions | +| `block` | Text blocks | Pages + blocks with bounding boxes | +| `line` | Text lines | Pages + blocks + lines | +| `span` | Text spans | Pages + blocks + lines + spans with styling | +| `character` | Individual characters | Full hierarchy including each character | + +```python +# Page-level extraction (fastest) +doc = Parxy.parse("document.pdf", level="page") + +# Block-level extraction (default) +doc = Parxy.parse("document.pdf", level="block") + +# Line-level extraction +doc = Parxy.parse("document.pdf", level="line") + +# Span-level extraction (includes font styling) +doc = Parxy.parse("document.pdf", level="span") + +# Character-level extraction (most detailed, slowest) +doc = Parxy.parse("document.pdf", level="character") +``` + +## Input Types + +PyMuPDF accepts multiple input formats: + +### Local Files + +```python +doc = Parxy.parse("/path/to/document.pdf") +``` + +### BytesIO Streams + +```python +import io + +with open("document.pdf", "rb") as f: + stream = io.BytesIO(f.read()) + +doc = Parxy.parse(stream, driver_name="pymupdf") +``` + +### Raw Bytes + +```python +with open("document.pdf", "rb") as f: + data = f.read() + +doc = Parxy.parse(data, driver_name="pymupdf") +``` + +## Document Metadata + +PyMuPDF extracts PDF metadata automatically: + +```python +doc = 
Parxy.parse("document.pdf") + +if doc.metadata: + print(f"Title: {doc.metadata.title}") + print(f"Author: {doc.metadata.author}") + print(f"Subject: {doc.metadata.subject}") + print(f"Keywords: {doc.metadata.keywords}") + print(f"Creator: {doc.metadata.creator}") + print(f"Producer: {doc.metadata.producer}") + print(f"Created: {doc.metadata.created_at}") + print(f"Modified: {doc.metadata.updated_at}") +``` + +## Style Information + +At the `span` and `character` levels, PyMuPDF extracts rich styling information: + +```python +doc = Parxy.parse("document.pdf", level="span") + +for page in doc.pages: + for block in page.blocks: + for line in block.lines: + for span in line.spans: + if span.style: + print(f"Font: {span.style.font_name}") + print(f"Size: {span.style.font_size}") + print(f"Color: {span.style.color}") + print(f"Italic: {span.style.font_style == 'italic'}") + print(f"Bold: {span.style.weight == 400}") + print(f"Alpha: {span.style.alpha}") +``` + +## Bounding Box Information + +All extraction levels include bounding box coordinates: + +```python +doc = Parxy.parse("document.pdf", level="block") + +for page in doc.pages: + print(f"Page {page.number}: {page.width} x {page.height}") + for block in page.blocks: + if block.bbox: + print(f" Block: ({block.bbox.x0}, {block.bbox.y0}) to ({block.bbox.x1}, {block.bbox.y1})") +``` + +## Character-Level Extraction + +For applications requiring precise character positioning (e.g., text overlay, redaction): + +```python +doc = Parxy.parse("document.pdf", level="character") + +for page in doc.pages: + for block in page.blocks: + for line in block.lines: + for span in line.spans: + for char in span.characters: + print(f"'{char.text}' at ({char.bbox.x0}, {char.bbox.y0})") +``` + +## Source Data + +PyMuPDF preserves original data in the `source_data` field: + +```python +doc = Parxy.parse("document.pdf", level="span") + +for page in doc.pages: + for block in page.blocks: + # Block number from PyMuPDF + print(f"Block number: 
{block.source_data.get('number')}") + + for line in block.lines: + # Writing mode and direction + print(f"Writing mode: {line.source_data.get('wmode')}") + print(f"Direction: {line.source_data.get('dir')}") + + for span in line.spans: + # Font flags, bidirectional info, metrics + print(f"Flags: {span.source_data.get('flags')}") + print(f"Ascender: {span.source_data.get('ascender')}") + print(f"Descender: {span.source_data.get('descender')}") +``` + +## Parsing Warnings + +PyMuPDF captures PDF parsing warnings: + +```python +doc = Parxy.parse("document.pdf") + +if doc.parsing_metadata and doc.parsing_metadata.get('warnings'): + print(f"Warnings: {doc.parsing_metadata['warnings']}") +``` + +## Use Cases + +### Text Extraction + +Simple text extraction from a PDF: + +```python +doc = Parxy.parse("document.pdf", level="page") + +full_text = "\n\n".join(page.text for page in doc.pages) +print(full_text) +``` + +### Structured Content Analysis + +Analyze document structure at block level: + +```python +doc = Parxy.parse("document.pdf", level="block") + +for page in doc.pages: + print(f"\n=== Page {page.number} ===") + for i, block in enumerate(page.blocks): + print(f"Block {i}: {block.text[:50]}...") +``` + +### Font Analysis + +Identify fonts used in a document: + +```python +doc = Parxy.parse("document.pdf", level="span") + +fonts = set() +for page in doc.pages: + for block in page.blocks: + for line in block.lines: + for span in line.spans: + if span.style and span.style.font_name: + fonts.add(span.style.font_name) + +print(f"Fonts used: {fonts}") +``` + +### Text Position Mapping + +Map text to coordinates for overlay or annotation: + +```python +doc = Parxy.parse("document.pdf", level="line") + +for page in doc.pages: + for block in page.blocks: + for line in block.lines: + if "keyword" in line.text.lower(): + print(f"Found at page {page.number}: {line.bbox}") +``` + +### Highlighting Detection + +Detect text styling patterns: + +```python +doc = 
Parxy.parse("document.pdf", level="span") + +bold_text = [] +italic_text = [] + +for page in doc.pages: + for block in page.blocks: + for line in block.lines: + for span in line.spans: + if span.style: + if span.style.weight == 400: + bold_text.append(span.text) + if span.style.font_style == "italic": + italic_text.append(span.text) + +print(f"Bold sections: {len(bold_text)}") +print(f"Italic sections: {len(italic_text)}") +``` + +## Performance Considerations + +### Extraction Level Impact + +Higher extraction levels require more processing: + +| Level | Relative Speed | Memory Usage | +|-------|---------------|--------------| +| `page` | Fastest | Lowest | +| `block` | Fast | Low | +| `line` | Moderate | Moderate | +| `span` | Slower | Higher | +| `character` | Slowest | Highest | + +Choose the minimum level needed for your use case. + +### Large Documents + +For large documents, consider processing page by page: + +```python +import pymupdf + +# Direct PyMuPDF access for memory efficiency +with pymupdf.open("large-document.pdf") as pdf: + for page_num in range(len(pdf)): + page = pdf[page_num] + text = page.get_text() + # Process page text + del text # Free memory +``` + +## Comparison with Other Drivers + +Choose PyMuPDF when: + +- You need fast, local processing +- Privacy is important (no data leaves your system) +- You need detailed extraction (character-level) +- You need PDF metadata +- You want the simplest setup (no configuration) + +## Troubleshooting + +### File Not Found + +```python +from parxy_core.exceptions import FileNotFoundException + +try: + doc = Parxy.parse("missing.pdf") +except FileNotFoundException as e: + print(f"File not found: {e}") +``` + +### Corrupted PDFs + +PyMuPDF handles many malformed PDFs gracefully. 
Check warnings: + +```python +doc = Parxy.parse("possibly-corrupted.pdf") + +if doc.parsing_metadata: + warnings = doc.parsing_metadata.get('warnings', '') + if warnings: + print(f"PDF warnings: {warnings}") +``` + +### Empty Pages + +Some PDFs contain image-only pages with no extractable text: + +```python +doc = Parxy.parse("scanned-document.pdf") + +for page in doc.pages: + if not page.text.strip(): + print(f"Page {page.number} has no extractable text (may be scanned)") +``` + +For scanned documents, consider using LlamaParse or LLMWhisperer with OCR support. + +### Memory Issues + +For very large documents or batch processing: + +```python +import gc + +for pdf_path in pdf_files: + doc = Parxy.parse(pdf_path) + # Process document + del doc + gc.collect() # Force garbage collection +``` + +## See Also + +- [PyMuPDF Documentation](https://pymupdf.readthedocs.io/) +- [Getting Started Tutorial](../tutorials/getting_started.md) +- [Document Model Reference](../reference/document_model.md) diff --git a/docs/howto/configure_unstructured_local.md b/docs/howto/configure_unstructured_local.md new file mode 100644 index 0000000..bc50a19 --- /dev/null +++ b/docs/howto/configure_unstructured_local.md @@ -0,0 +1,415 @@ +# How to Configure Unstructured Local + +This guide shows you how to configure the Unstructured Local driver for document processing. This driver uses the open-source `unstructured` library for local document parsing without requiring external services. 
+ +## Prerequisites + +- Parxy installed with Unstructured support: `pip install parxy[unstructured_local]` or via UV `uv add parxy[unstructured_local]` + +## Quick Start + +### Step 1: Install Dependencies + +```bash +pip install parxy[unstructured_local] +``` + +Or with UV: + +```bash +uv add parxy[unstructured_local] +``` + +### Step 2: Parse a Document + +```python +from parxy_core.facade.parxy import Parxy + +doc = Parxy.parse("document.pdf", driver_name="unstructured_local") +print(f"Processed {len(doc.pages)} pages") +``` + +## Configuration Options + +The Unstructured Local driver has minimal configuration since it runs locally. Environment variables use the `PARXY_UNSTRUCTURED_LOCAL_` prefix. + +Currently, no specific configuration options are required. The driver uses sensible defaults from the `unstructured` library. + +## Supported Extraction Levels + +| Level | Description | +|-------|-------------| +| `page` | Extract text at page level only | +| `block` | Extract text as blocks with layout information | + +```python +# Page-level extraction +doc = Parxy.parse("document.pdf", driver_name="unstructured_local", level="page") + +# Block-level extraction (default) +doc = Parxy.parse("document.pdf", driver_name="unstructured_local", level="block") +``` + +## Input Types + +The Unstructured Local driver accepts multiple input formats: + +### Local Files + +```python +doc = Parxy.parse("/path/to/document.pdf", driver_name="unstructured_local") +``` + +### BytesIO Streams + +```python +import io + +with open("document.pdf", "rb") as f: + stream = io.BytesIO(f.read()) + +doc = Parxy.parse(stream, driver_name="unstructured_local") +``` + +### Raw Bytes + +```python +with open("document.pdf", "rb") as f: + data = f.read() + +doc = Parxy.parse(data, driver_name="unstructured_local") +``` + +## Supported File Formats + +The `unstructured` library supports many document formats: + +- PDF (`.pdf`) +- Microsoft Word (`.docx`, `.doc`) +- PowerPoint (`.pptx`, `.ppt`) 
+- Excel (`.xlsx`, `.xls`) +- Plain Text (`.txt`) +- HTML (`.html`, `.htm`) +- Markdown (`.md`) +- Rich Text Format (`.rtf`) +- Email (`.eml`, `.msg`) +- Images (`.png`, `.jpg` - with OCR) + +```python +# Process different file types +doc_pdf = Parxy.parse("report.pdf", driver_name="unstructured_local") +doc_word = Parxy.parse("document.docx", driver_name="unstructured_local") +doc_html = Parxy.parse("page.html", driver_name="unstructured_local") +``` + +## Element Categories + +Unstructured identifies semantic categories for each text block: + +```python +doc = Parxy.parse("document.pdf", driver_name="unstructured_local") + +for page in doc.pages: + for block in page.blocks: + print(f"Category: {block.category}") + print(f"Text: {block.text[:50]}...") +``` + +Common categories include: +- `Title` - Document or section titles +- `NarrativeText` - Body paragraphs +- `ListItem` - List items +- `Table` - Table content +- `Image` - Image elements +- `Header` - Page headers +- `Footer` - Page footers +- `FigureCaption` - Figure captions +- `Address` - Address blocks +- `EmailAddress` - Email addresses + +## Bounding Box Information + +Each text block includes coordinate information: + +```python +doc = Parxy.parse("document.pdf", driver_name="unstructured_local") + +for page in doc.pages: + for block in page.blocks: + if block.bbox: + print(f"Block at: ({block.bbox.x0}, {block.bbox.y0})") + print(f"Size: {block.bbox.x1 - block.bbox.x0} x {block.bbox.y1 - block.bbox.y0}") +``` + +## Language Detection + +The driver detects document language automatically: + +```python +doc = Parxy.parse("document.pdf", driver_name="unstructured_local") + +print(f"Document language: {doc.language}") +``` + +## Source Data + +Original Unstructured metadata is preserved: + +```python +doc = Parxy.parse("document.pdf", driver_name="unstructured_local") + +for page in doc.pages: + for block in page.blocks: + metadata = block.source_data + print(f"Filename: {metadata.get('filename')}") + print(f"Page: 
{metadata.get('page_number')}") + print(f"Languages: {metadata.get('languages')}") +``` + +## Passing Options to Unstructured + +You can pass additional options directly to the `unstructured` partitioner: + +```python +doc = Parxy.parse( + "document.pdf", + driver_name="unstructured_local", + # Options passed to unstructured.partition.auto.partition() + strategy="hi_res", # Use high-resolution strategy + infer_table_structure=True, # Extract table structure + include_page_breaks=True, # Include page break elements +) +``` + +### Common Options + +| Option | Type | Description | +|--------|------|-------------| +| `strategy` | str | Partitioning strategy: `auto`, `fast`, `hi_res`, `ocr_only` | +| `infer_table_structure` | bool | Extract table structure as HTML | +| `include_page_breaks` | bool | Include page break elements | +| `languages` | list | Languages for OCR (e.g., `["eng", "deu"]`) | +| `ocr_languages` | str | Tesseract language codes | +| `encoding` | str | Text encoding for text files | + +## Use Cases + +### Multi-Format Processing + +Process various document types uniformly: + +```python +from pathlib import Path + +documents = Path("docs/").glob("*.*") + +for doc_path in documents: + if doc_path.suffix.lower() in [".pdf", ".docx", ".html", ".txt"]: + doc = Parxy.parse(str(doc_path), driver_name="unstructured_local") + print(f"{doc_path.name}: {len(doc.pages)} pages") +``` + +### Content Classification + +Classify content by element type: + +```python +doc = Parxy.parse("document.pdf", driver_name="unstructured_local") + +titles = [] +body_text = [] +lists = [] + +for page in doc.pages: + for block in page.blocks: + if block.category == "Title": + titles.append(block.text) + elif block.category == "NarrativeText": + body_text.append(block.text) + elif block.category == "ListItem": + lists.append(block.text) + +print(f"Titles: {len(titles)}") +print(f"Paragraphs: {len(body_text)}") +print(f"List items: {len(lists)}") +``` + +### Table Extraction + 
+Extract tables with structure: + +```python +doc = Parxy.parse( + "document.pdf", + driver_name="unstructured_local", + infer_table_structure=True, +) + +for page in doc.pages: + for block in page.blocks: + if block.category == "Table": + # Table HTML is in source_data + table_html = block.source_data.get("text_as_html") + print(f"Table found: {block.text[:100]}...") +``` + +### Email Processing + +Process email files: + +```python +doc = Parxy.parse("message.eml", driver_name="unstructured_local") + +for page in doc.pages: + for block in page.blocks: + print(f"[{block.category}] {block.text}") +``` + +### OCR Processing + +Process scanned documents with OCR: + +```python +doc = Parxy.parse( + "scanned.pdf", + driver_name="unstructured_local", + strategy="ocr_only", + languages=["eng"], +) + +full_text = "\n".join(page.text for page in doc.pages) +print(full_text) +``` + +## Installation Options + +The `unstructured` library has optional dependencies for different features: + +```bash +# Basic installation (PDF, text, HTML) +pip install parxy[unstructured_local] + +# With all document types +pip install "unstructured[all-docs]" + +# With specific formats +pip install "unstructured[pdf,docx,pptx]" +``` + +### OCR Dependencies + +For OCR support, install Tesseract: + +**Ubuntu/Debian:** +```bash +sudo apt-get install tesseract-ocr +``` + +**macOS:** +```bash +brew install tesseract +``` + +**Windows:** +Download from [Tesseract GitHub](https://github.com/UB-Mannheim/tesseract/wiki) + +## Comparison with Other Drivers + +| Feature | Unstructured Local | PyMuPDF | PdfAct | LlamaParse | +|---------|-------------------|---------|--------|------------| +| Installation | Local | Local | Self-hosted | Cloud | +| Multi-format | Yes | PDF only | PDF only | PDF only | +| API Key | No | No | Optional | Required | +| Cost | Free | Free | Free | Per-page | +| Extraction Levels | 2 | 5 | 3 | 2 | +| Element Categories | Yes | No | Yes | Yes | +| OCR Support | Yes | Yes | No | Yes 
| +| Table Structure | Yes | No | No | Yes | + +Choose Unstructured Local when: +- You need to process multiple document formats +- You want semantic element categorization +- You need OCR capabilities locally +- Privacy is important (local processing) +- You need table structure extraction + +## Troubleshooting + +### Import Errors + +If you see import errors, ensure dependencies are installed: + +```python +try: + doc = Parxy.parse("document.pdf", driver_name="unstructured_local") +except ImportError as e: + print("Install with: pip install parxy[unstructured_local]") +``` + +### File Not Found + +```python +from parxy_core.exceptions import FileNotFoundException + +try: + doc = Parxy.parse("missing.pdf", driver_name="unstructured_local") +except FileNotFoundException as e: + print(f"File not found: {e}") +``` + +### Parsing Errors + +```python +from parxy_core.exceptions import ParsingException + +try: + doc = Parxy.parse("corrupted.pdf", driver_name="unstructured_local") +except ParsingException as e: + print(f"Parsing failed: {e}") +``` + +### OCR Not Working + +1. Verify Tesseract is installed: `tesseract --version` +2. Check language packs are available: `tesseract --list-langs` +3. 
Specify the correct language: + +```python +doc = Parxy.parse( + "scanned.pdf", + driver_name="unstructured_local", + languages=["eng"], # Must match installed language pack +) +``` + +### Slow Processing + +For faster processing of simple documents: + +```python +doc = Parxy.parse( + "document.pdf", + driver_name="unstructured_local", + strategy="fast", # Skip expensive operations +) +``` + +### Memory Issues + +For large documents, process in batches or use streaming: + +```python +import gc + +for pdf_path in large_pdf_list: + doc = Parxy.parse(pdf_path, driver_name="unstructured_local") + # Process document + del doc + gc.collect() +``` + +## See Also + +- [Unstructured Documentation](https://docs.unstructured.io/) +- [Unstructured GitHub](https://github.com/Unstructured-IO/unstructured) +- [Getting Started Tutorial](../tutorials/getting_started.md) diff --git a/docs/tutorials/agentic_usage.md b/docs/tutorials/agentic_usage.md new file mode 100644 index 0000000..65a9659 --- /dev/null +++ b/docs/tutorials/agentic_usage.md @@ -0,0 +1,82 @@ +# Agentic Usage + +You can use Parxy with AI coding assistants such as Claude Code, GitHub Copilot, Cursor, and other AI-powered development tools. + +## Overview + +Parxy provides a CLI command to generate documentation that helps AI agents understand how to use Parxy effectively within your codebase. + +### AGENTS.md + +[`AGENTS.md`](https://agents.md/) is a file used to guide coding agents. It provides code examples and precise instructions for agents. The `AGENTS.md` file is always loaded into the coding agent’s context. 
+ +To set up `AGENTS.md`, or to update an existing file with Parxy-specific instructions, run: + +```bash +parxy agents +``` + +The `AGENTS.md` file contains: + +* Quick-start examples for parsing documents +* Available drivers and their use cases +* Explanations of extraction levels +* Document model usage patterns +* CLI command references +* Configuration environment variables +* Common tasks and code snippets +* Error-handling patterns + +The Parxy-specific content is wrapped in dedicated start and end marker tags, allowing you to maintain your own project documentation alongside Parxy instructions. + +When Parxy releases new features, you can update your agent documentation by running: + +```bash +parxy agents --overwrite +``` + +This updates only the section between the Parxy marker tags while preserving any custom content you have added to `AGENTS.md`. + +### Skills + +[Skills](https://agentskills.io/what-are-skills) are specialized workflows and instructions. They are used to progressively disclose information to agents, rather than filling the available context space. A skill may not be read by a coding agent if it is considered irrelevant. + +You can add Parxy skills using the [skillsmd CLI](https://github.com/avvertix/skillsmd): + +```bash +uvx skillsmd add https://github.com/OneOffTech/parxy +``` + +## Troubleshooting + +### AI Not Finding Parxy Documentation + +Ensure that `AGENTS.md` is located in the project root or in a directory scanned by your AI tool. Most tools look for: + +* `AGENTS.md` in the project root +* `.github/AGENTS.md` +* `docs/AGENTS.md` + +### Skills Not Appearing + +Verify that the skills directory exists: + +```bash +ls -la .claude/skills/ +``` + +Restart Claude Code after adding new skills. 
+ +### Outdated Instructions + +If AI suggestions seem outdated, update the Parxy section by running: + +```bash +parxy agents --overwrite +``` + +## See Also + +* [Getting Started Tutorial](./getting_started.md) +* [CLI Usage Guide](./using_cli.md) +* [Driver Configuration Guides](../howto/configure_pymupdf.md) diff --git a/skills/SKILL.md b/skills/SKILL.md new file mode 100644 index 0000000..d2bbbc1 --- /dev/null +++ b/skills/SKILL.md @@ -0,0 +1,196 @@ +--- +name: parxy +description: | + Document parsing and PDF manipulation using the Parxy library. Use when: + (1) Parsing PDFs or documents to extract text/structure + (2) Converting documents to markdown + (3) Batch processing multiple documents + (4) Merging, splitting, or optimizing PDFs + (5) Managing PDF attachments +--- + +# Parxy Document Processing + +## Parsing Documents + +```python +from parxy_core.facade.parxy import Parxy + +# Basic parsing (default: pymupdf driver, block level) +doc = Parxy.parse("document.pdf") + +# With specific driver and level +doc = Parxy.parse("document.pdf", driver_name="llamaparse", level="span") + +# From bytes or BytesIO +doc = Parxy.parse(pdf_bytes) +``` + +### Extraction Levels + +| Level | Description | +|-------|-------------| +| `page` | Page text only | +| `block` | Text blocks (default) | +| `line` | Individual lines | +| `span` | Text spans with styling | +| `character` | Individual characters | + +### Document Structure + +```python +doc.filename # Original filename +doc.pages # List of Page objects +doc.metadata # Document metadata + +page.number # Page number (1-indexed) +page.text # Full page text +page.blocks # List of TextBlock objects + +block.text # Block text content +block.bbox # Bounding box (x0, y0, x1, y1) +block.role # Semantic role (paragraph, heading, etc.) 
+``` + +## Available Drivers + +| Driver | Constant | Type | Best For | +|--------|----------|------|----------| +| PyMuPDF | `Parxy.PYMUPDF` | Local | Fast local processing | +| PdfAct | `Parxy.PDFACT` | Self-hosted | Scientific papers, semantic roles | +| LlamaParse | `Parxy.LLAMAPARSE` | Cloud | Complex docs with OCR/tables | +| LLMWhisperer | `Parxy.LLMWHISPERER` | Cloud | Form extraction | +| Unstructured | `Parxy.UNSTRUCTURED_LIBRARY` | Local | Multi-format (DOCX, HTML) | + +```python +# List all drivers +drivers = Parxy.drivers() + +# Get specific driver +driver = Parxy.driver(Parxy.LLAMAPARSE) +``` + +## Batch Processing + +```python +from parxy_core.facade.parxy import Parxy +from parxy_core.models import BatchTask + +# Simple batch +results = Parxy.batch( + tasks=["doc1.pdf", "doc2.pdf"], + drivers=["pymupdf"], + workers=4, +) + +# Per-file configuration +results = Parxy.batch(tasks=[ + BatchTask(file="simple.pdf"), + BatchTask(file="complex.pdf", drivers=["llamaparse"], level="line"), +]) + +# Streaming results +for result in Parxy.batch_iter(tasks=["doc1.pdf", "doc2.pdf"]): + if result.success: + print(f"{result.file}: {len(result.document.pages)} pages") + else: + print(f"{result.file} failed: {result.error}") +``` + +## PDF Manipulation + +### Merge PDFs + +```python +from pathlib import Path +from parxy_core.facade.parxy import Parxy + +# Merge entire PDFs +Parxy.pdf.merge( + inputs=[ + (Path("doc1.pdf"), None, None), # All pages + (Path("doc2.pdf"), None, None), + ], + output=Path("merged.pdf"), +) + +# Merge specific page ranges (0-indexed) +Parxy.pdf.merge( + inputs=[ + (Path("doc1.pdf"), 0, 4), # Pages 1-5 + (Path("doc2.pdf"), 0, 0), # Page 1 only + ], + output=Path("selected.pdf"), +) +``` + +### Split PDF + +```python +# Split into individual pages +pages = Parxy.pdf.split( + input_path=Path("document.pdf"), + output_dir=Path("./pages"), + prefix="doc", +) +# Returns: [Path('pages/doc_page_1.pdf'), Path('pages/doc_page_2.pdf'), ...] 
+``` + +### Optimize PDF + +```python +result = Parxy.pdf.optimize( + input_path=Path("large.pdf"), + output_path=Path("small.pdf"), + scrub_metadata=True, # Remove metadata, thumbnails + subset_fonts=True, # Keep only used glyphs + compress_images=True, # Downsample and compress + dpi_target=72, + image_quality=60, + convert_to_grayscale=False, +) +print(f"Reduced by {result['reduction_percent']:.1f}%") +``` + +## PDF Attachments + +```python +from pathlib import Path +from parxy_core.services.pdf_service import PdfService + +with PdfService(Path("document.pdf")) as pdf: + # List attachments + names = pdf.list_attachments() + + # Add attachment + pdf.add_attachment(Path("data.csv"), name="data", desc="Sales data") + + # Extract attachment + content = pdf.extract_attachment("data") + + # Remove attachment + pdf.remove_attachment("data") + + # Save changes + pdf.save(Path("output.pdf")) +``` + +## CLI Commands + +The examples below assume `uvx` is installed; `pipx` can be used as well. + +```bash +# Parse document +uvx parxy parse document.pdf --driver llamaparse --level block --format json + +# Convert to markdown +uvx parxy markdown document.pdf -o output/ +uvx parxy markdown *.pdf --combine -o combined.md + +# PDF operations +uvx parxy pdf split document.pdf --pages 1-5 -o output/ +uvx parxy pdf merge doc1.pdf doc2.pdf -o combined.pdf + +# List drivers +uvx parxy drivers +``` diff --git a/src/parxy_cli/agents.template.md b/src/parxy_cli/agents.template.md new file mode 100644 index 0000000..c8316f8 --- /dev/null +++ b/src/parxy_cli/agents.template.md @@ -0,0 +1,205 @@ + + + +## Parxy Document Processing + +This project uses Parxy for document processing. Parxy is a document processing gateway that provides a unified text extraction interface across multiple services. 
+ +### Quick Start + +```python +from parxy_core.facade.parxy import Parxy + +# Using default driver (PyMuPDF) +doc = Parxy.parse("document.pdf") + +# Using a specific driver +doc = Parxy.parse("document.pdf", driver_name="llamaparse") + +# With extraction level +doc = Parxy.parse("document.pdf", level="span") +``` + +### Available Drivers + +| Driver | Type | Installation | Best For | +|--------|------|--------------|----------| +| `pymupdf` | Local | Base install | Fast local processing, detailed extraction | +| `pdfact` | Self-hosted | Base install | Semantic roles, scientific papers | +| `llamaparse` | Cloud | `parxy[llama]` | Complex documents, OCR | +| `llmwhisperer` | Cloud | `parxy[llmwhisperer]` | Form extraction | +| `unstructured_local` | Local | `parxy[unstructured_local]` | Multi-format support | +| `landingai` | Cloud | `parxy[landingai]` | Vision-based extraction | + +### Extraction Levels + +The document model hierarchy: `page -> block -> line -> span -> character` + +- `page`: Page-level text only (fastest) +- `block`: Text blocks with bounding boxes (default) +- `line`: Individual text lines +- `span`: Text spans with font styling +- `character`: Individual characters with positions + +### Document Model + +```python +doc = Parxy.parse("document.pdf", level="block") + +# Access pages +for page in doc.pages: + print(f"Page {page.number}: {page.width}x{page.height}") + + # Access blocks + for block in page.blocks: + print(f" {block.role}: {block.text[:50]}...") + + # Bounding box + if block.bbox: + print(f" Position: ({block.bbox.x0}, {block.bbox.y0})") +``` + +### CLI Commands + +```bash +# Parse documents +parxy parse document.pdf +parxy parse document.pdf --driver llamaparse --level span + +# Convert to markdown +parxy markdown document.pdf -o output/ + +# List available drivers +parxy drivers + +# PDF manipulation +parxy pdf split document.pdf --pages 1-5 +parxy pdf merge doc1.pdf doc2.pdf -o combined.pdf + +# Manage attachments +parxy attach 
list document.pdf +parxy attach extract document.pdf -o attachments/ +``` + +### Configuration + +Environment variables use the `PARXY_` prefix followed by the driver or feature name (for example `PARXY_LLAMAPARSE_`): + +```bash +# LlamaParse +PARXY_LLAMAPARSE_API_KEY=llx-your-key + +# PdfAct (self-hosted) +PARXY_PDFACT_BASE_URL=http://localhost:4567/ + +# LLMWhisperer +PARXY_LLMWHISPERER_API_KEY=your-key + +# Observability +PARXY_TRACING_ENABLE=true +PARXY_TRACING_ENDPOINT=http://localhost:4318/ +``` + +### Common Tasks + +#### Extract Text from PDF + +```python +from parxy_core.facade.parxy import Parxy + +doc = Parxy.parse("document.pdf") +text = "\n\n".join(page.text for page in doc.pages) +``` + +#### Process with Specific Driver + +```python +# Cloud processing with LlamaParse +doc = Parxy.parse( + "complex-document.pdf", + driver_name="llamaparse", + parse_mode="parse_page_with_lvm", # Vision model for tables + continuous_mode=True, # Multi-page tables +) +``` + +#### Batch Processing + +```python +from pathlib import Path +from parxy_core.facade.parxy import Parxy + +for pdf in Path("docs/").glob("*.pdf"): + doc = Parxy.parse(str(pdf)) + print(f"{pdf.name}: {len(doc.pages)} pages") +``` + +#### Filter by Content Role + +```python +doc = Parxy.parse("paper.pdf", driver_name="pdfact") + +# Get only body text (skip headers/footers) +skip_roles = {"doc-pageheader", "doc-pagefooter", "doc-footnote"} +body_text = [] + +for page in doc.pages: + for block in page.blocks: + if block.role not in skip_roles: + body_text.append(block.text) +``` + +### Error Handling + +```python +from parxy_core.facade.parxy import Parxy +from parxy_core.exceptions import ( + FileNotFoundException, + AuthenticationException, + ParsingException, +) + +try: + doc = Parxy.parse("document.pdf", driver_name="llamaparse") +except FileNotFoundException: + print("File not found") +except AuthenticationException: + print("API key invalid or missing") +except ParsingException as e: + print(f"Parsing failed: {e}") +``` + +### Development Commands + +```bash +# Install 
all dependencies +uv sync --all-extras + +# Run tests +uv run pytest + +# Format code +uv run ruff format + +# Run CLI +uv run parxy parse document.pdf +``` + +### Documentation + +For detailed guides, see the `docs/` directory: + +- `docs/tutorials/` - Getting started, CLI usage +- `docs/howto/` - Driver configuration guides +- `docs/explanation/` - Architecture and concepts + +### Key Files + +When installed as a library in a virtual environment + +- `.venv/Lib/site-packages/parxy_core/facade/parxy.py` - Main public API +- `.venv/Lib/site-packages/parxy_core/drivers/` - Driver implementations +- `.venv/Lib/site-packages/parxy_core/models/config.py` - Configuration classes +- `.venv/Lib/site-packages/parxy_core/models/models.py` - Document model + + diff --git a/src/parxy_cli/cli.py b/src/parxy_cli/cli.py index 359f872..9db4ac5 100644 --- a/src/parxy_cli/cli.py +++ b/src/parxy_cli/cli.py @@ -19,6 +19,7 @@ from parxy_cli.commands.pdf import app as pdf_command from parxy_cli.commands.attach import app as attach_command from parxy_cli.commands.tui import app as tui_command +from parxy_cli.commands.agents import app as agents_command # Create typer app @@ -77,6 +78,7 @@ def main( app.add_typer(pdf_command) app.add_typer(attach_command) app.add_typer(tui_command) +app.add_typer(agents_command) def main(): diff --git a/src/parxy_cli/commands/agents.py b/src/parxy_cli/commands/agents.py new file mode 100644 index 0000000..588b4a1 --- /dev/null +++ b/src/parxy_cli/commands/agents.py @@ -0,0 +1,159 @@ +"""Command to set up AI agent configuration files for Parxy projects.""" + +import re +from importlib.resources import files +from pathlib import Path +from typing import Optional + +import typer +from typing_extensions import Annotated + +from parxy_cli.console.console import Console + +app = typer.Typer() + +console = Console() + +# Tags used to identify Parxy section in AGENTS.md +PARXY_START_TAG = '' +PARXY_END_TAG = '' + +NEW_AGENTS_MD_TEMPLATE = """# AI Agent Guide + 
+Welcome, AI Assistant! This guide provides context for working with this project. + +{parxy_section} +""" + + +def _get_parxy_section_template() -> str: + """Load the Parxy section template for AGENTS.md.""" + return files('parxy_cli').joinpath('agents.template.md').read_text() + + +def _has_parxy_section(content: str) -> bool: + """Check if content already has a Parxy section.""" + return PARXY_START_TAG in content and PARXY_END_TAG in content + + +def _update_parxy_section(content: str, parxy_section: str) -> str: + """Replace existing Parxy section with new content.""" + pattern = re.compile( + rf'{re.escape(PARXY_START_TAG)}.*?{re.escape(PARXY_END_TAG)}', + re.DOTALL, + ) + return pattern.sub(parxy_section, content) + + +def _append_parxy_section(content: str, parxy_section: str) -> str: + """Append Parxy section to existing content.""" + # Ensure there's proper spacing + if not content.endswith('\n'): + content += '\n' + if not content.endswith('\n\n'): + content += '\n' + return content + parxy_section + + +@app.command() +def agents( + output_dir: Annotated[ + Optional[Path], + typer.Option( + '--output', + '-o', + help='Output directory for agent files. Defaults to current directory.', + ), + ] = None, + force: Annotated[ + bool, + typer.Option( + '--overwrite', + '-f', + help='Overwrite existing Parxy section without prompting.', + ), + ] = False, +): + """Set up AI agent configuration files for Parxy projects. + + Creates or updates an AGENTS.md file with Parxy usage documentation. + If AGENTS.md exists, the Parxy section (marked with tags) is + added or updated while preserving other content. + + Optionally creates Claude Code skill files for common operations. 
+ """ + output_path = output_dir or Path.cwd() + + console.print('[bold]Setting up Parxy agent configuration[/bold]') + console.newline() + + # Handle AGENTS.md + agents_file = output_path / 'AGENTS.md' + + if agents_file.exists(): + existing_content = agents_file.read_text(encoding='utf-8') + + if _has_parxy_section(existing_content): + # Update existing Parxy section + console.print('[yellow]AGENTS.md already has a Parxy section[/yellow]') + + if not force: + update = typer.confirm( + 'Do you want to update the Parxy section?', default=True + ) + if not update: + console.print('[dim]Leaving Parxy section as is.[/dim]') + else: + new_content = _update_parxy_section( + existing_content, _get_parxy_section_template() + ) + agents_file.write_text(new_content, encoding='utf-8') + console.print('[green]Updated[/green] Parxy section in AGENTS.md') + else: + new_content = _update_parxy_section( + existing_content, _get_parxy_section_template() + ) + agents_file.write_text(new_content, encoding='utf-8') + console.print('[green]Updated[/green] Parxy section in AGENTS.md') + else: + # Append Parxy section to existing file + console.print('[yellow]AGENTS.md exists without Parxy section[/yellow]') + + if not force: + append = typer.confirm( + 'Do you want to add the Parxy section?', default=True + ) + if not append: + console.print('[dim]Leaving AGENTS.md as is.[/dim]') + else: + new_content = _append_parxy_section( + existing_content, _get_parxy_section_template() + ) + agents_file.write_text(new_content, encoding='utf-8') + console.print('[green]Added[/green] Parxy section to AGENTS.md') + else: + new_content = _append_parxy_section( + existing_content, _get_parxy_section_template() + ) + agents_file.write_text(new_content, encoding='utf-8') + console.print('[green]Added[/green] Parxy section to AGENTS.md') + else: + # Create new AGENTS.md with Parxy section + new_content = NEW_AGENTS_MD_TEMPLATE.format( + parxy_section=_get_parxy_section_template() + ) + 
agents_file.write_text(new_content, encoding='utf-8') + console.print('[green]Created[/green] AGENTS.md with Parxy section') + + console.newline() + console.print('[green]Agent configuration complete![/green]') + console.newline() + console.print( + 'Your project is now configured for AI assistants. ' + 'The AGENTS.md file provides context about Parxy usage.' + ) + console.newline() + console.print( + '[dim]Tip: Run `parxy agents` again to update the Parxy section ' + 'when new features are available.[/dim]' + ) diff --git a/tests/commands/test_agents.py b/tests/commands/test_agents.py new file mode 100644 index 0000000..e13d86e --- /dev/null +++ b/tests/commands/test_agents.py @@ -0,0 +1,240 @@ +"""Test suite for the agents command.""" + +from pathlib import Path +from unittest.mock import patch + +import pytest +from click.utils import strip_ansi +from typer.testing import CliRunner + +from parxy_cli.commands.agents import app + + +@pytest.fixture +def runner(): + """Fixture providing a CLI runner.""" + return CliRunner() + + +@pytest.fixture +def mock_template_content(): + """Fixture providing mock agents template content.""" + return """ + + +## Parxy Document Processing + +Test content for agents template. + +""" + + +def test_agents_command_creates_agents_file(runner, mock_template_content): + """Test that the agents command creates AGENTS.md when it doesn't exist.""" + + with ( + patch('parxy_cli.commands.agents.files') as mock_files, + runner.isolated_filesystem(), + ): + mock_files.return_value.joinpath.return_value.read_text.return_value = ( + mock_template_content + ) + + result = runner.invoke(app) + + assert result.exit_code == 0 + + cleaned_output = strip_ansi(result.stdout) + assert 'Created AGENTS.md with Parxy section' in cleaned_output + assert 'Agent configuration complete!' 
in cleaned_output + + agents_file = Path.cwd() / 'AGENTS.md' + assert agents_file.exists() + content = agents_file.read_text() + assert '' in content + assert '' in content + assert 'Test content for agents template' in content + + +def test_agents_command_appends_to_existing_file_without_parxy_section( + runner, mock_template_content +): + """Test that the agents command appends Parxy section to existing AGENTS.md.""" + + with ( + patch('parxy_cli.commands.agents.files') as mock_files, + runner.isolated_filesystem(), + ): + existing_content = """# My Project + +Custom project documentation. +""" + agents_file = Path.cwd() / 'AGENTS.md' + agents_file.write_text(existing_content) + + mock_files.return_value.joinpath.return_value.read_text.return_value = ( + mock_template_content + ) + + result = runner.invoke(app, input='y\n') + + assert result.exit_code == 0 + + cleaned_output = strip_ansi(result.stdout) + assert 'AGENTS.md exists without Parxy section' in cleaned_output + assert 'Added Parxy section to AGENTS.md' in cleaned_output + + content = agents_file.read_text() + assert '# My Project' in content + assert 'Custom project documentation' in content + assert '' in content + assert '' in content + + +def test_agents_command_updates_existing_parxy_section(runner, mock_template_content): + """Test that the agents command updates existing Parxy section.""" + + with ( + patch('parxy_cli.commands.agents.files') as mock_files, + runner.isolated_filesystem(), + ): + existing_content = """# My Project + +Custom documentation. + + +Old parxy content that should be replaced. + + +More custom content. 
+""" + agents_file = Path.cwd() / 'AGENTS.md' + agents_file.write_text(existing_content) + + mock_files.return_value.joinpath.return_value.read_text.return_value = ( + mock_template_content + ) + + result = runner.invoke(app, input='y\n') + + assert result.exit_code == 0 + + cleaned_output = strip_ansi(result.stdout) + assert 'AGENTS.md already has a Parxy section' in cleaned_output + assert 'Updated Parxy section in AGENTS.md' in cleaned_output + + content = agents_file.read_text() + assert '# My Project' in content + assert 'Custom documentation' in content + assert 'More custom content' in content + assert 'Old parxy content that should be replaced' not in content + assert 'Test content for agents template' in content + + +def test_agents_command_respects_no_to_update_prompt(runner, mock_template_content): + """Test that the agents command respects 'no' answer to update prompt.""" + + with ( + patch('parxy_cli.commands.agents.files') as mock_files, + runner.isolated_filesystem(), + ): + existing_content = """ +Original content. 
+""" + agents_file = Path.cwd() / 'AGENTS.md' + agents_file.write_text(existing_content) + + mock_files.return_value.joinpath.return_value.read_text.return_value = ( + mock_template_content + ) + + result = runner.invoke(app, input='n\n') + + assert result.exit_code == 0 + + cleaned_output = strip_ansi(result.stdout) + assert 'Leaving Parxy section as is' in cleaned_output + + content = agents_file.read_text() + assert 'Original content' in content + assert 'Test content for agents template' not in content + + +def test_agents_command_respects_no_to_append_prompt(runner, mock_template_content): + """Test that the agents command respects 'no' answer to append prompt.""" + + with ( + patch('parxy_cli.commands.agents.files') as mock_files, + runner.isolated_filesystem(), + ): + existing_content = '# My Project\n' + agents_file = Path.cwd() / 'AGENTS.md' + agents_file.write_text(existing_content) + + mock_files.return_value.joinpath.return_value.read_text.return_value = ( + mock_template_content + ) + + result = runner.invoke(app, input='n\n') + + assert result.exit_code == 0 + + cleaned_output = strip_ansi(result.stdout) + assert 'Leaving AGENTS.md as is' in cleaned_output + + content = agents_file.read_text() + assert content == existing_content + + +def test_agents_command_force_flag_skips_prompts(runner, mock_template_content): + """Test that the --overwrite flag skips confirmation prompts.""" + + with ( + patch('parxy_cli.commands.agents.files') as mock_files, + runner.isolated_filesystem(), + ): + existing_content = """ +Old content. 
+""" + agents_file = Path.cwd() / 'AGENTS.md' + agents_file.write_text(existing_content) + + mock_files.return_value.joinpath.return_value.read_text.return_value = ( + mock_template_content + ) + + result = runner.invoke(app, ['--overwrite']) + + assert result.exit_code == 0 + + cleaned_output = strip_ansi(result.stdout) + assert 'Updated Parxy section in AGENTS.md' in cleaned_output + + content = agents_file.read_text() + assert 'Old content' not in content + assert 'Test content for agents template' in content + + +def test_agents_command_output_option(runner, mock_template_content): + """Test that the --output option creates files in specified directory.""" + + with ( + patch('parxy_cli.commands.agents.files') as mock_files, + runner.isolated_filesystem(), + ): + output_dir = Path.cwd() / 'subdir' + output_dir.mkdir() + + mock_files.return_value.joinpath.return_value.read_text.return_value = ( + mock_template_content + ) + + result = runner.invoke(app, ['--output', str(output_dir)]) + + assert result.exit_code == 0 + + agents_file = output_dir / 'AGENTS.md' + assert agents_file.exists() + + root_agents_file = Path.cwd() / 'AGENTS.md' + assert not root_agents_file.exists()