Overview
This tutorial walks you through building a production-ready invoice processing system with Documind. You’ll learn how to extract invoice data, handle reviews, and build a complete automation pipeline.
What You’ll Build
By the end of this tutorial, you’ll have:
- A robust invoice extraction system
- Automated review workflow handling
- Error handling and retry logic
- Batch processing capabilities
- Cost optimization strategies
Prerequisites
- Documind API key
- Python 3.8+ or Node.js 16+
- Basic understanding of REST APIs
- Sample invoices (we’ll provide examples)
Step 1: Design the Invoice Schema
First, create a comprehensive schema for invoice data and save it as invoice_schema.json:
Copy
{
"type": "object",
"title": "Invoice Extraction Schema",
"named_entities": {
"invoice_number": {
"type": "string",
"description": "Unique invoice identifier (e.g., INV-2024-001)"
},
"invoice_date": {
"type": "string",
"description": "Date the invoice was issued in YYYY-MM-DD format"
},
"due_date": {
"type": "string",
"description": "Payment due date in YYYY-MM-DD format"
},
"vendor": {
"type": "object",
"description": "Vendor/seller information",
"named_entities": {
"name": {
"type": "string",
"description": "Vendor company name"
},
"address": {
"type": "string",
"description": "Vendor mailing address"
},
"tax_id": {
"type": "string",
"description": "Vendor tax ID or EIN"
}
},
"required": ["name"]
},
"customer": {
"type": "object",
"description": "Customer/buyer information",
"named_entities": {
"name": {
"type": "string",
"description": "Customer company name"
},
"address": {
"type": "string",
"description": "Customer billing address"
}
}
},
"line_items": {
"type": "array",
"description": "Invoice line items",
"items": {
"type": "object",
"named_entities": {
"description": {
"type": "string",
"description": "Item or service description"
},
"quantity": {
"type": "number",
"description": "Quantity ordered"
},
"unit_price": {
"type": "number",
"description": "Price per unit"
},
"amount": {
"type": "number",
"description": "Line total (quantity × unit_price)"
}
},
"required": ["description", "amount"]
}
},
"subtotal": {
"type": "number",
"description": "Subtotal before tax"
},
"tax": {
"type": "number",
"description": "Tax amount"
},
"total": {
"type": "number",
"description": "Total amount due"
},
"currency": {
"type": "string",
"description": "Currency code (e.g., USD, EUR, GBP)",
"default": "USD"
},
"payment_terms": {
"type": "string",
"description": "Payment terms (e.g., Net 30, Due on Receipt)"
}
},
"required": ["invoice_number", "invoice_date", "vendor", "total"]
}
Schema Design Tips:
- Mark only critical fields as required (e.g., invoice_number, total)
- Include descriptions for better extraction accuracy
- Use proper data types (number for amounts, string for text)
- Nest related data (vendor, customer) for better organization
Step 2: Create the Invoice Processor
Build a Python class to handle invoice processing:
import requests
import time
import logging
from typing import Dict, Any, Optional, List
# Module-level logger shared by every snippet in this tutorial; it is named
# after the importing module so applications can configure it centrally.
logger = logging.getLogger(__name__)
class InvoiceProcessor:
    """Upload invoices to the Documind API and extract structured data from them.

    Every request is authenticated with an ``X-API-Key`` header and bounded by
    ``request_timeout`` so that a stalled connection cannot block the caller
    indefinitely.

    Args:
        api_key: Documind API key sent in the ``X-API-Key`` header.
        base_url: Root URL of the API; a trailing slash is ignored.
        schema_path: Path of the JSON extraction schema loaded at construction.
        request_timeout: Timeout in seconds applied to each HTTP request.
    """

    def __init__(
        self,
        api_key: str,
        base_url: str = "https://api.documind.cloud/api/v1",
        schema_path: str = "invoice_schema.json",
        request_timeout: float = 120,
    ):
        self.api_key = api_key
        # Endpoints are built as f"{base_url}/...", so drop a trailing slash to
        # avoid generating URLs that contain "//".
        self.base_url = base_url.rstrip("/")
        self.headers = {"X-API-Key": api_key}
        self.schema_path = schema_path
        self.request_timeout = request_timeout
        self.schema = self._load_schema()

    def _load_schema(self) -> Dict[str, Any]:
        """Read and parse the JSON schema stored at ``self.schema_path``.

        Raises:
            OSError: If the schema file cannot be opened.
            json.JSONDecodeError: If the file does not contain valid JSON.
        """
        import json

        # JSON files are UTF-8 by convention; do not rely on the locale default.
        with open(self.schema_path, encoding="utf-8") as f:
            return json.load(f)

    def upload_invoice(self, file_path: str) -> str:
        """Upload an invoice file and return its document ID.

        The document ID is taken from the first element of the JSON array
        returned by the ``/upload`` endpoint.

        Raises:
            OSError: If ``file_path`` cannot be opened.
            requests.HTTPError: If the API answers with an error status.
        """
        logger.info(f"Uploading {file_path}...")
        # The file handle must stay open while requests streams the upload.
        with open(file_path, "rb") as f:
            response = requests.post(
                f"{self.base_url}/upload",
                headers=self.headers,
                files={"files": f},
                timeout=self.request_timeout,
            )
        response.raise_for_status()
        document_id = response.json()[0]
        logger.info(f"Uploaded successfully: {document_id}")
        return document_id

    def extract_invoice(
        self,
        document_id: str,
        mode: str = "basic",
        model: str = "google-gemini-2.0-flash"
    ) -> Dict[str, Any]:
        """Run extraction on an uploaded document and return the API response.

        Args:
            document_id: ID returned by ``upload_invoice``.
            mode: ``"basic"`` sends ``model`` with the request; ``"advanced"``
                sends a ``review_threshold`` of 85 instead.
            model: Model name sent in basic mode.

        Raises:
            requests.HTTPError: If the API answers with an error status.
        """
        logger.info(f"Extracting invoice {document_id} in {mode} mode...")
        payload = {
            "schema": self.schema,
            "prompt": self._get_extraction_prompt()
        }
        # Add mode-specific parameters
        if mode == "basic":
            payload["model"] = model
        elif mode == "advanced":
            payload["review_threshold"] = 85
        else:
            # The request is still sent (unchanged behavior), but make the
            # probable typo visible instead of passing it over silently.
            logger.warning(f"Unknown extraction mode {mode!r}; sending no mode-specific parameters")
        response = requests.post(
            f"{self.base_url}/extract/{document_id}",
            headers=self.headers,
            json=payload,
            timeout=self.request_timeout
        )
        response.raise_for_status()
        result = response.json()
        logger.info(f"Extraction complete. Needs review: {result.get('needs_review')}")
        return result

    def _get_extraction_prompt(self) -> str:
        """Return the instruction prompt sent with every extraction request."""
        return (
            "\n"
            "Extract invoice information accurately from this document.\n"
            "Instructions:\n"
            "- Invoice number is typically in the header, labeled 'Invoice #' or 'Invoice Number'\n"
            "- Use YYYY-MM-DD format for dates\n"
            "- Extract ALL line items from the table, including description, quantity, unit price, and amount\n"
            "- The total is the final amount due, including tax\n"
            "- Vendor information is usually at the top or 'From' section\n"
            "- Customer information is in the 'Bill To' or 'Customer' section\n"
            "- If a field is not found, return null\n"
            "Be precise with numeric values and ensure line items sum correctly.\n"
        )

    def wait_for_review(
        self,
        document_id: str,
        timeout: int = 300,
        poll_interval: int = 10
    ) -> Optional[Dict[str, Any]]:
        """Poll until the document's extraction is marked reviewed.

        Args:
            document_id: Document whose extraction is being reviewed.
            timeout: Maximum number of seconds to wait overall.
            poll_interval: Seconds to sleep between polls.

        Returns:
            The ``reviewed_results`` of the first matching extraction, or
            ``None`` if the review is not completed within ``timeout`` seconds.

        Raises:
            requests.HTTPError: If a poll request returns an error status.
        """
        logger.info(f"Waiting for review of {document_id}...")
        # A monotonic clock keeps the deadline correct even if the system wall
        # clock is adjusted while we are waiting.
        deadline = time.monotonic() + timeout
        while time.monotonic() < deadline:
            response = requests.get(
                f"{self.base_url}/data/extractions",
                headers=self.headers,
                params={"document_id": document_id},
                timeout=self.request_timeout
            )
            response.raise_for_status()
            items = response.json().get("items") or []
            if items and items[0].get("is_reviewed"):
                logger.info("Review complete!")
                return items[0]["reviewed_results"]
            # Never sleep past the deadline: the caller asked for at most
            # `timeout` seconds of waiting.
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                break
            time.sleep(min(poll_interval, remaining))
        logger.error(f"Review not completed within {timeout}s")
        return None

    def process_invoice(
        self,
        file_path: str,
        mode: str = "basic"
    ) -> Dict[str, Any]:
        """Upload, extract and, when flagged, wait for review of one invoice.

        Returns:
            A dict with ``document_id``, ``data`` (the extraction results,
            replaced by the reviewed results when a review completed in time),
            ``needs_review`` and ``is_reviewed``.

        Raises:
            OSError: If the invoice file cannot be opened.
            requests.HTTPError: If any API call returns an error status.
        """
        # Upload
        document_id = self.upload_invoice(file_path)
        # Extract
        result = self.extract_invoice(document_id, mode=mode)
        # Tolerate a response that omits the flag rather than failing after the
        # extraction itself has already succeeded.
        needs_review = result.get("needs_review", False)
        # Handle review if needed
        if needs_review:
            logger.info("Invoice flagged for review. Waiting...")
            reviewed_results = self.wait_for_review(document_id)
            # Only None means "no review arrived"; an empty payload is still a
            # completed review and must not be discarded.
            if reviewed_results is not None:
                result["results"] = reviewed_results
                result["is_reviewed"] = True
        return {
            "document_id": document_id,
            "data": result["results"],
            "needs_review": needs_review,
            "is_reviewed": result.get("is_reviewed", False)
        }
# Usage: process one local file in basic mode and print two extracted fields.
# Replace the placeholder key with a real one; process_invoice opens
# "invoice.pdf" from the current working directory.
processor = InvoiceProcessor(api_key="your_api_key_here")
result = processor.process_invoice("invoice.pdf", mode="basic")
# result["data"] holds the extracted fields returned by process_invoice.
print(f"Invoice Number: {result['data']['invoice_number']}")
print(f"Total: ${result['data']['total']}")
Step 3: Handle Batch Processing
Process multiple invoices efficiently:
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import List
def process_invoice_batch(
    processor: InvoiceProcessor,
    file_paths: List[str],
    max_workers: int = 3,
    mode: str = "basic"
) -> List[Dict[str, Any]]:
    """Process multiple invoices in parallel using a thread pool.

    Args:
        processor: Processor whose ``process_invoice`` is called per file.
        file_paths: Paths of the invoice files to process.
        max_workers: Maximum number of invoices processed concurrently.
        mode: Extraction mode forwarded to ``process_invoice`` for every file.

    Returns:
        One entry per input file, in completion order (not input order).
        Successful entries carry ``file``, ``success=True`` and ``data``;
        failed entries carry ``file``, ``success=False`` and ``error``.
        A failure of one file never aborts the rest of the batch.
    """
    results = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Map each future back to its file so results can be attributed.
        futures = {
            executor.submit(processor.process_invoice, file_path, mode=mode): file_path
            for file_path in file_paths
        }
        # Collect results as they complete
        for future in as_completed(futures):
            file_path = futures[future]
            try:
                result = future.result()
            except Exception as e:
                # Boundary handler: record the failure and keep going.
                logger.error(f"Failed to process {file_path}: {e}")
                results.append({
                    "file": file_path,
                    "success": False,
                    "error": str(e)
                })
            else:
                results.append({
                    "file": file_path,
                    "success": True,
                    "data": result
                })
                logger.info(f"Successfully processed {file_path}")
    return results
# Usage: run the batch helper over a fixed list of invoice files, reusing the
# `processor` instance created earlier.
invoice_files = [
"invoices/invoice_001.pdf",
"invoices/invoice_002.pdf",
"invoices/invoice_003.pdf",
]
results = process_invoice_batch(processor, invoice_files, max_workers=3)
# Summarize results: count the entries whose "success" flag is true.
successful = sum(1 for r in results if r["success"])
print(f"Processed {successful}/{len(results)} invoices successfully")
Step 4: Add Error Handling
Robust error handling for production:
from requests.exceptions import HTTPError, Timeout
def process_invoice_with_retry(
    processor: InvoiceProcessor,
    file_path: str,
    max_retries: int = 3
) -> Optional[Dict[str, Any]]:
    """Process an invoice, retrying on rate limits, server errors and timeouts.

    Args:
        processor: Processor whose ``process_invoice`` is called.
        file_path: Path of the invoice file.
        max_retries: Total number of attempts, including the first one.

    Returns:
        The ``process_invoice`` result, or ``None`` when the error is not
        retryable (for example HTTP 402) or every attempt failed.
    """
    for attempt in range(max_retries):
        is_last_attempt = attempt == max_retries - 1
        try:
            return processor.process_invoice(file_path)
        except HTTPError as e:
            # Compare against None explicitly: a requests Response is falsy
            # for 4xx/5xx statuses, so a truthiness test would be wrong here.
            status = e.response.status_code if e.response is not None else None
            if status == 402:
                logger.error("Insufficient credits. Stopping.")
                return None
            if status == 429 and not is_last_attempt:
                # Rate limited - back off exponentially, then retry. Sleeping
                # after the final attempt would only delay the failure.
                wait_time = 2 ** attempt
                logger.warning(f"Rate limited. Retrying in {wait_time}s...")
                time.sleep(wait_time)
                continue
            if status is not None and status >= 500 and not is_last_attempt:
                # Server error - retry
                logger.error(f"Server error. Retrying attempt {attempt + 2}...")
                time.sleep(5)
                continue
            logger.error(f"HTTP error {status}: {e}")
            return None
        except Timeout:
            logger.error(f"Request timed out on attempt {attempt + 1}")
            if is_last_attempt:
                return None
            # Back off before retrying instead of immediately re-sending the
            # request to a service that has just failed to answer in time.
            time.sleep(2 ** attempt)
        except Exception as e:
            logger.error(f"Unexpected error: {e}")
            return None
    return None
Step 5: Validate and Store Results
Validate extraction results and store them:
from decimal import Decimal
def validate_invoice_data(data: Dict[str, Any]) -> bool:
    """Validate extracted invoice data.

    Checks that ``invoice_number`` is present, that ``total`` is a positive
    number, that line item amounts sum to ``subtotal`` (within 0.01) when both
    are available, and that ``due_date`` is not before ``invoice_date``.

    Fields extracted as null are treated as absent, because the extraction
    prompt asks for null whenever a value is not found in the document.

    Returns:
        ``True`` when no problem was found; otherwise logs the problems and
        returns ``False``.
    """
    from datetime import datetime

    errors = []

    # Required fields
    if not data.get("invoice_number"):
        errors.append("Missing invoice_number")

    total = data.get("total")
    # bool is a subclass of int, so exclude it explicitly.
    total_is_number = isinstance(total, (int, float)) and not isinstance(total, bool)
    if not total_is_number or total <= 0:
        errors.append("Invalid or missing total")

    # Validate line items sum
    line_items = data.get("line_items")
    if line_items:
        # A null amount contributes nothing instead of breaking sum().
        items_total = sum((item.get("amount") or 0) for item in line_items)
        subtotal = data.get("subtotal")
        # With no subtotal to compare against, the check passes trivially.
        expected_subtotal = items_total if subtotal is None else subtotal
        if abs(items_total - expected_subtotal) > 0.01:
            errors.append(f"Line items sum ({items_total}) doesn't match subtotal ({expected_subtotal})")

    # Validate dates
    invoice_date_raw = data.get("invoice_date")
    due_date_raw = data.get("due_date")
    if invoice_date_raw is not None and due_date_raw is not None:
        try:
            invoice_date = datetime.fromisoformat(invoice_date_raw)
            due_date = datetime.fromisoformat(due_date_raw)
            if due_date < invoice_date:
                errors.append("Due date is before invoice date")
        except (TypeError, ValueError) as e:
            # TypeError covers non-string values and naive/aware comparisons.
            errors.append(f"Invalid date format: {e}")

    if errors:
        logger.error(f"Validation errors: {', '.join(errors)}")
        return False
    return True
def store_invoice_data(data: Dict[str, Any], database):
    """Validate an extracted invoice and insert it into the database.

    Args:
        data: Extracted invoice fields.
        database: Object exposing ``invoices.insert_one(record)``.

    Raises:
        ValueError: If validation fails, or if ``invoice_date`` or the vendor
            name needed for the stored record is missing.
    """
    if not validate_invoice_data(data):
        raise ValueError("Invalid invoice data")

    # Validation does not check these two fields, so report them with the same
    # exception type instead of leaking a KeyError/TypeError to the caller.
    vendor = data.get("vendor")
    vendor_name = vendor.get("name") if isinstance(vendor, dict) else None
    invoice_date = data.get("invoice_date")
    missing = [
        field
        for field, value in (("invoice_date", invoice_date), ("vendor.name", vendor_name))
        if not value
    ]
    if missing:
        raise ValueError(f"Invalid invoice data: missing {', '.join(missing)}")

    # NOTE(review): if `database` is a PyMongo database, decimal.Decimal is not
    # BSON-encodable by default and needs bson.Decimal128 — confirm.
    invoice_record = {
        "invoice_number": data["invoice_number"],
        "invoice_date": invoice_date,
        "vendor_name": vendor_name,
        # Go through str() so the float's printed value is stored exactly.
        "total_amount": Decimal(str(data["total"])),
        # `or` also covers a currency extracted as null.
        "currency": data.get("currency") or "USD",
        # line_items is optional in the schema; store an empty list if absent.
        "line_items": data.get("line_items") or [],
        "raw_data": data  # Store complete extraction
    }
    database.invoices.insert_one(invoice_record)
    logger.info(f"Stored invoice {data['invoice_number']}")
Step 6: Build a Complete Pipeline
Put it all together:
import sys
from pathlib import Path
def main():
    """Process every pending invoice PDF and print a per-file report.

    Reads the API key from the ``DOCUMIND_API_KEY`` environment variable and
    exits with status 1 when it is not set. Invoices are taken from
    ``invoices/pending/*.pdf``, processed as a batch, validated, and reported
    on standard output together with a final summary.
    """
    # Imported locally because `os` is not imported at module level here;
    # without this the getenv call below raises NameError.
    import os

    # Initialize processor
    api_key = os.getenv("DOCUMIND_API_KEY")
    if not api_key:
        print("Error: DOCUMIND_API_KEY environment variable not set")
        sys.exit(1)
    processor = InvoiceProcessor(api_key)

    # Find all invoices in directory; sort so runs are reproducible, since
    # glob() yields entries in arbitrary order.
    invoice_dir = Path("invoices/pending")
    invoice_files = sorted(invoice_dir.glob("*.pdf"))
    if not invoice_files:
        print("No invoices found")
        return
    print(f"Found {len(invoice_files)} invoices to process")

    # Process in batches
    results = process_invoice_batch(processor, [str(f) for f in invoice_files])

    # Validate and store successful results
    for result in results:
        if not result["success"]:
            print(f"✗ {result['file']}: {result['error']}")
            continue
        data = result["data"]["data"]
        # A null extraction payload counts as a validation failure rather than
        # crashing the whole report.
        if isinstance(data, dict) and validate_invoice_data(data):
            # store_invoice_data(data, database)
            print(f"✓ {result['file']}: {data['invoice_number']} - ${data['total']}")
        else:
            print(f"✗ {result['file']}: Validation failed")

    # Summary
    successful = sum(1 for r in results if r["success"])
    print(f"\nProcessed {successful}/{len(results)} invoices successfully")


if __name__ == "__main__":
    main()
Next Steps
Form Extraction
Learn to extract form data
Batch Processing
Scale to thousands of documents
Schema Design
Master schema design patterns
Error Handling
Build robust production systems
Complete Example Repository
Download the complete working example:
git clone https://github.com/documind/invoice-processing-tutorial
cd invoice-processing-tutorial
pip install -r requirements.txt
export DOCUMIND_API_KEY=your_key_here
python process_invoices.py
Production Tips:
- Use environment variables for API keys
- Implement proper logging and monitoring
- Add database storage for results
- Set up alerts for failed extractions
- Monitor credit usage daily