This page provides practical examples of using Kreuzberg for text extraction in various scenarios.
| import asyncio
from kreuzberg import extract_file
async def main():
# Extract text from a PDF file
result = await extract_file("document.pdf")
print(result.content)
# Access metadata
if result.metadata.get("title"):
print(f"Document title: {result.metadata['title']}")
asyncio.run(main())
|
OCR Configuration
Kreuzberg provides options to configure OCR for different languages and document layouts:
| from kreuzberg import extract_file, TesseractConfig, PSMMode, ExtractionConfig
async def extract_with_ocr():
# Extract from a German document
result = await extract_file(
"german_document.pdf",
config=ExtractionConfig(
force_ocr=True,
ocr_config=TesseractConfig(
language="deu", psm=PSMMode.SINGLE_BLOCK # German language # Treat as a single text block
),
),
)
print(result.content)
# Extract from a multilingual document
result = await extract_file(
"multilingual.pdf",
config=ExtractionConfig(
force_ocr=True,
ocr_config=TesseractConfig(
language="eng+deu", psm=PSMMode.AUTO # English primary, German secondary # Automatic page segmentation
),
),
)
print(result.content)
|
Alternative OCR Backends
Kreuzberg supports multiple OCR backends:
| from kreuzberg import extract_file, ExtractionConfig, EasyOCRConfig, PaddleOCRConfig
async def extract_with_different_backends():
# Using EasyOCR
result = await extract_file(
"document.jpg", config=ExtractionConfig(ocr_backend="easyocr", ocr_config=EasyOCRConfig(language_list=["en", "de"]))
)
print(f"EasyOCR result: {result.content[:100]}...")
# Using PaddleOCR
result = await extract_file(
"chinese_document.jpg",
config=ExtractionConfig(ocr_backend="paddleocr", ocr_config=PaddleOCRConfig(language="ch")), # Chinese
)
print(f"PaddleOCR result: {result.content[:100]}...")
# Disable OCR completely
result = await extract_file("searchable_pdf.pdf", config=ExtractionConfig(ocr_backend=None))
print(f"No OCR result: {result.content[:100]}...")
|
Language Detection
| from kreuzberg import extract_file, ExtractionConfig, LanguageDetectionConfig
async def detect_document_language():
# Simple automatic language detection
result = await extract_file("document.pdf", config=ExtractionConfig(auto_detect_language=True))
# Access detected languages
if result.detected_languages:
print(f"Detected languages: {', '.join(result.detected_languages)}")
# Example output: "Detected languages: en, de, fr"
async def detect_multilingual_document():
# Advanced multilingual detection with custom configuration
lang_config = LanguageDetectionConfig(
multilingual=True, # Detect multiple languages in mixed text
top_k=5, # Return top 5 languages
low_memory=False, # Use high accuracy mode
)
result = await extract_file(
"multilingual_document.pdf", config=ExtractionConfig(auto_detect_language=True, language_detection_config=lang_config)
)
if result.detected_languages:
print(f"Detected languages: {result.detected_languages}")
# Use detected languages for OCR
from kreuzberg import TesseractConfig
# Create language string for Tesseract (e.g., "eng+deu+fra")
tesseract_langs = "+".join(result.detected_languages[:3])
result_with_ocr = await extract_file(
"multilingual_document.pdf",
config=ExtractionConfig(force_ocr=True, ocr_config=TesseractConfig(language=tesseract_langs)),
)
|
GMFT Deprecation
GMFTConfig is deprecated and will be removed in Kreuzberg v4.0. The example below is maintained for legacy users only—migrate to the upcoming TATR-based table extraction when upgrading.
| from kreuzberg import extract_file, ExtractionConfig, GMFTConfig
async def extract_tables_from_pdf():
# Enable table extraction with default settings
result = await extract_file("document_with_tables.pdf", config=ExtractionConfig(extract_tables=True))
# Process extracted tables
print(f"Found {len(result.tables)} tables")
for i, table in enumerate(result.tables):
print(f"Table {i+1} on page {table['page_number']}:")
print(table["text"]) # Markdown formatted table
# Work with the polars DataFrame
df = table["df"]
print(f"Table shape: {df.shape}")
# The cropped table image is also available
# table['cropped_image'].save(f"table_{i+1}.png")
# With custom GMFT configuration
custom_config = ExtractionConfig(
extract_tables=True,
gmft_config=GMFTConfig(
detector_base_threshold=0.85, # Min confidence for table detection
enable_multi_header=True, # Support multi-level headers
semantic_spanning_cells=True, # Handle spanning cells
semantic_hierarchical_left_fill="deep", # Handle hierarchical headers
),
)
result = await extract_file("complex_tables.pdf", config=custom_config)
# Process tables...
|
Kreuzberg's Tesseract backend supports multiple output formats, each optimized for different use cases.
Use the text
format for the fastest extraction when you don't need formatting:
| from kreuzberg import extract_file, ExtractionConfig, TesseractConfig
async def extract_plain_text():
result = await extract_file("document.jpg", config=ExtractionConfig(ocr_config=TesseractConfig(output_format="text")))
print(result.content)
|
Default Markdown with Structure
The default markdown
format preserves document structure:
| from kreuzberg import extract_file
async def extract_with_markdown():
# Markdown is the default format
result = await extract_file("document.jpg")
print(result.content) # Structured markdown output
|
Use TSV format with table detection to extract tables from images:
| from kreuzberg import extract_file, ExtractionConfig, TesseractConfig
async def extract_tables():
result = await extract_file(
"scanned_table.png",
config=ExtractionConfig(
ocr_config=TesseractConfig(
output_format="tsv",
enable_table_detection=True,
table_column_threshold=20, # Adjust for column spacing
table_row_threshold_ratio=0.5, # Adjust for row spacing
)
),
)
# Access extracted tables
for table in result.tables:
print("Extracted table in markdown format:")
print(table["text"])
print(f"Page number: {table['page_number']}")
|
Get Word Positions with hOCR
Use hOCR format to access detailed position information:
| from kreuzberg import extract_file, ExtractionConfig, TesseractConfig
async def extract_with_positions():
result = await extract_file("document.jpg", config=ExtractionConfig(ocr_config=TesseractConfig(output_format="hocr")))
# result.content contains HTML with position data
print(result.content[:500]) # hOCR HTML output
|
Processing Scanned Invoices
Complete example for extracting data from scanned invoices:
| from kreuzberg import extract_file, ExtractionConfig, TesseractConfig, PSMMode
async def process_invoice():
# Configure for invoice processing
config = ExtractionConfig(
ocr_config=TesseractConfig(
output_format="tsv",
enable_table_detection=True,
table_column_threshold=25, # Adjust for invoice layout
psm=PSMMode.SPARSE_TEXT, # Good for forms and invoices
language="eng",
)
)
result = await extract_file("invoice_scan.pdf", config=config)
# Get the main text content
print("Invoice text:")
print(result.content)
# Extract tables (line items)
if result.tables:
print("\nInvoice line items:")
for table in result.tables:
print(table["text"])
|
Kreuzberg can extract embedded images from various document formats and optionally run OCR on them to extract text content.
| from kreuzberg import extract_file, ExtractionConfig
import base64
from pathlib import Path
async def extract_images_from_pdf():
# Extract embedded images from a PDF document
result = await extract_file("document_with_images.pdf", config=ExtractionConfig(extract_images=True))
print(f"Document content: {result.content[:100]}...")
print(f"Found {len(result.images)} images")
# Save extracted images to files
for i, image in enumerate(result.images):
filename = image.filename or f"image_{i+1}.{image.format.lower()}"
filepath = Path("extracted_images") / filename
filepath.parent.mkdir(exist_ok=True)
# Write image data to file
filepath.write_bytes(image.data)
print(f"Saved image: {filename}")
print(f" Format: {image.format}")
print(f" Dimensions: {image.dimensions}")
if image.page_number:
print(f" Page: {image.page_number}")
if image.description:
print(f" Description: {image.description}")
|
Image OCR Processing
Extract text content from images using OCR:
| from kreuzberg import extract_file, ExtractionConfig, ImageOCRConfig
async def extract_and_ocr_images():
# Extract images and run OCR on them
config = ExtractionConfig(
extract_images=True,
deduplicate_images=True, # Remove duplicate images
image_ocr_config=ImageOCRConfig(
enabled=True,
backend="tesseract",
# Only process reasonably sized images
min_dimensions=(100, 100),
max_dimensions=(3000, 3000),
),
)
result = await extract_file("presentation.pptx", config=config)
print(f"Main content: {len(result.content)} characters")
print(f"Extracted {len(result.images)} unique images")
print(f"OCR processed {len(result.image_ocr_results)} images")
# Process OCR results
for i, ocr_result in enumerate(result.image_ocr_results):
image = ocr_result.image
text = ocr_result.ocr_result.content
confidence = ocr_result.confidence_score
print(f"\nImage {i+1}: {image.filename or 'unnamed'}")
print(f" Dimensions: {image.dimensions}")
print(f" OCR Confidence: {confidence:.2f}" if confidence else " No confidence score")
if ocr_result.skipped_reason:
print(f" Skipped: {ocr_result.skipped_reason}")
else:
print(f" Extracted text: {text[:100]}...")
|
Advanced Image OCR Configuration
Use different OCR backends and configurations for optimal results:
| from kreuzberg import extract_file, ExtractionConfig, TesseractConfig, EasyOCRConfig, PSMMode
async def advanced_image_ocr():
# Tesseract with multilingual support for technical documents
tesseract_config = TesseractConfig(
language="eng+deu", # English and German
psm=PSMMode.SINGLE_BLOCK, # Treat each image as single text block
output_format="text",
tessedit_char_whitelist="0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz .-,",
)
config = ExtractionConfig(
extract_images=True,
image_ocr_config=ImageOCRConfig(
enabled=True,
backend="tesseract",
backend_config=tesseract_config,
min_dimensions=(150, 50), # Allow narrow images like table headers
max_dimensions=(4000, 4000),
),
)
result = await extract_file("technical_manual.pdf", config=config)
# EasyOCR for natural scene text and photos
easyocr_config = EasyOCRConfig(
language_list=["en"],
gpu=False, # Use CPU processing
confidence_threshold=0.4, # Lower threshold for challenging images
)
config = ExtractionConfig(
extract_images=True,
image_ocr_config=ImageOCRConfig(
enabled=True,
backend="easyocr",
backend_config=easyocr_config,
),
)
result = await extract_file("document_with_photos.pdf", config=config)
# Filter OCR results by confidence
high_confidence_results = [
ocr_result for ocr_result in result.image_ocr_results if ocr_result.confidence_score and ocr_result.confidence_score > 0.7
]
print(f"High-confidence OCR results: {len(high_confidence_results)}")
|
Processing Different Document Types
Image extraction works with various document formats:
| from kreuzberg import extract_file, ExtractionConfig, ImageOCRConfig
async def extract_from_various_formats():
config = ExtractionConfig(
extract_images=True,
image_ocr_config=ImageOCRConfig(
enabled=True,
backend="tesseract",
),
)
# PDF documents - embedded images and graphics
pdf_result = await extract_file("report.pdf", config=config)
print(f"PDF: {len(pdf_result.images)} images extracted")
# PowerPoint presentations - slide images and shapes
pptx_result = await extract_file("presentation.pptx", config=config)
print(f"PPTX: {len(pptx_result.images)} images extracted")
# HTML documents - inline and base64 images
html_result = await extract_file("webpage.html", config=config)
print(f"HTML: {len(html_result.images)} images extracted")
# Email messages - attachments and inline images
email_result = await extract_file("message.eml", config=config)
print(f"Email: {len(email_result.images)} images extracted")
# Word documents - embedded images and charts
docx_result = await extract_file("document.docx", config=config)
print(f"DOCX: {len(docx_result.images)} images extracted")
|
Control performance and resource usage:
| from kreuzberg import extract_file, ExtractionConfig, ImageOCRConfig
async def optimized_image_processing():
# Fast processing for large batches
fast_config = ExtractionConfig(
extract_images=True,
deduplicate_images=True, # Remove duplicates
# image_ocr_config not set, so OCR is disabled
)
# Quality processing for important documents
quality_config = ExtractionConfig(
extract_images=True,
deduplicate_images=True,
image_ocr_config=ImageOCRConfig(
enabled=True,
backend="tesseract",
min_dimensions=(50, 50), # Process smaller images
max_dimensions=(5000, 5000), # Allow larger images
),
)
# Selective processing based on image size
selective_config = ExtractionConfig(
extract_images=True,
deduplicate_images=True,
image_ocr_config=ImageOCRConfig(
enabled=True,
backend="tesseract",
min_dimensions=(300, 100), # Good for charts and tables
max_dimensions=(3000, 3000),
),
)
# Process with different configs
for config_name, config in [("fast", fast_config), ("quality", quality_config), ("selective", selective_config)]:
result = await extract_file("large_document.pdf", config=config)
print(f"{config_name.title()} mode: {len(result.images)} images, " f"{len(result.image_ocr_results)} OCR results")
|
Use image extraction alongside other Kreuzberg features:
| from kreuzberg import extract_file, ExtractionConfig, ImageOCRConfig
async def comprehensive_extraction():
config = ExtractionConfig(
# Text extraction
chunk_content=True,
max_chars=1000,
# Image extraction
extract_images=True,
image_ocr_config=ImageOCRConfig(
enabled=True,
backend="tesseract",
),
# Table extraction
extract_tables=True,
# Entity and keyword extraction
extract_entities=True,
extract_keywords=True,
keyword_count=10,
# Language detection
auto_detect_language=True,
)
result = await extract_file("comprehensive_document.pdf", config=config)
print("=== Comprehensive Extraction Results ===")
print(f"Main content: {len(result.content)} characters")
print(f"Content chunks: {len(result.chunks)}")
print(f"Extracted images: {len(result.images)}")
print(f"Image OCR results: {len(result.image_ocr_results)}")
print(f"Tables: {len(result.tables)}")
print(f"Detected languages: {result.detected_languages}")
print(f"Keywords: {len(result.keywords) if result.keywords else 0}")
print(f"Entities: {len(result.entities) if result.entities else 0}")
# Combine text from main content and image OCR
all_text = result.content
for ocr_result in result.image_ocr_results:
if ocr_result.ocr_result.content.strip():
all_text += "\n\nFrom image OCR:\n" + ocr_result.ocr_result.content
print(f"Total text (including OCR): {len(all_text)} characters")
|
JSON and Structured Data Extraction
| from kreuzberg import extract_file_sync
# Simple JSON extraction
result = extract_file_sync("data.json")
print(result.content)
# Metadata includes detected text fields
print(f"Title: {result.metadata.get('title')}")
print(f"Description: {result.metadata.get('description')}")
|
| from kreuzberg import extract_file_sync, ExtractionConfig, JSONExtractionConfig
# Configure advanced JSON extraction
json_config = JSONExtractionConfig(
extract_schema=True, # Extract JSON structure
custom_text_field_patterns=frozenset({"summary", "abstract"}), # Custom fields
include_type_info=True, # Add type annotations
flatten_nested_objects=True, # Flatten nested structures
max_depth=5, # Limit schema depth
array_item_limit=100, # Limit array processing
)
config = ExtractionConfig(json_config=json_config)
result = extract_file_sync("complex.json", config=config)
# Access schema information
if "json_schema" in result.metadata:
schema = result.metadata["json_schema"]
print(f"Root type: {schema['type']}")
print(f"Properties: {list(schema.get('properties', {}).keys())}")
# Access nested attributes with dotted notation
if "attributes" in result.metadata:
attrs = result.metadata["attributes"]
# Nested fields like {"info": {"title": "Example"}} become "info.title"
print(f"Nested title: {attrs.get('info.title')}")
|
YAML and TOML Processing
| from kreuzberg import extract_file_sync
# YAML extraction (similar to JSON)
yaml_result = extract_file_sync("config.yaml")
print(yaml_result.content)
# TOML extraction
toml_result = extract_file_sync("pyproject.toml")
print(toml_result.content)
# Both formats support the same metadata extraction as JSON
print(f"Package name: {toml_result.metadata.get('name')}")
|
Working with API Responses
| import httpx
from kreuzberg import extract_bytes_sync, ExtractionConfig, JSONExtractionConfig
# Fetch JSON from API
response = httpx.get("https://api.example.com/data")
# Extract with schema
config = ExtractionConfig(json_config=JSONExtractionConfig(extract_schema=True))
result = extract_bytes_sync(response.content, mime_type="application/json", config=config)
print(f"API Response: {result.content}")
print(f"Schema: {result.metadata.get('json_schema')}")
|
Batch Processing
| from kreuzberg import batch_extract_file, ExtractionConfig
async def process_documents():
file_paths = ["document1.pdf", "document2.docx", "data.json", "image.jpg"]
config = ExtractionConfig() # Optional: configure extraction options
results = await batch_extract_file(file_paths, config=config)
for path, result in zip(file_paths, results):
print(f"File: {path}")
print(f"Content: {result.content[:100]}...")
|
Working with Bytes
| from kreuzberg import extract_bytes, ExtractionConfig
async def process_upload(file_content: bytes, mime_type: str):
# Extract text from uploaded file content
config = ExtractionConfig() # Optional: configure extraction options
result = await extract_bytes(file_content, mime_type=mime_type, config=config)
print(f"Content: {result.content[:100]}...")
# Access metadata
if result.metadata:
for key, value in result.metadata.items():
print(f"{key}: {value}")
|
Keywords
Kreuzberg supports keywords and regex extraction as follows:
| from kreuzberg import ExtractionConfig, extract_file
async def extract_keywords():
config = ExtractionConfig(
extract_keywords=True,
keyword_count=5, # defaults to 10 if not set
)
result = await extract_file(
"document.pdf",
config=config,
)
print(f"Keywords: {result.keywords}")
|
Entity and Keyword Extraction
Kreuzberg can extract named entities using spaCy and keywords using KeyBERT. It automatically detects entities like people, organizations, locations, and more, plus supports custom regex patterns:
| from kreuzberg import ExtractionConfig, extract_file, SpacyEntityExtractionConfig
async def extract_entities_and_keywords():
# Basic extraction
config = ExtractionConfig(
extract_entities=True,
extract_keywords=True,
keyword_count=5,
custom_entity_patterns={
"INVOICE_ID": r"INV-\d+",
"EMAIL": r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+",
},
)
result = await extract_file("document.pdf", config=config)
# Print extracted entities
if result.entities:
for entity in result.entities:
print(f"{entity.type}: {entity.text}")
# Print extracted keywords
if result.keywords:
for keyword, score in result.keywords:
print(f"Keyword: {keyword} (score: {score:.3f})")
async def extract_multilingual_entities():
# Configure spaCy for multiple languages
spacy_config = SpacyEntityExtractionConfig(
language_models={
"en": "en_core_web_sm",
"de": "de_core_news_sm",
"fr": "fr_core_news_sm",
},
fallback_to_multilingual=True,
)
config = ExtractionConfig(
auto_detect_language=True, # Automatically detect document languages
extract_entities=True,
spacy_entity_extraction_config=spacy_config,
)
result = await extract_file("multilingual_document.pdf", config=config)
if result.detected_languages:
print(f"Detected languages: {result.detected_languages}")
if result.entities:
print(f"Extracted {len(result.entities)} entities")
for entity in result.entities:
print(f" {entity.type}: {entity.text}")
|
Synchronous API
For cases where async isn't needed or available:
| from kreuzberg import extract_file_sync, batch_extract_file_sync, ExtractionConfig
# Configuration for extraction
config = ExtractionConfig() # Optional: configure extraction options
# Single file extraction
result = extract_file_sync("document.pdf", config=config)
print(result.content)
# Batch processing
file_paths = ["document1.pdf", "document2.docx", "image.jpg"]
results = batch_extract_file_sync(file_paths, config=config)
for path, result in zip(file_paths, results):
print(f"File: {path}")
print(f"Content: {result.content[:100]}...")
|
Error Handling
| from kreuzberg import extract_file, ExtractionConfig
from kreuzberg import KreuzbergError, MissingDependencyError, OCRError
async def safe_extract(path):
try:
config = ExtractionConfig() # Optional: configure extraction options
result = await extract_file(path, config=config)
return result.content
except MissingDependencyError as e:
print(f"Missing dependency: {e}")
print("Please install the required dependencies.")
except OCRError as e:
print(f"OCR processing failed: {e}")
except KreuzbergError as e:
print(f"Extraction error: {e}")
except Exception as e:
print(f"Unexpected error: {e}")
return None
|