Configuration¶
graph TD
ExtractionConfig[ExtractionConfig<br/>Main Configuration]
ExtractionConfig --> OCR[OcrConfig<br/>OCR Backend Settings]
ExtractionConfig --> PDF[PdfConfig<br/>PDF Options]
ExtractionConfig --> Images[ImageExtractionConfig<br/>Image Settings]
ExtractionConfig --> Chunking[ChunkingConfig<br/>Text Chunking]
ExtractionConfig --> TokenRed[TokenReductionConfig<br/>Token Optimization]
ExtractionConfig --> LangDet[LanguageDetectionConfig<br/>Language Detection]
ExtractionConfig --> PostProc[PostProcessorConfig<br/>Post-Processing]
OCR --> Tesseract[TesseractConfig<br/>Tesseract Options]
Tesseract --> ImgPreproc[ImagePreprocessingConfig<br/>Image Enhancement]
Chunking --> Embedding[EmbeddingConfig<br/>Vector Embeddings]
Embedding --> Model[EmbeddingModelType<br/>Model Selection]
style ExtractionConfig fill:#4CAF50,color:#fff
style OCR fill:#87CEEB
style Chunking fill:#FFD700
style Embedding fill:#FFB6C1

Kreuzberg's behavior is controlled through configuration objects. All settings are optional with sensible defaults, allowing you to configure only what you need.
Configuration Methods¶
Kreuzberg supports four ways to configure extraction:
package main
import (
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/kreuzberg"
)
func main() {
config, err := kreuzberg.LoadExtractionConfigFromFile("")
if err != nil {
log.Fatalf("discover config failed: %v", err)
}
result, err := kreuzberg.ExtractFileSync("document.pdf", config)
if err != nil {
log.Fatalf("extract failed: %v", err)
}
log.Printf("Content length: %d", len(result.Content))
}
import asyncio
from kreuzberg import ExtractionConfig, extract_file
async def main() -> None:
config: ExtractionConfig = ExtractionConfig()
result = await extract_file("document.pdf", config=config)
content: str = result.content
content_preview: str = content[:100]
print(f"Content preview: {content_preview}")
print(f"Total length: {len(content)}")
asyncio.run(main())
import { initWasm, extractBytes } from '@kreuzberg/wasm';
await initWasm();
const config = {
use_cache: true,
enable_quality_processing: true,
ocr: {
backend: 'tesseract-wasm',
language: 'eng'
}
};
const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, 'application/pdf', config);
console.log(result.content);
Configuration Discovery¶
flowchart TD
Start[ExtractionConfig.discover] --> Current{Check Current Directory}
Current -->|Found| LoadCurrent[Load ./kreuzberg.*]
Current -->|Not Found| User{Check User Config}
User -->|Found| LoadUser[Load ~/.config/kreuzberg/config.*]
User -->|Not Found| System{Check System Config}
System -->|Found| LoadSystem[Load /etc/kreuzberg/config.*]
System -->|Not Found| Default[Use Default Config]
LoadCurrent --> Merge[Merge with Defaults]
LoadUser --> Merge
LoadSystem --> Merge
Default --> Return[Return Config]
Merge --> Return
style LoadCurrent fill:#90EE90
style LoadUser fill:#87CEEB
style LoadSystem fill:#FFD700
style Default fill:#FFB6C1

Kreuzberg automatically discovers configuration files in the following locations (in order):
- Current directory: ./kreuzberg.{toml,yaml,yml,json}
- User config: ~/.config/kreuzberg/config.{toml,yaml,yml,json}
- System config: /etc/kreuzberg/config.{toml,yaml,yml,json}
package main
import (
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/kreuzberg"
)
func main() {
config, err := kreuzberg.LoadExtractionConfigFromFile("")
if err != nil {
log.Fatalf("discover config failed: %v", err)
}
result, err := kreuzberg.ExtractFileSync("document.pdf", config)
if err != nil {
log.Fatalf("extract failed: %v", err)
}
log.Printf("Content length: %d", len(result.Content))
}
import asyncio
from kreuzberg import ExtractionConfig, extract_file
async def main() -> None:
config: ExtractionConfig = ExtractionConfig()
result = await extract_file("document.pdf", config=config)
content: str = result.content
content_preview: str = content[:100]
print(f"Content preview: {content_preview}")
print(f"Total length: {len(content)}")
asyncio.run(main())
import { initWasm, extractBytes } from '@kreuzberg/wasm';
await initWasm();
const config = {
use_cache: true,
enable_quality_processing: true,
ocr: {
backend: 'tesseract-wasm',
language: 'eng'
}
};
const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, 'application/pdf', config);
console.log(result.content);
ExtractionConfig¶
The main configuration object controlling extraction behavior.
| Field | Type | Default | Description |
|---|---|---|---|
use_cache | bool | true | Enable caching of extraction results |
enable_quality_processing | bool | true | Enable quality post-processing |
force_ocr | bool | false | Force OCR even for text-based PDFs |
ocr | OcrConfig? | None | OCR configuration (if None, OCR disabled) |
pdf_options | PdfConfig? | None | PDF-specific configuration |
images | ImageExtractionConfig? | None | Image extraction configuration |
chunking | ChunkingConfig? | None | Text chunking configuration |
token_reduction | TokenReductionConfig? | None | Token reduction configuration |
language_detection | LanguageDetectionConfig? | None | Language detection configuration |
keywords | KeywordConfig? | None | Keyword extraction configuration (requires keywords-yake or keywords-rake feature flag) |
postprocessor | PostProcessorConfig? | None | Post-processing pipeline configuration |
Basic Example¶
package main
import (
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/kreuzberg"
)
func main() {
useCache := true
enableQP := true
result, err := kreuzberg.ExtractFileSync("document.pdf", &kreuzberg.ExtractionConfig{
UseCache: &useCache,
EnableQualityProcessing: &enableQP,
})
if err != nil {
log.Fatalf("extract failed: %v", err)
}
log.Println("content length:", len(result.Content))
}
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.config.ExtractionConfig;
ExtractionConfig config = ExtractionConfig.builder()
.useCache(true)
.enableQualityProcessing(true)
.build();
ExtractionResult result = Kreuzberg.extractFile("document.pdf", config);
use kreuzberg::{extract_file, ExtractionConfig};
#[tokio::main]
async fn main() -> kreuzberg::Result<()> {
let config = ExtractionConfig {
use_cache: true,
enable_quality_processing: true,
..Default::default()
};
let result = extract_file("document.pdf", None, &config).await?;
println!("{}", result.content);
Ok(())
}
import { initWasm, extractBytes } from '@kreuzberg/wasm';
await initWasm();
const config = {
ocr: {
backend: 'tesseract-wasm',
language: 'eng'
},
images: {
extractImages: true
}
};
const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, 'application/pdf', config);
console.log(result.content);
OcrConfig¶
Configuration for OCR processing. Set this field to enable OCR on images and scanned PDFs.
| Field | Type | Default | Description |
|---|---|---|---|
backend | str | "tesseract" | OCR backend: "tesseract", "easyocr", "paddleocr" |
language | str | "eng" | Language code(s), e.g., "eng", "eng+fra" |
tesseract_config | TesseractConfig? | None | Tesseract-specific configuration |
Example¶
import dev.kreuzberg.config.ExtractionConfig;
import dev.kreuzberg.config.OcrConfig;
import dev.kreuzberg.config.TesseractConfig;
ExtractionConfig config = ExtractionConfig.builder()
.ocr(OcrConfig.builder()
.backend("tesseract")
.language("eng+fra")
.tesseractConfig(TesseractConfig.builder()
.psm(3)
.build())
.build())
.build();
import asyncio
from kreuzberg import ExtractionConfig, OcrConfig, TesseractConfig, extract_file
async def main() -> None:
config: ExtractionConfig = ExtractionConfig(
ocr=OcrConfig(
backend="tesseract", language="eng+fra",
tesseract_config=TesseractConfig(psm=3)
)
)
result = await extract_file("document.pdf", config=config)
print(result.content)
asyncio.run(main())
use kreuzberg::{ExtractionConfig, OcrConfig, TesseractConfig};
fn main() {
let config = ExtractionConfig {
ocr: Some(OcrConfig {
backend: "tesseract".to_string(),
language: Some("eng+fra".to_string()),
tesseract_config: Some(TesseractConfig {
psm: 3,
..Default::default()
}),
}),
..Default::default()
};
}
TesseractConfig¶
Tesseract OCR engine configuration.
| Field | Type | Default | Description |
|---|---|---|---|
language | str | "eng" | Language code(s), e.g., "eng", "eng+fra" |
psm | int | 3 | Page segmentation mode (0-13) |
output_format | str | "text" | Output format: "text", "hocr" |
oem | int | 3 | OCR engine mode (0-3) |
min_confidence | float | 0.0 | Minimum confidence threshold (0.0-1.0) |
preprocessing | ImagePreprocessingConfig? | None | Image preprocessing configuration |
enable_table_detection | bool | false | Enable table detection and extraction |
table_min_confidence | float | 0.5 | Minimum confidence for table cells |
table_column_threshold | int | 50 | Pixel threshold for column detection |
table_row_threshold_ratio | float | 0.5 | Row threshold ratio |
use_cache | bool | true | Enable OCR result caching |
classify_use_pre_adapted_templates | bool | false | Tesseract variable |
language_model_ngram_on | bool | false | Tesseract variable |
tessedit_dont_blkrej_good_wds | bool | false | Tesseract variable |
tessedit_dont_rowrej_good_wds | bool | false | Tesseract variable |
tessedit_enable_dict_correction | bool | false | Tesseract variable |
tessedit_char_whitelist | str | "" | Allowed characters |
tessedit_char_blacklist | str | "" | Disallowed characters |
tessedit_use_primary_params_model | bool | false | Tesseract variable |
textord_space_size_is_variable | bool | false | Tesseract variable |
thresholding_method | bool | false | Tesseract variable |
Page Segmentation Modes (PSM)¶
- 0: Orientation and script detection only
- 1: Automatic page segmentation with OSD
- 2: Automatic page segmentation (no OSD, no OCR)
- 3: Fully automatic page segmentation (default)
- 4: Single column of text
- 5: Single uniform block of vertically aligned text
- 6: Single uniform block of text
- 7: Single text line
- 8: Single word
- 9: Single word in a circle
- 10: Single character
- 11: Sparse text, no particular order
- 12: Sparse text with OSD
- 13: Raw line (no assumptions about text layout)
OCR Engine Modes (OEM)¶
- 0: Legacy engine only
- 1: Neural nets LSTM engine only
- 2: Legacy + LSTM engines
- 3: Default based on what's available (default)
Example¶
using Kreuzberg;
var config = new ExtractionConfig
{
Ocr = new OcrConfig
{
Language = "eng+fra+deu",
TesseractConfig = new TesseractConfig
{
Psm = 6,
Oem = 1,
MinConfidence = 0.8m,
EnableTableDetection = true
}
}
};
var result = await KreuzbergClient.ExtractFileAsync("document.pdf", config);
Console.WriteLine($"Content: {result.Content[..Math.Min(100, result.Content.Length)]}");
package main
import (
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/kreuzberg"
)
func main() {
psm := 6
oem := 1
minConf := 0.8
lang := "eng+fra+deu"
whitelist := "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789 .,!?"
config := &kreuzberg.ExtractionConfig{
OCR: &kreuzberg.OCRConfig{
Backend: "tesseract",
Language: &lang,
Tesseract: &kreuzberg.TesseractConfig{
PSM: &psm,
OEM: &oem,
MinConfidence: &minConf,
EnableTableDetection: kreuzberg.BoolPtr(true),
TesseditCharWhitelist: whitelist,
},
},
}
result, err := kreuzberg.ExtractFileSync("document.pdf", config)
if err != nil {
log.Fatalf("extract failed: %v", err)
}
log.Println("content length:", len(result.Content))
}
import dev.kreuzberg.config.ExtractionConfig;
import dev.kreuzberg.config.OcrConfig;
import dev.kreuzberg.config.TesseractConfig;
ExtractionConfig config = ExtractionConfig.builder()
.ocr(OcrConfig.builder()
.language("eng+fra+deu")
.tesseractConfig(TesseractConfig.builder()
.psm(6)
.oem(1)
.minConfidence(0.8)
.tesseditCharWhitelist("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789 .,!?")
.enableTableDetection(true)
.build())
.build())
.build();
import asyncio
from kreuzberg import ExtractionConfig, OcrConfig, TesseractConfig, extract_file
async def main() -> None:
config: ExtractionConfig = ExtractionConfig(
ocr=OcrConfig(
language="eng+fra+deu",
tesseract_config=TesseractConfig(
psm=6,
oem=1,
min_confidence=0.8,
enable_table_detection=True,
),
)
)
result = await extract_file("document.pdf", config=config)
print(f"Content: {result.content[:100]}")
asyncio.run(main())
require 'kreuzberg'
config = Kreuzberg::Config::Extraction.new(
ocr: Kreuzberg::Config::OCR.new(
language: 'eng+fra+deu',
tesseract_config: Kreuzberg::Config::Tesseract.new(
psm: 6,
oem: 1,
min_confidence: 0.8,
tessedit_char_whitelist: 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789 .,!?',
enable_table_detection: true
)
)
)
use kreuzberg::{ExtractionConfig, OcrConfig, TesseractConfig};
fn main() {
let config = ExtractionConfig {
ocr: Some(OcrConfig {
language: Some("eng+fra+deu".to_string()),
tesseract_config: Some(TesseractConfig {
psm: Some(6),
oem: Some(1),
min_confidence: Some(0.8),
tessedit_char_whitelist: Some("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789 .,!?".to_string()),
enable_table_detection: Some(true),
..Default::default()
}),
..Default::default()
}),
..Default::default()
};
println!("{:?}", config.ocr);
}
import { extractFile } from '@kreuzberg/node';
const config = {
ocr: {
backend: 'tesseract',
language: 'eng+fra+deu',
tesseractConfig: {
psm: 6,
tesseditCharWhitelist: 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789 .,!?',
enableTableDetection: true,
},
},
};
const result = await extractFile('document.pdf', null, config);
console.log(result.content);
ImagePreprocessingConfig¶
Image preprocessing configuration for OCR.
| Field | Type | Default | Description |
|---|---|---|---|
target_dpi | int | 300 | Target DPI for OCR processing |
auto_rotate | bool | true | Automatically rotate images based on orientation |
deskew | bool | true | Apply deskewing to straighten tilted text |
denoise | bool | true | Apply denoising filter |
contrast_enhance | bool | true | Enhance image contrast |
binarization_method | str | "otsu" | Binarization method: "otsu", "adaptive", "none" |
invert_colors | bool | false | Invert image colors (useful for white-on-black text) |
Example¶
using Kreuzberg;
var config = new ExtractionConfig
{
Ocr = new OcrConfig
{
TesseractConfig = new TesseractConfig
{
Preprocessing = new ImagePreprocessingConfig
{
TargetDpi = 300,
Denoise = true,
Deskew = true,
ContrastEnhance = true,
BinarizationMethod = "otsu"
}
}
}
};
var result = await KreuzbergClient.ExtractFileAsync("scanned.pdf", config);
Console.WriteLine($"Content: {result.Content[..Math.Min(100, result.Content.Length)]}");
package main
import (
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/kreuzberg"
)
func main() {
targetDPI := 300
config := &kreuzberg.ExtractionConfig{
OCR: &kreuzberg.OCRConfig{
Tesseract: &kreuzberg.TesseractConfig{
Preprocessing: &kreuzberg.ImagePreprocessingConfig{
TargetDPI: &targetDPI,
Denoise: kreuzberg.BoolPtr(true),
Deskew: kreuzberg.BoolPtr(true),
ContrastEnhance: kreuzberg.BoolPtr(true),
BinarizationMode: kreuzberg.StringPtr("otsu"),
},
},
},
}
result, err := kreuzberg.ExtractFileSync("document.pdf", config)
if err != nil {
log.Fatalf("extract failed: %v", err)
}
log.Println("content length:", len(result.Content))
}
import dev.kreuzberg.config.ExtractionConfig;
import dev.kreuzberg.config.ImagePreprocessingConfig;
import dev.kreuzberg.config.OcrConfig;
import dev.kreuzberg.config.TesseractConfig;
ExtractionConfig config = ExtractionConfig.builder()
.ocr(OcrConfig.builder()
.tesseractConfig(TesseractConfig.builder()
.preprocessing(ImagePreprocessingConfig.builder()
.targetDpi(300)
.denoise(true)
.deskew(true)
.contrastEnhance(true)
.binarizationMethod("otsu")
.build())
.build())
.build())
.build();
import asyncio
from kreuzberg import (
ExtractionConfig,
OcrConfig,
TesseractConfig,
ImagePreprocessingConfig,
extract_file,
)
async def main() -> None:
config: ExtractionConfig = ExtractionConfig(
ocr=OcrConfig(
tesseract_config=TesseractConfig(
preprocessing=ImagePreprocessingConfig(
target_dpi=300,
denoise=True,
deskew=True,
contrast_enhance=True,
binarization_method="otsu",
)
)
)
)
result = await extract_file("scanned.pdf", config=config)
print(f"Content: {result.content[:100]}")
asyncio.run(main())
require 'kreuzberg'
config = Kreuzberg::Config::Extraction.new(
ocr: Kreuzberg::Config::OCR.new(
tesseract_config: Kreuzberg::Config::Tesseract.new(
preprocessing: Kreuzberg::Config::ImagePreprocessing.new(
target_dpi: 300,
denoise: true,
deskew: true,
contrast_enhance: true,
binarization_method: 'otsu'
)
)
)
)
use kreuzberg::{ExtractionConfig, ImagePreprocessingConfig, OcrConfig, TesseractConfig};
fn main() {
let config = ExtractionConfig {
ocr: Some(OcrConfig {
tesseract_config: Some(TesseractConfig {
preprocessing: Some(ImagePreprocessingConfig {
target_dpi: Some(300),
denoise: Some(true),
deskew: Some(true),
contrast_enhance: Some(true),
binarization_method: Some("otsu".to_string()),
..Default::default()
}),
..Default::default()
}),
..Default::default()
}),
..Default::default()
};
println!("{:?}", config.ocr);
}
PdfConfig¶
PDF-specific extraction configuration.
| Field | Type | Default | Description |
|---|---|---|---|
extract_images | bool | true | Extract embedded images from PDF |
extract_metadata | bool | true | Extract PDF metadata (title, author, etc.) |
passwords | list[str]? | None | List of passwords to try for encrypted PDFs |
Example¶
using Kreuzberg;
var config = new ExtractionConfig
{
PdfOptions = new PdfConfig
{
ExtractImages = true,
ExtractMetadata = true,
Passwords = new List<string> { "password1", "password2" }
}
};
var result = await KreuzbergClient.ExtractFileAsync("document.pdf", config);
Console.WriteLine($"Content: {result.Content[..Math.Min(100, result.Content.Length)]}");
package main
import (
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/kreuzberg"
)
func main() {
pw := []string{"password1", "password2"}
result, err := kreuzberg.ExtractFileSync("document.pdf", &kreuzberg.ExtractionConfig{
PdfOptions: &kreuzberg.PdfConfig{
ExtractImages: kreuzberg.BoolPtr(true),
ExtractMetadata: kreuzberg.BoolPtr(true),
Passwords: pw,
},
})
if err != nil {
log.Fatalf("extract failed: %v", err)
}
log.Println("content length:", len(result.Content))
}
import dev.kreuzberg.config.ExtractionConfig;
import dev.kreuzberg.config.PdfConfig;
import java.util.Arrays;
ExtractionConfig config = ExtractionConfig.builder()
.pdfOptions(PdfConfig.builder()
.extractImages(true)
.extractMetadata(true)
.passwords(Arrays.asList("password1", "password2"))
.build())
.build();
import asyncio
from kreuzberg import ExtractionConfig, PdfConfig, extract_file
async def main() -> None:
config: ExtractionConfig = ExtractionConfig(
pdf_options=PdfConfig(
extract_images=True,
extract_metadata=True,
passwords=["password1", "password2"],
)
)
result = await extract_file("document.pdf", config=config)
print(f"Content: {result.content[:100]}")
asyncio.run(main())
use kreuzberg::{ExtractionConfig, PdfConfig};
fn main() {
let config = ExtractionConfig {
pdf_options: Some(PdfConfig {
extract_images: Some(true),
extract_metadata: Some(true),
passwords: Some(vec!["password1".to_string(), "password2".to_string()]),
}),
..Default::default()
};
println!("{:?}", config.pdf_options);
}
Page Configuration¶
Configure page extraction and boundary tracking.
Overview¶
Page tracking enables:
- Per-page content extraction
- Byte-accurate page boundaries
- Automatic chunk-to-page mapping
- Page markers for LLM context
Configuration Options¶
| Field | Type | Default | Description |
|---|---|---|---|
extract_pages | bool | false | Extract pages array with per-page content |
insert_page_markers | bool | false | Insert page markers in combined content |
marker_format | String | "\n\n<!-- PAGE {page_num} -->\n\n" | Page marker template |
Example Configuration¶
Field Details¶
extract_pages: When true, populates ExtractionResult.pages with per-page content. Each page contains its text, tables, and images separately.
insert_page_markers: When true, inserts page markers into the combined content string at page boundaries. Useful for LLMs to understand document structure.
marker_format: Template string for page markers. Use {page_num} placeholder for the page number. Default HTML comment format is LLM-friendly.
Format-Specific Behavior¶
PDF: Full byte-accurate page tracking with O(1) lookup performance. Every page boundary is tracked precisely.
PPTX: Slide boundaries tracked. Each slide is treated as a "page" with PageUnitType::Slide.
DOCX: Best-effort detection using explicit page breaks. Only pages with <w:br type="page"/> tags are tracked.
Other formats: Page tracking not available. PageStructure will be None/null.
Byte Offsets vs Character Offsets¶
Page boundaries use byte offsets (not character offsets) for UTF-8 safety and performance:
# Correct: Use byte offsets
boundary = boundaries[0]
page_text = content.encode('utf-8')[boundary.byte_start:boundary.byte_end].decode('utf-8')
# Incorrect: Don't use as character indices
page_text = content[boundary.byte_start:boundary.byte_end] # Wrong for multi-byte chars
See Byte Offset Handling in the migration guide.
ImageExtractionConfig¶
Configuration for extracting images from documents.
| Field | Type | Default | Description |
|---|---|---|---|
extract_images | bool | true | Extract images from documents |
target_dpi | int | 300 | Target DPI for extracted images |
max_image_dimension | int | 4096 | Maximum image dimension (width or height) in pixels |
auto_adjust_dpi | bool | true | Automatically adjust DPI based on image size |
min_dpi | int | 72 | Minimum DPI when auto-adjusting |
max_dpi | int | 600 | Maximum DPI when auto-adjusting |
Example¶
using Kreuzberg;
var config = new ExtractionConfig
{
Images = new ImageExtractionConfig
{
ExtractImages = true,
TargetDpi = 200,
MaxImageDimension = 2048,
AutoAdjustDpi = true
}
};
var result = await KreuzbergClient.ExtractFileAsync("document.pdf", config);
Console.WriteLine($"Extracted: {result.Content[..Math.Min(100, result.Content.Length)]}");
package main
import (
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/kreuzberg"
)
func main() {
targetDPI := 200
maxDim := 2048
result, err := kreuzberg.ExtractFileSync("document.pdf", &kreuzberg.ExtractionConfig{
ImageExtraction: &kreuzberg.ImageExtractionConfig{
ExtractImages: kreuzberg.BoolPtr(true),
TargetDPI: &targetDPI,
MaxImageDimension: &maxDim,
AutoAdjustDPI: kreuzberg.BoolPtr(true),
},
})
if err != nil {
log.Fatalf("extract failed: %v", err)
}
log.Println("content length:", len(result.Content))
}
import dev.kreuzberg.config.ExtractionConfig;
import dev.kreuzberg.config.ImageExtractionConfig;
ExtractionConfig config = ExtractionConfig.builder()
.imageExtraction(ImageExtractionConfig.builder()
.extractImages(true)
.targetDpi(200)
.maxImageDimension(2048)
.autoAdjustDpi(true)
.build())
.build();
import asyncio
from kreuzberg import ExtractionConfig, ImageExtractionConfig, extract_file
async def main() -> None:
config: ExtractionConfig = ExtractionConfig(
images=ImageExtractionConfig(
extract_images=True,
target_dpi=200,
max_image_dimension=2048,
auto_adjust_dpi=True,
)
)
result = await extract_file("document.pdf", config=config)
print(f"Extracted: {result.content[:100]}")
asyncio.run(main())
use kreuzberg::{ExtractionConfig, ImageExtractionConfig};
fn main() {
let config = ExtractionConfig {
images: Some(ImageExtractionConfig {
extract_images: Some(true),
target_dpi: Some(200),
max_image_dimension: Some(2048),
auto_adjust_dpi: Some(true),
..Default::default()
}),
..Default::default()
};
println!("{:?}", config.images);
}
ChunkingConfig¶
Text chunking configuration for splitting extracted text into chunks.
| Field | Type | Default | Description |
|---|---|---|---|
max_chars | int | 1000 | Maximum chunk size in characters |
max_overlap | int | 200 | Overlap between chunks in characters |
embedding | EmbeddingConfig? | None | Embedding configuration for chunks |
preset | str? | None | Chunking preset: "small", "medium", "large" |
Example¶
using Kreuzberg;
using System;
using System.Collections.Generic;
using System.Threading.Tasks;
var config = new ExtractionConfig
{
Chunking = new ChunkingConfig
{
MaxChars = 512,
MaxOverlap = 50,
Embedding = new EmbeddingConfig
{
Model = EmbeddingModelType.Preset("balanced"),
Normalize = true,
BatchSize = 32,
ShowDownloadProgress = false
}
}
};
var result = await Kreuzberg.ExtractFileAsync("document.pdf", config);
var chunks = result.Chunks ?? new List<Chunk>();
foreach (var (index, chunk) in chunks.WithIndex())
{
var chunkId = $"doc_chunk_{index}";
Console.WriteLine($"Chunk {chunkId}: {chunk.Content[..Math.Min(50, chunk.Content.Length)]}");
if (chunk.Embedding != null)
{
Console.WriteLine($" Embedding dimensions: {chunk.Embedding.Length}");
}
}
internal static class EnumerableExtensions
{
public static IEnumerable<(int Index, T Item)> WithIndex<T>(
this IEnumerable<T> items)
{
var index = 0;
foreach (var item in items)
{
yield return (index++, item);
}
}
}
package main
import (
"fmt"
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/kreuzberg"
)
func main() {
maxChars := 1000
maxOverlap := 200
config := &kreuzberg.ExtractionConfig{
Chunking: &kreuzberg.ChunkingConfig{
MaxChars: &maxChars,
MaxOverlap: &maxOverlap,
},
}
result, err := kreuzberg.ExtractFileSync("document.pdf", config)
if err != nil {
log.Fatalf("extract failed: %v", err)
}
for i, chunk := range result.Chunks {
fmt.Printf("Chunk %d/%d (%d-%d)\n", i+1, chunk.Metadata.TotalChunks, chunk.Metadata.CharStart, chunk.Metadata.CharEnd)
fmt.Printf("%s...\n", chunk.Content[:min(len(chunk.Content), 100)])
}
}
func min(a, b int) int {
if a < b {
return a
}
return b
}
import dev.kreuzberg.config.ChunkingConfig;
import dev.kreuzberg.config.EmbeddingConfig;
import dev.kreuzberg.config.EmbeddingModelType;
import dev.kreuzberg.config.ExtractionConfig;
ExtractionConfig config = ExtractionConfig.builder()
.chunking(ChunkingConfig.builder()
.maxChars(1500)
.maxOverlap(200)
.embedding(EmbeddingConfig.builder()
.model(EmbeddingModelType.builder()
.type("preset")
.name("text-embedding-all-minilm-l6-v2")
.build())
.build())
.build())
.build();
use kreuzberg::{ChunkingConfig, EmbeddingConfig, EmbeddingModelType, ExtractionConfig};
fn main() {
let config = ExtractionConfig {
chunking: Some(ChunkingConfig {
max_chars: Some(1500),
max_overlap: Some(200),
embedding: Some(EmbeddingConfig {
model: Some(EmbeddingModelType {
r#type: "preset".to_string(),
name: Some("text-embedding-all-minilm-l6-v2".to_string()),
..Default::default()
}),
..Default::default()
}),
..Default::default()
}),
..Default::default()
};
println!("{:?}", config.chunking);
}
import { initWasm, extractBytes } from '@kreuzberg/wasm';
await initWasm();
const config = {
chunking: {
maxChars: 1000,
chunkOverlap: 100
}
};
const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, 'application/pdf', config);
result.chunks?.forEach((chunk, idx) => {
console.log(`Chunk ${idx}: ${chunk.content.substring(0, 50)}...`);
console.log(`Tokens: ${chunk.metadata?.token_count}`);
});
EmbeddingConfig¶
Configuration for generating embeddings from extracted text or chunks.
| Field | Type | Default | Description |
|---|---|---|---|
model | EmbeddingModelType | preset("all-MiniLM-L6-v2") | Embedding model configuration |
normalize | bool | true | Normalize embeddings to unit length |
batch_size | int | 32 | Batch size for embedding generation |
show_download_progress | bool | true | Show download progress for models |
cache_dir | str? | None | Custom cache directory for models |
EmbeddingModelType¶
Create embedding models using these factory methods:
- EmbeddingModelType.preset(name): Use a preset model
  - "all-MiniLM-L6-v2": Fast, 384-dimensional embeddings (default)
  - "all-mpnet-base-v2": High quality, 768-dimensional embeddings
  - "paraphrase-multilingual-MiniLM-L12-v2": Multilingual support
- EmbeddingModelType.fastembed(model, dimensions): Use a FastEmbed model
  - Example: fastembed("BAAI/bge-small-en-v1.5", 384)
- EmbeddingModelType.custom(model_id, dimensions): Use a custom model
  - Example: custom("sentence-transformers/all-MiniLM-L6-v2", 384)
Example¶
package main
import (
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/kreuzberg"
)
func main() {
maxChars := 1000
batchSize := 16
cfg := &kreuzberg.ExtractionConfig{
Chunking: &kreuzberg.ChunkingConfig{
MaxChars: &maxChars,
Embedding: &kreuzberg.EmbeddingConfig{
Model: &kreuzberg.EmbeddingModelType{
Type: "preset",
Name: "all-mpnet-base-v2",
},
BatchSize: &batchSize,
Normalize: kreuzberg.BoolPtr(true),
ShowDownloadProgress: kreuzberg.BoolPtr(true),
},
},
}
result, err := kreuzberg.ExtractFileSync("document.pdf", cfg)
if err != nil {
log.Fatalf("extract failed: %v", err)
}
log.Println("content length:", len(result.Content))
}
import dev.kreuzberg.config.ChunkingConfig;
import dev.kreuzberg.config.EmbeddingConfig;
import dev.kreuzberg.config.EmbeddingModelType;
import dev.kreuzberg.config.ExtractionConfig;
ExtractionConfig config = ExtractionConfig.builder()
.chunking(ChunkingConfig.builder()
.maxChars(1000)
.embedding(EmbeddingConfig.builder()
.model(EmbeddingModelType.builder()
.type("preset")
.name("all-mpnet-base-v2")
.build())
.batchSize(16)
.normalize(true)
.showDownloadProgress(true)
.build())
.build())
.build();
from kreuzberg import ExtractionConfig, ChunkingConfig, EmbeddingConfig, EmbeddingModelType
config = ExtractionConfig(
chunking=ChunkingConfig(
max_chars=1000,
embedding=EmbeddingConfig(
model=EmbeddingModelType.preset("all-mpnet-base-v2"),
batch_size=16,
normalize=True,
show_download_progress=True
)
)
)
require 'kreuzberg'
config = Kreuzberg::Config::Extraction.new(
chunking: Kreuzberg::Config::Chunking.new(
max_chars: 1000,
embedding: Kreuzberg::Config::Embedding.new(
model: Kreuzberg::EmbeddingModelType.new(
type: 'preset',
name: 'all-mpnet-base-v2'
),
batch_size: 16,
normalize: true,
show_download_progress: true
)
)
)
use kreuzberg::{ChunkingConfig, EmbeddingConfig, EmbeddingModelType, ExtractionConfig};
fn main() {
let config = ExtractionConfig {
chunking: Some(ChunkingConfig {
max_chars: Some(1000),
embedding: Some(EmbeddingConfig {
model: Some(EmbeddingModelType {
r#type: "preset".to_string(),
name: Some("all-mpnet-base-v2".to_string()),
..Default::default()
}),
batch_size: Some(16),
normalize: Some(true),
show_download_progress: Some(true),
..Default::default()
}),
..Default::default()
}),
..Default::default()
};
println!("{:?}", config.chunking);
}
import { extractFile } from '@kreuzberg/node';
const config = {
chunking: {
maxChars: 1000,
embedding: {
preset: 'quality',
},
},
};
const result = await extractFile('document.pdf', null, config);
if (result.chunks && result.chunks.length > 0) {
console.log(`Chunk embeddings: ${result.chunks[0].embedding?.length ?? 0} dimensions`);
}
TokenReductionConfig¶
Configuration for reducing token count in extracted text.
| Field | Type | Default | Description |
|---|---|---|---|
mode | str | "off" | Reduction mode: "off", "moderate", "aggressive" |
preserve_important_words | bool | true | Preserve important words during reduction |
Example¶
package main
import (
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/kreuzberg"
)
func main() {
preserve := true
result, err := kreuzberg.ExtractFileSync("document.pdf", &kreuzberg.ExtractionConfig{
TokenReduction: &kreuzberg.TokenReductionConfig{
Mode: "moderate",
PreserveImportantWords: &preserve,
},
})
if err != nil {
log.Fatalf("extract failed: %v", err)
}
log.Println("content length:", len(result.Content))
}
import asyncio
from kreuzberg import ExtractionConfig, TokenReductionConfig, extract_file
async def main() -> None:
config: ExtractionConfig = ExtractionConfig(
token_reduction=TokenReductionConfig(
mode="moderate", preserve_important_words=True
)
)
result = await extract_file("document.pdf", config=config)
print(f"Content length: {len(result.content)}")
asyncio.run(main())
LanguageDetectionConfig¶
Configuration for automatic language detection.
| Field | Type | Default | Description |
|---|---|---|---|
enabled | bool | true | Enable language detection |
min_confidence | float | 0.8 | Minimum confidence threshold (0.0-1.0) |
detect_multiple | bool | false | Detect multiple languages (vs. dominant only) |
Example¶
using Kreuzberg;
// Language detection settings: enabled, 0.9 confidence threshold, multiple languages.
var languageDetection = new LanguageDetectionConfig
{
    Enabled = true,
    MinConfidence = 0.9m,
    DetectMultiple = true
};
var config = new ExtractionConfig { LanguageDetection = languageDetection };

var result = await KreuzbergClient.ExtractFileAsync("document.pdf", config);

// Fall back to an empty list when DetectedLanguages is null.
var languages = result.DetectedLanguages ?? new List<string>();
Console.WriteLine($"Languages: {string.Join(", ", languages)}");
package main
import (
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/kreuzberg"
)
// main extracts document.pdf with language detection configured and logs the
// length of the extracted content.
func main() {
	cfg := &kreuzberg.ExtractionConfig{
		LanguageDetection: &kreuzberg.LanguageDetectionConfig{
			Enabled:        kreuzberg.BoolPtr(true),
			MinConfidence:  kreuzberg.FloatPtr(0.9),
			DetectMultiple: kreuzberg.BoolPtr(true),
		},
	}

	result, err := kreuzberg.ExtractFileSync("document.pdf", cfg)
	if err != nil {
		log.Fatalf("extract failed: %v", err)
	}
	log.Println("content length:", len(result.Content))
}
import asyncio
from kreuzberg import ExtractionConfig, LanguageDetectionConfig, extract_file
async def main() -> None:
    """Extract document.pdf with language detection configured and print the detected languages."""
    detection = LanguageDetectionConfig(
        enabled=True,
        min_confidence=0.9,
        detect_multiple=True,
    )
    config = ExtractionConfig(language_detection=detection)
    result = await extract_file("document.pdf", config=config)
    print(f"Languages: {result.detected_languages}")


asyncio.run(main())
use kreuzberg::{ExtractionConfig, LanguageDetectionConfig};
/// Builds an `ExtractionConfig` with language detection configured and prints
/// the resulting `language_detection` section.
fn main() {
    let config = ExtractionConfig {
        language_detection: Some(LanguageDetectionConfig {
            // These three fields are plain values, not `Option`s.
            enabled: true,
            min_confidence: 0.9,
            detect_multiple: true,
        }),
        // Every other section keeps its default.
        ..Default::default()
    };
    println!("{:?}", config.language_detection);
}
PostProcessorConfig¶
Configuration for post-processing pipeline.
| Field | Type | Default | Description |
|---|---|---|---|
| `enabled` | `bool` | `true` | Enable post-processing pipeline |
| `enabled_processors` | `list[str]?` | `None` | Specific processors to enable (if `None`, all enabled) |
| `disabled_processors` | `list[str]?` | `None` | Specific processors to disable |
Example¶
using Kreuzberg;
// Post-processing is enabled with an explicit list of processors.
var postprocessor = new PostProcessorConfig
{
    Enabled = true,
    EnabledProcessors = new List<string> { "deduplication" }
};
var config = new ExtractionConfig { Postprocessor = postprocessor };

var result = await KreuzbergClient.ExtractFileAsync("document.pdf", config);

// Print at most the first 100 characters of the extracted content.
var previewLength = Math.Min(100, result.Content.Length);
Console.WriteLine($"Content: {result.Content[..previewLength]}");
package main
import "github.com/kreuzberg-dev/kreuzberg/packages/go/kreuzberg"
// main builds an ExtractionConfig with explicit enabled and disabled
// post-processor lists. The config is constructed but not used for extraction.
func main() {
	postprocessor := &kreuzberg.PostProcessorConfig{
		Enabled:            kreuzberg.BoolPtr(true),
		EnabledProcessors:  []string{"deduplication", "whitespace_normalization"},
		DisabledProcessors: []string{"mojibake_fix"},
	}
	cfg := &kreuzberg.ExtractionConfig{Postprocessor: postprocessor}

	// Discard the value so the example compiles without an unused variable.
	_ = cfg
}
import dev.kreuzberg.config.ExtractionConfig;
import dev.kreuzberg.config.PostProcessorConfig;
import java.util.Arrays;
// Build the post-processor settings first, then attach them to the extraction config.
PostProcessorConfig postprocessor = PostProcessorConfig.builder()
    .enabled(true)
    .enabledProcessors(Arrays.asList("deduplication", "whitespace_normalization"))
    .disabledProcessors(Arrays.asList("mojibake_fix"))
    .build();

ExtractionConfig config = ExtractionConfig.builder()
    .postprocessor(postprocessor)
    .build();
import asyncio
from kreuzberg import ExtractionConfig, PostProcessorConfig, extract_file
async def main() -> None:
    """Extract document.pdf with the "deduplication" post-processor enabled and print a preview."""
    postprocessor = PostProcessorConfig(
        enabled=True,
        enabled_processors=["deduplication"],
    )
    config = ExtractionConfig(postprocessor=postprocessor)
    result = await extract_file("document.pdf", config=config)
    preview = result.content[:100]  # first 100 characters only
    print(f"Content: {preview}")


asyncio.run(main())
use kreuzberg::{ExtractionConfig, PostProcessorConfig};
/// Builds an `ExtractionConfig` with explicit enabled and disabled
/// post-processor lists and prints the resulting `postprocessor` section.
fn main() {
    let config = ExtractionConfig {
        postprocessor: Some(PostProcessorConfig {
            // `enabled` is a plain `bool`; only the processor lists are optional.
            enabled: true,
            enabled_processors: Some(vec![
                "deduplication".to_string(),
                "whitespace_normalization".to_string(),
            ]),
            disabled_processors: Some(vec!["mojibake_fix".to_string()]),
            // Any field not listed above takes its default value.
            ..Default::default()
        }),
        ..Default::default()
    };
    println!("{:?}", config.postprocessor);
}
import { extractFile } from '@kreuzberg/node';
// Post-processing is enabled with explicit enabled and disabled processor lists.
const enabledProcessors = ['deduplication', 'whitespace_normalization'];
const disabledProcessors = ['mojibake_fix'];
const config = {
  postprocessor: { enabled: true, enabledProcessors, disabledProcessors },
};

const result = await extractFile('document.pdf', null, config);
console.log(result.content);
Complete Example¶
Here's a complete example showing all configuration options together:
using Kreuzberg;
// OCR: Tesseract backend, English + French, page segmentation mode 3.
var ocr = new OcrConfig
{
    Backend = "tesseract",
    Language = "eng+fra",
    TesseractConfig = new TesseractConfig { Psm = 3 }
};

var pdfOptions = new PdfConfig { ExtractImages = true };

// Chunking: 1000-character chunks, 200-character overlap, preset embedding model.
var chunking = new ChunkingConfig
{
    MaxChars = 1000,
    MaxOverlap = 200,
    Embedding = new EmbeddingConfig
    {
        Model = EmbeddingModelType.Preset("all-MiniLM-L6-v2")
    }
};

var config = new ExtractionConfig
{
    UseCache = true,
    EnableQualityProcessing = true,
    Ocr = ocr,
    PdfOptions = pdfOptions,
    Chunking = chunking
};

var result = await KreuzbergClient.ExtractFileAsync("document.pdf", config);

// Print at most the first 100 characters of the extracted content.
var previewLength = Math.Min(100, result.Content.Length);
Console.WriteLine($"Content: {result.Content[..previewLength]}");
package main
import (
"fmt"
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/kreuzberg"
)
// main builds an ExtractionConfig with every section populated, extracts
// document.pdf with it, and prints the length of the extracted content.
func main() {
	config := &kreuzberg.ExtractionConfig{
		UseCache:                kreuzberg.BoolPtr(true),
		EnableQualityProcessing: kreuzberg.BoolPtr(true),
		ForceOCR:                kreuzberg.BoolPtr(false),
		OCR: &kreuzberg.OCRConfig{
			Backend:  "tesseract",
			Language: kreuzberg.StringPtr("eng+fra"),
			Tesseract: &kreuzberg.TesseractConfig{
				PSM:                  kreuzberg.IntPtr(3),
				OEM:                  kreuzberg.IntPtr(3),
				MinConfidence:        kreuzberg.FloatPtr(0.8),
				EnableTableDetection: kreuzberg.BoolPtr(true),
			},
		},
		PdfOptions: &kreuzberg.PdfConfig{
			ExtractImages:   kreuzberg.BoolPtr(true),
			ExtractMetadata: kreuzberg.BoolPtr(true),
		},
		Images: &kreuzberg.ImageExtractionConfig{
			ExtractImages:     kreuzberg.BoolPtr(true),
			TargetDPI:         kreuzberg.IntPtr(150),
			MaxImageDimension: kreuzberg.IntPtr(4096),
		},
		Chunking: &kreuzberg.ChunkingConfig{
			MaxChars:   kreuzberg.IntPtr(1000),
			MaxOverlap: kreuzberg.IntPtr(200),
		},
		TokenReduction: &kreuzberg.TokenReductionConfig{
			Mode:                   "moderate",
			PreserveImportantWords: kreuzberg.BoolPtr(true),
		},
		LanguageDetection: &kreuzberg.LanguageDetectionConfig{
			Enabled:        kreuzberg.BoolPtr(true),
			MinConfidence:  kreuzberg.FloatPtr(0.8),
			DetectMultiple: kreuzberg.BoolPtr(false),
		},
		// Post-processing is set explicitly so this example covers the same
		// sections as the other language variants.
		Postprocessor: &kreuzberg.PostProcessorConfig{
			Enabled: kreuzberg.BoolPtr(true),
		},
	}

	result, err := kreuzberg.ExtractFileSync("document.pdf", config)
	if err != nil {
		log.Fatalf("extract failed: %v", err)
	}
	fmt.Printf("Extracted content length: %d\n", len(result.Content))
}
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.config.*;
// Build each section on its own, then assemble the top-level configuration.
OcrConfig ocr = OcrConfig.builder()
    .backend("tesseract")
    .language("eng+fra")
    .build();

PdfConfig pdfOptions = PdfConfig.builder()
    .extractImages(true)
    .extractMetadata(true)
    .build();

ImageExtractionConfig imageExtraction = ImageExtractionConfig.builder()
    .extractImages(true)
    .targetDpi(150)
    .maxImageDimension(4096)
    .build();

ImagePreprocessingConfig imagePreprocessing = ImagePreprocessingConfig.builder()
    .targetDpi(300)
    .denoise(true)
    .deskew(true)
    .contrastEnhance(true)
    .build();

ChunkingConfig chunking = ChunkingConfig.builder()
    .maxChars(1000)
    .maxOverlap(200)
    .build();

TokenReductionConfig tokenReduction = TokenReductionConfig.builder()
    .mode("moderate")
    .preserveImportantWords(true)
    .build();

LanguageDetectionConfig languageDetection = LanguageDetectionConfig.builder()
    .enabled(true)
    .minConfidence(0.8)
    .build();

PostProcessorConfig postprocessor = PostProcessorConfig.builder()
    .enabled(true)
    .build();

ExtractionConfig config = ExtractionConfig.builder()
    .useCache(true)
    .enableQualityProcessing(true)
    .forceOcr(false)
    .ocr(ocr)
    .pdfOptions(pdfOptions)
    .imageExtraction(imageExtraction)
    .imagePreprocessing(imagePreprocessing)
    .chunking(chunking)
    .tokenReduction(tokenReduction)
    .languageDetection(languageDetection)
    .postprocessor(postprocessor)
    .build();

ExtractionResult result = Kreuzberg.extractFile("document.pdf", config);
import asyncio
from kreuzberg import (
extract_file,
ExtractionConfig,
OcrConfig,
TesseractConfig,
PdfConfig,
ChunkingConfig,
EmbeddingConfig,
EmbeddingModelType,
)
async def main() -> None:
    """Extract document.pdf with OCR, PDF, and chunking options set, then print a 100-character preview."""
    ocr = OcrConfig(
        backend="tesseract",
        language="eng+fra",
        tesseract_config=TesseractConfig(psm=3),
    )
    pdf_options = PdfConfig(extract_images=True)
    embedding = EmbeddingConfig(model=EmbeddingModelType.preset("all-MiniLM-L6-v2"))
    chunking = ChunkingConfig(max_chars=1000, max_overlap=200, embedding=embedding)

    config = ExtractionConfig(
        use_cache=True,
        enable_quality_processing=True,
        ocr=ocr,
        pdf_options=pdf_options,
        chunking=chunking,
    )
    result = await extract_file("document.pdf", config=config)
    preview = result.content[:100]  # first 100 characters only
    print(f"Content: {preview}")


asyncio.run(main())
require 'kreuzberg'
# Build each section on its own, then assemble the top-level extraction config.
ocr = Kreuzberg::Config::OCR.new(backend: 'tesseract', language: 'eng+fra')
pdf_options = Kreuzberg::Config::PDF.new(extract_images: true, extract_metadata: true)
image_extraction = Kreuzberg::Config::ImageExtraction.new(
  extract_images: true,
  target_dpi: 150,
  max_image_dimension: 4096
)
chunking = Kreuzberg::Config::Chunking.new(max_chars: 1000, max_overlap: 200)
token_reduction = Kreuzberg::Config::TokenReduction.new(mode: 'moderate')
language_detection = Kreuzberg::Config::LanguageDetection.new(enabled: true, min_confidence: 0.8)
postprocessor = Kreuzberg::Config::PostProcessor.new(enabled: true)

config = Kreuzberg::Config::Extraction.new(
  use_cache: true,
  enable_quality_processing: true,
  force_ocr: false,
  ocr: ocr,
  pdf_options: pdf_options,
  image_extraction: image_extraction,
  chunking: chunking,
  token_reduction: token_reduction,
  language_detection: language_detection,
  postprocessor: postprocessor
)

result = Kreuzberg.extract_file_sync('document.pdf', config: config)
puts "Content length: #{result.content.length}"
use kreuzberg::{
extract_file, ExtractionConfig, OcrConfig, TesseractConfig, ImagePreprocessingConfig,
PdfConfig, ImageExtractionConfig, ChunkingConfig, TokenReductionConfig,
LanguageDetectionConfig, PostProcessorConfig,
};
/// Builds an `ExtractionConfig` with every section populated, extracts
/// `document.pdf` with it, and prints the length of the extracted content.
#[tokio::main]
async fn main() -> kreuzberg::Result<()> {
    // Preprocessing settings nested under the Tesseract configuration.
    let preprocessing = ImagePreprocessingConfig {
        target_dpi: 300,
        denoise: true,
        deskew: true,
        contrast_enhance: true,
        ..Default::default()
    };
    let tesseract = TesseractConfig {
        psm: 3,
        oem: 3,
        min_confidence: 0.8,
        preprocessing: Some(preprocessing),
        enable_table_detection: true,
        ..Default::default()
    };
    let ocr = OcrConfig {
        backend: String::from("tesseract"),
        language: String::from("eng+fra"),
        tesseract_config: Some(tesseract),
    };

    let pdf_options = PdfConfig {
        extract_images: true,
        extract_metadata: true,
        ..Default::default()
    };
    let images = ImageExtractionConfig {
        extract_images: true,
        target_dpi: 150,
        max_image_dimension: 4096,
        ..Default::default()
    };
    let chunking = ChunkingConfig {
        max_chars: 1000,
        max_overlap: 200,
        ..Default::default()
    };
    let token_reduction = TokenReductionConfig {
        mode: String::from("moderate"),
        preserve_important_words: true,
    };
    let language_detection = LanguageDetectionConfig {
        enabled: true,
        min_confidence: 0.8,
        detect_multiple: false,
    };
    let postprocessor = PostProcessorConfig {
        enabled: true,
        ..Default::default()
    };

    // Assemble the top-level configuration from the sections above.
    let config = ExtractionConfig {
        use_cache: true,
        enable_quality_processing: true,
        force_ocr: false,
        ocr: Some(ocr),
        pdf_options: Some(pdf_options),
        images: Some(images),
        chunking: Some(chunking),
        token_reduction: Some(token_reduction),
        language_detection: Some(language_detection),
        postprocessor: Some(postprocessor),
        ..Default::default()
    };

    let result = extract_file("document.pdf", None, &config).await?;
    println!("Extracted content length: {}", result.content.len());
    Ok(())
}
# Top-level extraction flags.
use_cache = true
enable_quality_processing = true
force_ocr = false
# OCR backend settings.
[ocr]
backend = "tesseract"
language = "eng+fra"
# Tesseract options, nested under the OCR section.
[ocr.tesseract_config]
psm = 3
oem = 3
min_confidence = 0.8
enable_table_detection = true
# Image preprocessing, nested under the Tesseract options.
[ocr.tesseract_config.preprocessing]
target_dpi = 300
denoise = true
deskew = true
contrast_enhance = true
# PDF options.
[pdf_options]
extract_images = true
extract_metadata = true
# Image extraction settings.
[images]
extract_images = true
target_dpi = 150
max_image_dimension = 4096
# Text chunking.
[chunking]
max_chars = 1000
max_overlap = 200
# Embedding settings, nested under chunking.
[chunking.embedding]
batch_size = 32
# Token reduction.
[token_reduction]
mode = "moderate"
preserve_important_words = true
# Language detection.
[language_detection]
enabled = true
min_confidence = 0.8
detect_multiple = false
# Post-processing pipeline.
[postprocessor]
enabled = true
import { extractFile } from '@kreuzberg/node';
// Complete configuration with every section populated.
const config = {
  useCache: true,
  enableQualityProcessing: true,
  forceOcr: false,
  // OCR backend settings, including nested Tesseract options.
  ocr: {
    backend: 'tesseract',
    language: 'eng+fra',
    tesseractConfig: {
      psm: 3,
      enableTableDetection: true,
    },
  },
  pdfOptions: {
    extractImages: true,
    extractMetadata: true,
  },
  images: {
    extractImages: true,
    targetDpi: 150,
    // 4096 keeps this example in line with the other language variants.
    maxImageDimension: 4096,
  },
  // Text chunking with a preset embedding configuration.
  chunking: {
    maxChars: 1000,
    maxOverlap: 200,
    embedding: {
      preset: 'balanced',
    },
  },
  tokenReduction: {
    mode: 'moderate',
    preserveImportantWords: true,
  },
  languageDetection: {
    enabled: true,
    minConfidence: 0.8,
    detectMultiple: false,
  },
  postprocessor: {
    enabled: true,
  },
};

const result = await extractFile('document.pdf', null, config);
console.log(`Extracted content length: ${result.content.length}`);