Skip to content

Configuration

graph TD
    ExtractionConfig[ExtractionConfig<br/>Main Configuration]

    ExtractionConfig --> OCR[OcrConfig<br/>OCR Backend Settings]
    ExtractionConfig --> PDF[PdfConfig<br/>PDF Options]
    ExtractionConfig --> Images[ImageExtractionConfig<br/>Image Settings]
    ExtractionConfig --> Chunking[ChunkingConfig<br/>Text Chunking]
    ExtractionConfig --> TokenRed[TokenReductionConfig<br/>Token Optimization]
    ExtractionConfig --> LangDet[LanguageDetectionConfig<br/>Language Detection]
    ExtractionConfig --> PostProc[PostProcessorConfig<br/>Post-Processing]

    OCR --> Tesseract[TesseractConfig<br/>Tesseract Options]
    Tesseract --> ImgPreproc[ImagePreprocessingConfig<br/>Image Enhancement]

    Chunking --> Embedding[EmbeddingConfig<br/>Vector Embeddings]
    Embedding --> Model[EmbeddingModelType<br/>Model Selection]

    style ExtractionConfig fill:#4CAF50,color:#fff
    style OCR fill:#87CEEB
    style Chunking fill:#FFD700
    style Embedding fill:#FFB6C1

Kreuzberg's behavior is controlled through configuration objects. All settings are optional with sensible defaults, allowing you to configure only what you need.

Configuration Methods

Kreuzberg supports four ways to configure extraction: programmatically in code, or declaratively through a TOML, YAML, or JSON configuration file.

C#
using Kreuzberg;

var config = new ExtractionConfig();
var result = await KreuzbergClient.ExtractFileAsync("document.pdf", config);

Console.WriteLine(result.Content[..Math.Min(100, result.Content.Length)]);
Console.WriteLine($"Total length: {result.Content.Length}");
Go
package main

import (
    "log"

    "github.com/kreuzberg-dev/kreuzberg/packages/go/kreuzberg"
)

func main() {
    config, err := kreuzberg.LoadExtractionConfigFromFile("")
    if err != nil {
        log.Fatalf("discover config failed: %v", err)
    }

    result, err := kreuzberg.ExtractFileSync("document.pdf", config)
    if err != nil {
        log.Fatalf("extract failed: %v", err)
    }

    log.Printf("Content length: %d", len(result.Content))
}
Java
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.config.ExtractionConfig;

ExtractionConfig config = Kreuzberg.discoverExtractionConfig();
ExtractionResult result = Kreuzberg.extractFile("document.pdf", config);
Python
import asyncio
from kreuzberg import ExtractionConfig, extract_file

async def main() -> None:
    config: ExtractionConfig = ExtractionConfig()
    result = await extract_file("document.pdf", config=config)

    content: str = result.content
    content_preview: str = content[:100]

    print(f"Content preview: {content_preview}")
    print(f"Total length: {len(content)}")

asyncio.run(main())
Ruby
require 'kreuzberg'

config = Kreuzberg::Config::Extraction.discover
result = Kreuzberg.extract_file_sync('document.pdf', config: config)
Rust
use kreuzberg::{extract_file, ExtractionConfig};

#[tokio::main]
async fn main() -> kreuzberg::Result<()> {
    let config = ExtractionConfig::discover()?;
    let result = extract_file("document.pdf", None, &config).await?;
    println!("{}", result.content);
    Ok(())
}
TypeScript
import { extractFile, ExtractionConfig } from '@kreuzberg/node';

const config = ExtractionConfig.discover();
const result = await extractFile('document.pdf', null, config);
console.log(result.content);
WASM
import { initWasm, extractBytes } from '@kreuzberg/wasm';

await initWasm();

const config = {
  use_cache: true,
  enable_quality_processing: true,
  ocr: {
    backend: 'tesseract-wasm',
    language: 'eng'
  }
};

const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, 'application/pdf', config);
console.log(result.content);
kreuzberg.toml
use_cache = true
enable_quality_processing = true

[ocr]
backend = "tesseract"
language = "eng"

[ocr.tesseract_config]
psm = 3
# kreuzberg.yaml
use_cache: true
enable_quality_processing: true

ocr:
  backend: tesseract
  language: eng
  tesseract_config:
    psm: 3
{
  "use_cache": true,
  "enable_quality_processing": true,
  "ocr": {
    "backend": "tesseract",
    "language": "eng",
    "tesseract_config": {
      "psm": 3
    }
  }
}

Configuration Discovery

flowchart TD
    Start[ExtractionConfig.discover] --> Current{Check Current Directory}

    Current -->|Found| LoadCurrent[Load ./kreuzberg.*]
    Current -->|Not Found| User{Check User Config}

    User -->|Found| LoadUser[Load ~/.config/kreuzberg/config.*]
    User -->|Not Found| System{Check System Config}

    System -->|Found| LoadSystem[Load /etc/kreuzberg/config.*]
    System -->|Not Found| Default[Use Default Config]

    LoadCurrent --> Merge[Merge with Defaults]
    LoadUser --> Merge
    LoadSystem --> Merge
    Default --> Return[Return Config]

    Merge --> Return

    style LoadCurrent fill:#90EE90
    style LoadUser fill:#87CEEB
    style LoadSystem fill:#FFD700
    style Default fill:#FFB6C1

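Kreuzberg automatically searches for a configuration file in the following locations, in order. The first file found is loaded and merged with the defaults; if none is found, the default configuration is used: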

  1. Current directory: ./kreuzberg.{toml,yaml,yml,json}
  2. User config: ~/.config/kreuzberg/config.{toml,yaml,yml,json}
  3. System config: /etc/kreuzberg/config.{toml,yaml,yml,json}
C#
using Kreuzberg;

var config = new ExtractionConfig();
var result = await KreuzbergClient.ExtractFileAsync("document.pdf", config);

Console.WriteLine(result.Content[..Math.Min(100, result.Content.Length)]);
Console.WriteLine($"Total length: {result.Content.Length}");
Go
package main

import (
    "log"

    "github.com/kreuzberg-dev/kreuzberg/packages/go/kreuzberg"
)

func main() {
    config, err := kreuzberg.LoadExtractionConfigFromFile("")
    if err != nil {
        log.Fatalf("discover config failed: %v", err)
    }

    result, err := kreuzberg.ExtractFileSync("document.pdf", config)
    if err != nil {
        log.Fatalf("extract failed: %v", err)
    }

    log.Printf("Content length: %d", len(result.Content))
}
Java
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.config.ExtractionConfig;

ExtractionConfig config = Kreuzberg.discoverExtractionConfig();
ExtractionResult result = Kreuzberg.extractFile("document.pdf", config);
Python
import asyncio
from kreuzberg import ExtractionConfig, extract_file

async def main() -> None:
    config: ExtractionConfig = ExtractionConfig()
    result = await extract_file("document.pdf", config=config)

    content: str = result.content
    content_preview: str = content[:100]

    print(f"Content preview: {content_preview}")
    print(f"Total length: {len(content)}")

asyncio.run(main())
Ruby
require 'kreuzberg'

config = Kreuzberg::Config::Extraction.discover
result = Kreuzberg.extract_file_sync('document.pdf', config: config)
Rust
use kreuzberg::{extract_file, ExtractionConfig};

#[tokio::main]
async fn main() -> kreuzberg::Result<()> {
    let config = ExtractionConfig::discover()?;
    let result = extract_file("document.pdf", None, &config).await?;
    println!("{}", result.content);
    Ok(())
}
TypeScript
import { extractFile, ExtractionConfig } from '@kreuzberg/node';

const config = ExtractionConfig.discover();
const result = await extractFile('document.pdf', null, config);
console.log(result.content);
WASM
import { initWasm, extractBytes } from '@kreuzberg/wasm';

await initWasm();

const config = {
  use_cache: true,
  enable_quality_processing: true,
  ocr: {
    backend: 'tesseract-wasm',
    language: 'eng'
  }
};

const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, 'application/pdf', config);
console.log(result.content);

ExtractionConfig

The main configuration object controlling extraction behavior.

Field Type Default Description
use_cache bool true Enable caching of extraction results
enable_quality_processing bool true Enable quality post-processing
force_ocr bool false Force OCR even for text-based PDFs
ocr OcrConfig? None OCR configuration (if None, OCR disabled)
pdf_options PdfConfig? None PDF-specific configuration
images ImageExtractionConfig? None Image extraction configuration
chunking ChunkingConfig? None Text chunking configuration
token_reduction TokenReductionConfig? None Token reduction configuration
language_detection LanguageDetectionConfig? None Language detection configuration
keywords KeywordConfig? None Keyword extraction configuration (requires keywords-yake or keywords-rake feature flag)
postprocessor PostProcessorConfig? None Post-processing pipeline configuration

Basic Example

C#
using Kreuzberg;

var config = new ExtractionConfig
{
    UseCache = true,
    EnableQualityProcessing = true
};

var result = await KreuzbergClient.ExtractFileAsync("document.pdf", config);
Console.WriteLine(result.Content);
Go
package main

import (
    "log"

    "github.com/kreuzberg-dev/kreuzberg/packages/go/kreuzberg"
)

func main() {
    useCache := true
    enableQP := true

    result, err := kreuzberg.ExtractFileSync("document.pdf", &kreuzberg.ExtractionConfig{
        UseCache:                &useCache,
        EnableQualityProcessing: &enableQP,
    })
    if err != nil {
        log.Fatalf("extract failed: %v", err)
    }

    log.Println("content length:", len(result.Content))
}
Java
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.config.ExtractionConfig;

ExtractionConfig config = ExtractionConfig.builder()
    .useCache(true)
    .enableQualityProcessing(true)
    .build();
ExtractionResult result = Kreuzberg.extractFile("document.pdf", config);
Python
import asyncio
from kreuzberg import extract_file, ExtractionConfig

async def main() -> None:
    config = ExtractionConfig(
        use_cache=True,
        enable_quality_processing=True
    )
    result = await extract_file("document.pdf", config=config)
    print(result.content)

asyncio.run(main())
Ruby
require 'kreuzberg'

config = Kreuzberg::Config::Extraction.new(
  use_cache: true,
  enable_quality_processing: true
)

result = Kreuzberg.extract_file_sync('document.pdf', config: config)
Rust
use kreuzberg::{extract_file, ExtractionConfig};

#[tokio::main]
async fn main() -> kreuzberg::Result<()> {
    let config = ExtractionConfig {
        use_cache: true,
        enable_quality_processing: true,
        ..Default::default()
    };

    let result = extract_file("document.pdf", None, &config).await?;
    println!("{}", result.content);
    Ok(())
}
TypeScript
import { extractFile } from '@kreuzberg/node';

const config = {
    useCache: true,
    enableQualityProcessing: true,
};

const result = await extractFile('document.pdf', null, config);
console.log(result.content);
WASM
import { initWasm, extractBytes } from '@kreuzberg/wasm';

await initWasm();

const config = {
  ocr: {
    backend: 'tesseract-wasm',
    language: 'eng'
  },
  images: {
    extractImages: true
  }
};

const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, 'application/pdf', config);
console.log(result.content);

OcrConfig

Configuration for OCR processing. Set the ocr field of ExtractionConfig to an OcrConfig to enable OCR on images and scanned PDFs; when it is left as None, OCR is disabled.

Field Type Default Description
backend str "tesseract" OCR backend: "tesseract", "easyocr", "paddleocr"
language str "eng" Language code(s), e.g., "eng", "eng+fra"
tesseract_config TesseractConfig? None Tesseract-specific configuration

Example

C#
using Kreuzberg;

var config = new ExtractionConfig
{
    Ocr = new OcrConfig
    {
        Backend = "tesseract",
        Language = "eng+fra",
        TesseractConfig = new TesseractConfig { Psm = 3 }
    }
};

var result = await KreuzbergClient.ExtractFileAsync("document.pdf", config);
Console.WriteLine(result.Content);
Go
package main

import "github.com/kreuzberg-dev/kreuzberg/packages/go/kreuzberg"

func main() {
    language := "eng+fra"
    psm := 3

    _ = &kreuzberg.ExtractionConfig{
        OCR: &kreuzberg.OCRConfig{
            Backend:  "tesseract",
            Language: &language,
            Tesseract: &kreuzberg.TesseractConfig{
                PSM: &psm,
            },
        },
    }
}
Java
import dev.kreuzberg.config.ExtractionConfig;
import dev.kreuzberg.config.OcrConfig;
import dev.kreuzberg.config.TesseractConfig;

ExtractionConfig config = ExtractionConfig.builder()
    .ocr(OcrConfig.builder()
        .backend("tesseract")
        .language("eng+fra")
        .tesseractConfig(TesseractConfig.builder()
            .psm(3)
            .build())
        .build())
    .build();
Python
import asyncio
from kreuzberg import ExtractionConfig, OcrConfig, TesseractConfig, extract_file

async def main() -> None:
    config: ExtractionConfig = ExtractionConfig(
        ocr=OcrConfig(
            backend="tesseract", language="eng+fra",
            tesseract_config=TesseractConfig(psm=3)
        )
    )
    result = await extract_file("document.pdf", config=config)
    print(result.content)

asyncio.run(main())
Ruby
require 'kreuzberg'

config = Kreuzberg::Config::Extraction.new(
  ocr: Kreuzberg::Config::OCR.new(
    backend: 'tesseract',
    language: 'eng+fra',
    tesseract_config: Kreuzberg::Config::Tesseract.new(psm: 3)
  )
)
Rust
use kreuzberg::{ExtractionConfig, OcrConfig, TesseractConfig};

fn main() {
    let config = ExtractionConfig {
        ocr: Some(OcrConfig {
            backend: "tesseract".to_string(),
            language: Some("eng+fra".to_string()),
            tesseract_config: Some(TesseractConfig {
                psm: 3,
                ..Default::default()
            }),
        }),
        ..Default::default()
    };
}
TypeScript
import { extractFile } from '@kreuzberg/node';

const config = {
    ocr: {
        backend: 'tesseract',
        language: 'eng+fra',
        tesseractConfig: {
            psm: 3,
        },
    },
};

const result = await extractFile('document.pdf', null, config);
console.log(result.content);

TesseractConfig

Tesseract OCR engine configuration.

Field Type Default Description
language str "eng" Language code(s), e.g., "eng", "eng+fra"
psm int 3 Page segmentation mode (0-13)
output_format str "text" Output format: "text", "hocr"
oem int 3 OCR engine mode (0-3)
min_confidence float 0.0 Minimum confidence threshold (0.0-1.0)
preprocessing ImagePreprocessingConfig? None Image preprocessing configuration
enable_table_detection bool false Enable table detection and extraction
table_min_confidence float 0.5 Minimum confidence for table cells
table_column_threshold int 50 Pixel threshold for column detection
table_row_threshold_ratio float 0.5 Row threshold ratio
use_cache bool true Enable OCR result caching
classify_use_pre_adapted_templates bool false Tesseract variable
language_model_ngram_on bool false Tesseract variable
tessedit_dont_blkrej_good_wds bool false Tesseract variable
tessedit_dont_rowrej_good_wds bool false Tesseract variable
tessedit_enable_dict_correction bool false Tesseract variable
tessedit_char_whitelist str "" Allowed characters
tessedit_char_blacklist str "" Disallowed characters
tessedit_use_primary_params_model bool false Tesseract variable
textord_space_size_is_variable bool false Tesseract variable
thresholding_method bool false Tesseract variable

Page Segmentation Modes (PSM)

  • 0: Orientation and script detection only
  • 1: Automatic page segmentation with OSD
  • 2: Automatic page segmentation (no OSD, no OCR)
  • 3: Fully automatic page segmentation (default)
  • 4: Single column of text
  • 5: Single uniform block of vertically aligned text
  • 6: Single uniform block of text
  • 7: Single text line
  • 8: Single word
  • 9: Single word in a circle
  • 10: Single character
  • 11: Sparse text, no particular order
  • 12: Sparse text with OSD
  • 13: Raw line (no assumptions about text layout)

OCR Engine Modes (OEM)

  • 0: Legacy engine only
  • 1: Neural nets LSTM engine only
  • 2: Legacy + LSTM engines
  • 3: Default based on what's available (default)

Example

C#
using Kreuzberg;

var config = new ExtractionConfig
{
    Ocr = new OcrConfig
    {
        Language = "eng+fra+deu",
        TesseractConfig = new TesseractConfig
        {
            Psm = 6,
            Oem = 1,
            MinConfidence = 0.8m,
            EnableTableDetection = true
        }
    }
};

var result = await KreuzbergClient.ExtractFileAsync("document.pdf", config);
Console.WriteLine($"Content: {result.Content[..Math.Min(100, result.Content.Length)]}");
Go
package main

import (
    "log"

    "github.com/kreuzberg-dev/kreuzberg/packages/go/kreuzberg"
)

func main() {
    psm := 6
    oem := 1
    minConf := 0.8
    lang := "eng+fra+deu"
    whitelist := "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789 .,!?"

    config := &kreuzberg.ExtractionConfig{
        OCR: &kreuzberg.OCRConfig{
            Backend:  "tesseract",
            Language: &lang,
            Tesseract: &kreuzberg.TesseractConfig{
                PSM:              &psm,
                OEM:              &oem,
                MinConfidence:    &minConf,
                EnableTableDetection: kreuzberg.BoolPtr(true),
                TesseditCharWhitelist: whitelist,
            },
        },
    }

    result, err := kreuzberg.ExtractFileSync("document.pdf", config)
    if err != nil {
        log.Fatalf("extract failed: %v", err)
    }

    log.Println("content length:", len(result.Content))
}
Java
import dev.kreuzberg.config.ExtractionConfig;
import dev.kreuzberg.config.OcrConfig;
import dev.kreuzberg.config.TesseractConfig;

ExtractionConfig config = ExtractionConfig.builder()
    .ocr(OcrConfig.builder()
        .language("eng+fra+deu")
        .tesseractConfig(TesseractConfig.builder()
            .psm(6)
            .oem(1)
            .minConfidence(0.8)
            .tesseditCharWhitelist("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789 .,!?")
            .enableTableDetection(true)
            .build())
        .build())
    .build();
Python
import asyncio
from kreuzberg import ExtractionConfig, OcrConfig, TesseractConfig, extract_file

async def main() -> None:
    config: ExtractionConfig = ExtractionConfig(
        ocr=OcrConfig(
            language="eng+fra+deu",
            tesseract_config=TesseractConfig(
                psm=6,
                oem=1,
                min_confidence=0.8,
                enable_table_detection=True,
            ),
        )
    )
    result = await extract_file("document.pdf", config=config)
    print(f"Content: {result.content[:100]}")

asyncio.run(main())
Ruby
require 'kreuzberg'

config = Kreuzberg::Config::Extraction.new(
  ocr: Kreuzberg::Config::OCR.new(
    language: 'eng+fra+deu',
    tesseract_config: Kreuzberg::Config::Tesseract.new(
      psm: 6,
      oem: 1,
      min_confidence: 0.8,
      tessedit_char_whitelist: 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789 .,!?',
      enable_table_detection: true
    )
  )
)
Rust
use kreuzberg::{ExtractionConfig, OcrConfig, TesseractConfig};

fn main() {
    let config = ExtractionConfig {
        ocr: Some(OcrConfig {
            language: Some("eng+fra+deu".to_string()),
            tesseract_config: Some(TesseractConfig {
                psm: Some(6),
                oem: Some(1),
                min_confidence: Some(0.8),
                tessedit_char_whitelist: Some("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789 .,!?".to_string()),
                enable_table_detection: Some(true),
                ..Default::default()
            }),
            ..Default::default()
        }),
        ..Default::default()
    };
    println!("{:?}", config.ocr);
}
TypeScript
import { extractFile } from '@kreuzberg/node';

const config = {
    ocr: {
        backend: 'tesseract',
        language: 'eng+fra+deu',
        tesseractConfig: {
            psm: 6,
            tesseditCharWhitelist: 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789 .,!?',
            enableTableDetection: true,
        },
    },
};

const result = await extractFile('document.pdf', null, config);
console.log(result.content);

ImagePreprocessingConfig

Image preprocessing configuration for OCR.

Field Type Default Description
target_dpi int 300 Target DPI for OCR processing
auto_rotate bool true Automatically rotate images based on orientation
deskew bool true Apply deskewing to straighten tilted text
denoise bool true Apply denoising filter
contrast_enhance bool true Enhance image contrast
binarization_method str "otsu" Binarization method: "otsu", "adaptive", "none"
invert_colors bool false Invert image colors (useful for white-on-black text)

Example

C#
using Kreuzberg;

var config = new ExtractionConfig
{
    Ocr = new OcrConfig
    {
        TesseractConfig = new TesseractConfig
        {
            Preprocessing = new ImagePreprocessingConfig
            {
                TargetDpi = 300,
                Denoise = true,
                Deskew = true,
                ContrastEnhance = true,
                BinarizationMethod = "otsu"
            }
        }
    }
};

var result = await KreuzbergClient.ExtractFileAsync("scanned.pdf", config);
Console.WriteLine($"Content: {result.Content[..Math.Min(100, result.Content.Length)]}");
Go
package main

import (
    "log"

    "github.com/kreuzberg-dev/kreuzberg/packages/go/kreuzberg"
)

func main() {
    targetDPI := 300
    config := &kreuzberg.ExtractionConfig{
        OCR: &kreuzberg.OCRConfig{
            Tesseract: &kreuzberg.TesseractConfig{
                Preprocessing: &kreuzberg.ImagePreprocessingConfig{
                    TargetDPI:         &targetDPI,
                    Denoise:           kreuzberg.BoolPtr(true),
                    Deskew:            kreuzberg.BoolPtr(true),
                    ContrastEnhance:   kreuzberg.BoolPtr(true),
                    BinarizationMode:  kreuzberg.StringPtr("otsu"),
                },
            },
        },
    }

    result, err := kreuzberg.ExtractFileSync("document.pdf", config)
    if err != nil {
        log.Fatalf("extract failed: %v", err)
    }

    log.Println("content length:", len(result.Content))
}
Java
import dev.kreuzberg.config.ExtractionConfig;
import dev.kreuzberg.config.ImagePreprocessingConfig;
import dev.kreuzberg.config.OcrConfig;
import dev.kreuzberg.config.TesseractConfig;

ExtractionConfig config = ExtractionConfig.builder()
    .ocr(OcrConfig.builder()
        .tesseractConfig(TesseractConfig.builder()
            .preprocessing(ImagePreprocessingConfig.builder()
                .targetDpi(300)
                .denoise(true)
                .deskew(true)
                .contrastEnhance(true)
                .binarizationMethod("otsu")
                .build())
            .build())
        .build())
    .build();
Python
import asyncio
from kreuzberg import (
    ExtractionConfig,
    OcrConfig,
    TesseractConfig,
    ImagePreprocessingConfig,
    extract_file,
)

async def main() -> None:
    config: ExtractionConfig = ExtractionConfig(
        ocr=OcrConfig(
            tesseract_config=TesseractConfig(
                preprocessing=ImagePreprocessingConfig(
                    target_dpi=300,
                    denoise=True,
                    deskew=True,
                    contrast_enhance=True,
                    binarization_method="otsu",
                )
            )
        )
    )
    result = await extract_file("scanned.pdf", config=config)
    print(f"Content: {result.content[:100]}")

asyncio.run(main())
Ruby
require 'kreuzberg'

config = Kreuzberg::Config::Extraction.new(
  ocr: Kreuzberg::Config::OCR.new(
    tesseract_config: Kreuzberg::Config::Tesseract.new(
      preprocessing: Kreuzberg::Config::ImagePreprocessing.new(
        target_dpi: 300,
        denoise: true,
        deskew: true,
        contrast_enhance: true,
        binarization_method: 'otsu'
      )
    )
  )
)
Rust
use kreuzberg::{ExtractionConfig, ImagePreprocessingConfig, OcrConfig, TesseractConfig};

fn main() {
    let config = ExtractionConfig {
        ocr: Some(OcrConfig {
            tesseract_config: Some(TesseractConfig {
                preprocessing: Some(ImagePreprocessingConfig {
                    target_dpi: Some(300),
                    denoise: Some(true),
                    deskew: Some(true),
                    contrast_enhance: Some(true),
                    binarization_method: Some("otsu".to_string()),
                    ..Default::default()
                }),
                ..Default::default()
            }),
            ..Default::default()
        }),
        ..Default::default()
    };

    println!("{:?}", config.ocr);
}
TypeScript
import { extractFile } from '@kreuzberg/node';

// Image preprocessing options are nested under the Tesseract configuration.
// The values mirror the other language examples in this section.
const config = {
    ocr: {
        backend: 'tesseract',
        tesseractConfig: {
            preprocessing: {
                targetDpi: 300,
                denoise: true,
                deskew: true,
                contrastEnhance: true,
                binarizationMethod: 'otsu',
            },
        },
    },
};

const result = await extractFile('document.pdf', null, config);
console.log(result.content);

PdfConfig

PDF-specific extraction configuration.

Field Type Default Description
extract_images bool true Extract embedded images from PDF
extract_metadata bool true Extract PDF metadata (title, author, etc.)
passwords list[str]? None List of passwords to try for encrypted PDFs

Example

C#
using Kreuzberg;

var config = new ExtractionConfig
{
    PdfOptions = new PdfConfig
    {
        ExtractImages = true,
        ExtractMetadata = true,
        Passwords = new List<string> { "password1", "password2" }
    }
};

var result = await KreuzbergClient.ExtractFileAsync("document.pdf", config);
Console.WriteLine($"Content: {result.Content[..Math.Min(100, result.Content.Length)]}");
Go
package main

import (
    "log"

    "github.com/kreuzberg-dev/kreuzberg/packages/go/kreuzberg"
)

func main() {
    pw := []string{"password1", "password2"}
    result, err := kreuzberg.ExtractFileSync("document.pdf", &kreuzberg.ExtractionConfig{
        PdfOptions: &kreuzberg.PdfConfig{
            ExtractImages:   kreuzberg.BoolPtr(true),
            ExtractMetadata: kreuzberg.BoolPtr(true),
            Passwords:       pw,
        },
    })
    if err != nil {
        log.Fatalf("extract failed: %v", err)
    }

    log.Println("content length:", len(result.Content))
}
Java
import dev.kreuzberg.config.ExtractionConfig;
import dev.kreuzberg.config.PdfConfig;
import java.util.Arrays;

ExtractionConfig config = ExtractionConfig.builder()
    .pdfOptions(PdfConfig.builder()
        .extractImages(true)
        .extractMetadata(true)
        .passwords(Arrays.asList("password1", "password2"))
        .build())
    .build();
Python
import asyncio
from kreuzberg import ExtractionConfig, PdfConfig, extract_file

async def main() -> None:
    config: ExtractionConfig = ExtractionConfig(
        pdf_options=PdfConfig(
            extract_images=True,
            extract_metadata=True,
            passwords=["password1", "password2"],
        )
    )
    result = await extract_file("document.pdf", config=config)
    print(f"Content: {result.content[:100]}")

asyncio.run(main())
Ruby
require 'kreuzberg'

config = Kreuzberg::Config::Extraction.new(
  pdf_options: Kreuzberg::Config::PDF.new(
    extract_images: true,
    extract_metadata: true,
    passwords: ['password1', 'password2']
  )
)
Rust
use kreuzberg::{ExtractionConfig, PdfConfig};

fn main() {
    let config = ExtractionConfig {
        pdf_options: Some(PdfConfig {
            extract_images: Some(true),
            extract_metadata: Some(true),
            passwords: Some(vec!["password1".to_string(), "password2".to_string()]),
        }),
        ..Default::default()
    };
    println!("{:?}", config.pdf_options);
}
TypeScript
import { extractFile } from '@kreuzberg/node';

const config = {
    pdfOptions: {
        extractImages: true,
        extractMetadata: true,
        passwords: ['password1', 'password2'],
    },
};

const result = await extractFile('document.pdf', null, config);
console.log(result.content);

Page Configuration

Configure page extraction and boundary tracking.

Overview

Page tracking enables:

  • Per-page content extraction
  • Byte-accurate page boundaries
  • Automatic chunk-to-page mapping
  • Page markers for LLM context

Configuration Options

Field Type Default Description
extract_pages bool false Extract pages array with per-page content
insert_page_markers bool false Insert page markers in combined content
marker_format String "\n\n<!-- PAGE {page_num} -->\n\n" Page marker template

Example Configuration

page_config.cs
var config = new ExtractionConfig
{
    Pages = new PageConfig
    {
        ExtractPages = true,
        InsertPageMarkers = true,
        MarkerFormat = "\n\n--- Page {page_num} ---\n\n"
    }
};
page_config.go
config := &ExtractionConfig{
    Pages: &PageConfig{
        ExtractPages:      true,
        InsertPageMarkers: true,
        MarkerFormat:      "\n\n--- Page {page_num} ---\n\n",
    },
}
PageConfig.java
var config = ExtractionConfig.builder()
    .pages(PageConfig.builder()
        .extractPages(true)
        .insertPageMarkers(true)
        .markerFormat("\n\n--- Page {page_num} ---\n\n")
        .build())
    .build();
page_config.py
config = ExtractionConfig(
    pages=PageConfig(
        extract_pages=True,
        insert_page_markers=True,
        marker_format="\n\n--- Page {page_num} ---\n\n"
    )
)
page_config.rb
config = ExtractionConfig.new(
  pages: PageConfig.new(
    extract_pages: true,
    insert_page_markers: true,
    marker_format: "\n\n--- Page {page_num} ---\n\n"
  )
)
page_config.rs
let config = ExtractionConfig {
    pages: Some(PageConfig {
        extract_pages: true,
        insert_page_markers: true,
        marker_format: "\n\n--- Page {page_num} ---\n\n".to_string(),
    }),
    ..Default::default()
};
page_config.ts
const config: ExtractionConfig = {
  pages: {
    extractPages: true,
    insertPageMarkers: true,
    markerFormat: "\n\n--- Page {page_num} ---\n\n"
  }
};

Field Details

extract_pages: When true, populates ExtractionResult.pages with per-page content. Each page contains its text, tables, and images separately.

insert_page_markers: When true, inserts page markers into the combined content string at page boundaries. Useful for LLMs to understand document structure.

marker_format: Template string for page markers. Use {page_num} placeholder for the page number. Default HTML comment format is LLM-friendly.

Format-Specific Behavior

PDF: Full byte-accurate page tracking with O(1) lookup performance. Every page boundary is tracked precisely.

PPTX: Slide boundaries tracked. Each slide is treated as a "page" with PageUnitType::Slide.

DOCX: Best-effort detection using explicit page breaks. Only pages delimited by <w:br w:type="page"/> elements are tracked.

Other formats: Page tracking not available. PageStructure will be None/null.

Byte Offsets vs Character Offsets

Page boundaries use byte offsets (not character offsets) for UTF-8 safety and performance:

# Correct: Use byte offsets
boundary = boundaries[0]
page_text = content.encode('utf-8')[boundary.byte_start:boundary.byte_end].decode('utf-8')

# Incorrect: Don't use as character indices
page_text = content[boundary.byte_start:boundary.byte_end]  # Wrong for multi-byte chars

See Byte Offset Handling in the migration guide.

ImageExtractionConfig

Configuration for extracting images from documents.

Field Type Default Description
extract_images bool true Extract images from documents
target_dpi int 300 Target DPI for extracted images
max_image_dimension int 4096 Maximum image dimension (width or height) in pixels
auto_adjust_dpi bool true Automatically adjust DPI based on image size
min_dpi int 72 Minimum DPI when auto-adjusting
max_dpi int 600 Maximum DPI when auto-adjusting

Example

C#
using Kreuzberg;

var config = new ExtractionConfig
{
    Images = new ImageExtractionConfig
    {
        ExtractImages = true,
        TargetDpi = 200,
        MaxImageDimension = 2048,
        AutoAdjustDpi = true
    }
};

var result = await KreuzbergClient.ExtractFileAsync("document.pdf", config);
Console.WriteLine($"Extracted: {result.Content[..Math.Min(100, result.Content.Length)]}");
Go
package main

import (
    "log"

    "github.com/kreuzberg-dev/kreuzberg/packages/go/kreuzberg"
)

// main extracts document.pdf with image extraction tuned to a 200 DPI target
// and a 2048-pixel maximum dimension, then logs the extracted text length.
func main() {
    targetDPI := 200
    maxDim := 2048
    result, err := kreuzberg.ExtractFileSync("document.pdf", &kreuzberg.ExtractionConfig{
        // Image settings are attached through the Images field, matching the
        // complete Go example later on this page.
        Images: &kreuzberg.ImageExtractionConfig{
            ExtractImages:     kreuzberg.BoolPtr(true),
            TargetDPI:         &targetDPI,
            MaxImageDimension: &maxDim,
            AutoAdjustDPI:     kreuzberg.BoolPtr(true),
        },
    })
    if err != nil {
        log.Fatalf("extract failed: %v", err)
    }

    log.Println("content length:", len(result.Content))
}
Java
import dev.kreuzberg.config.ExtractionConfig;
import dev.kreuzberg.config.ImageExtractionConfig;

ExtractionConfig config = ExtractionConfig.builder()
    .imageExtraction(ImageExtractionConfig.builder()
        .extractImages(true)
        .targetDpi(200)
        .maxImageDimension(2048)
        .autoAdjustDpi(true)
        .build())
    .build();
Python
import asyncio
from kreuzberg import ExtractionConfig, ImageExtractionConfig, extract_file

async def main() -> None:
    config: ExtractionConfig = ExtractionConfig(
        images=ImageExtractionConfig(
            extract_images=True,
            target_dpi=200,
            max_image_dimension=2048,
            auto_adjust_dpi=True,
        )
    )
    result = await extract_file("document.pdf", config=config)
    print(f"Extracted: {result.content[:100]}")

asyncio.run(main())
Ruby
require 'kreuzberg'

# Image extraction at a 200 DPI target, capped at 2048 px per side.
# The keyword is image_extraction:, matching the complete Ruby example
# later on this page.
config = Kreuzberg::Config::Extraction.new(
  image_extraction: Kreuzberg::Config::ImageExtraction.new(
    extract_images: true,
    target_dpi: 200,
    max_image_dimension: 2048,
    auto_adjust_dpi: true
  )
)
Rust
use kreuzberg::{ExtractionConfig, ImageExtractionConfig};

/// Builds an `ExtractionConfig` that enables image extraction at a 200 DPI
/// target with a 2048-pixel size cap, and prints the resulting image settings.
fn main() {
    let config = ExtractionConfig {
        images: Some(ImageExtractionConfig {
            // Scalar fields are set directly (not wrapped in `Some`), as in
            // the complete Rust example on this page.
            extract_images: true,
            target_dpi: 200,
            max_image_dimension: 2048,
            auto_adjust_dpi: true,
            // Remaining fields (e.g. min/max DPI) keep their defaults.
            ..Default::default()
        }),
        ..Default::default()
    };
    println!("{:?}", config.images);
}
TypeScript
import { extractFile } from '@kreuzberg/node';

const config = {
    images: {
        extractImages: true,
        targetDpi: 200,
        maxImageDimension: 2048,
        autoAdjustDpi: true,
    },
};

const result = await extractFile('document.pdf', null, config);
console.log(`Extracted ${result.images?.length ?? 0} images`);

ChunkingConfig

Text chunking configuration for splitting extracted text into chunks.

Field Type Default Description
max_chars int 1000 Maximum chunk size in characters
max_overlap int 200 Overlap between chunks in characters
embedding EmbeddingConfig? None Embedding configuration for chunks
preset str? None Chunking preset: "small", "medium", "large"

Example

C#
using Kreuzberg;
using System;
using System.Collections.Generic;
using System.Threading.Tasks;

// Split the document into chunks of at most 512 characters with a
// 50-character overlap, and generate an embedding for each chunk.
var config = new ExtractionConfig
{
    Chunking = new ChunkingConfig
    {
        MaxChars = 512,
        MaxOverlap = 50,
        Embedding = new EmbeddingConfig
        {
            Model = EmbeddingModelType.Preset("balanced"),
            Normalize = true,
            BatchSize = 32,
            ShowDownloadProgress = false
        }
    }
};

// ExtractFileAsync is called on KreuzbergClient; `Kreuzberg` itself is the
// namespace imported by the using directive above.
var result = await KreuzbergClient.ExtractFileAsync("document.pdf", config);

// Chunks may be null, so fall back to an empty list.
var chunks = result.Chunks ?? new List<Chunk>();
foreach (var (index, chunk) in chunks.WithIndex())
{
    var chunkId = $"doc_chunk_{index}";
    // Print at most the first 50 characters of each chunk.
    Console.WriteLine($"Chunk {chunkId}: {chunk.Content[..Math.Min(50, chunk.Content.Length)]}");

    if (chunk.Embedding != null)
    {
        Console.WriteLine($"  Embedding dimensions: {chunk.Embedding.Length}");
    }
}

internal static class EnumerableExtensions
{
    public static IEnumerable<(int Index, T Item)> WithIndex<T>(
        this IEnumerable<T> items)
    {
        var index = 0;
        foreach (var item in items)
        {
            yield return (index++, item);
        }
    }
}
Go
package main

import (
    "fmt"
    "log"

    "github.com/kreuzberg-dev/kreuzberg/packages/go/kreuzberg"
)

func main() {
    maxChars := 1000
    maxOverlap := 200
    config := &kreuzberg.ExtractionConfig{
        Chunking: &kreuzberg.ChunkingConfig{
            MaxChars:   &maxChars,
            MaxOverlap: &maxOverlap,
        },
    }

    result, err := kreuzberg.ExtractFileSync("document.pdf", config)
    if err != nil {
        log.Fatalf("extract failed: %v", err)
    }

    for i, chunk := range result.Chunks {
        fmt.Printf("Chunk %d/%d (%d-%d)\n", i+1, chunk.Metadata.TotalChunks, chunk.Metadata.CharStart, chunk.Metadata.CharEnd)
        fmt.Printf("%s...\n", chunk.Content[:min(len(chunk.Content), 100)])
    }
}

// min returns the smaller of the two integers x and y.
func min(x, y int) int {
    smallest := y
    if x < y {
        smallest = x
    }
    return smallest
}
Java
import dev.kreuzberg.config.ChunkingConfig;
import dev.kreuzberg.config.EmbeddingConfig;
import dev.kreuzberg.config.EmbeddingModelType;
import dev.kreuzberg.config.ExtractionConfig;

ExtractionConfig config = ExtractionConfig.builder()
    .chunking(ChunkingConfig.builder()
        .maxChars(1500)
        .maxOverlap(200)
        .embedding(EmbeddingConfig.builder()
            .model(EmbeddingModelType.builder()
                .type("preset")
                .name("text-embedding-all-minilm-l6-v2")
                .build())
            .build())
        .build())
    .build();
Python
from kreuzberg import (
    ExtractionConfig,
    ChunkingConfig,
    EmbeddingConfig,
    EmbeddingModelType,
)

config: ExtractionConfig = ExtractionConfig(
    chunking=ChunkingConfig(
        max_chars=1500,
        max_overlap=200,
        embedding=EmbeddingConfig(
            model=EmbeddingModelType.preset("all-minilm-l6-v2")
        ),
    )
)
Ruby
require 'kreuzberg'

config = Kreuzberg::Config::Extraction.new(
  chunking: Kreuzberg::Config::Chunking.new(
    max_chars: 1500,
    max_overlap: 200,
    embedding: Kreuzberg::Config::Embedding.new(
      model: Kreuzberg::EmbeddingModelType.new(
        type: 'preset',
        name: 'text-embedding-all-minilm-l6-v2'
      )
    )
  )
)
Rust
use kreuzberg::{ChunkingConfig, EmbeddingConfig, EmbeddingModelType, ExtractionConfig};

/// Builds an `ExtractionConfig` that chunks text into pieces of at most 1500
/// characters with a 200-character overlap and attaches an embedding model,
/// then prints the chunking settings.
fn main() {
    let config = ExtractionConfig {
        chunking: Some(ChunkingConfig {
            // Size limits are plain integers, as in the complete Rust example
            // on this page.
            max_chars: 1500,
            max_overlap: 200,
            // NOTE(review): the shape of the embedding model value is kept as
            // originally written — confirm against the crate's API.
            embedding: Some(EmbeddingConfig {
                model: Some(EmbeddingModelType {
                    r#type: "preset".to_string(),
                    name: Some("text-embedding-all-minilm-l6-v2".to_string()),
                    ..Default::default()
                }),
                ..Default::default()
            }),
            ..Default::default()
        }),
        ..Default::default()
    };
    println!("{:?}", config.chunking);
}
TypeScript
import { extractFile } from '@kreuzberg/node';

const config = {
    chunking: {
        maxChars: 1500,
        maxOverlap: 200,
        embedding: {
            preset: 'quality',
        },
    },
};

const result = await extractFile('document.pdf', null, config);
console.log(`Chunks created: ${result.chunks?.length ?? 0}`);
WASM
import { initWasm, extractBytes } from '@kreuzberg/wasm';

await initWasm();

// Chunk into pieces of at most 1000 characters with a 100-character overlap.
// The overlap option is named maxOverlap (max_overlap in the field table).
const config = {
  chunking: {
    maxChars: 1000,
    maxOverlap: 100
  }
};

const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, 'application/pdf', config);

result.chunks?.forEach((chunk, idx) => {
  console.log(`Chunk ${idx}: ${chunk.content.substring(0, 50)}...`);
  console.log(`Tokens: ${chunk.metadata?.token_count}`);
});

EmbeddingConfig

Configuration for generating embeddings from extracted text or chunks.

Field Type Default Description
model EmbeddingModelType preset("all-MiniLM-L6-v2") Embedding model configuration
normalize bool true Normalize embeddings to unit length
batch_size int 32 Batch size for embedding generation
show_download_progress bool true Show download progress for models
cache_dir str? None Custom cache directory for models

EmbeddingModelType

Create embedding models using these factory methods:

  • EmbeddingModelType.preset(name): Use a preset model. Available presets:
      ◦ "all-MiniLM-L6-v2": Fast, 384-dimensional embeddings (default)
      ◦ "all-mpnet-base-v2": High quality, 768-dimensional embeddings
      ◦ "paraphrase-multilingual-MiniLM-L12-v2": Multilingual support

  • EmbeddingModelType.fastembed(model, dimensions): Use a FastEmbed model
      ◦ Example: fastembed("BAAI/bge-small-en-v1.5", 384)

  • EmbeddingModelType.custom(model_id, dimensions): Use a custom model
      ◦ Example: custom("sentence-transformers/all-MiniLM-L6-v2", 384)

Example

C#
using Kreuzberg;

var config = new ExtractionConfig
{
    Chunking = new ChunkingConfig
    {
        MaxChars = 1000,
        Embedding = new EmbeddingConfig
        {
            Model = EmbeddingModelType.Preset("all-mpnet-base-v2"),
            BatchSize = 16,
            Normalize = true,
            ShowDownloadProgress = true
        }
    }
};
Go
package main

import (
    "log"

    "github.com/kreuzberg-dev/kreuzberg/packages/go/kreuzberg"
)

func main() {
    maxChars := 1000
    batchSize := 16

    cfg := &kreuzberg.ExtractionConfig{
        Chunking: &kreuzberg.ChunkingConfig{
            MaxChars: &maxChars,
            Embedding: &kreuzberg.EmbeddingConfig{
                Model: &kreuzberg.EmbeddingModelType{
                    Type: "preset",
                    Name: "all-mpnet-base-v2",
                },
                BatchSize:            &batchSize,
                Normalize:            kreuzberg.BoolPtr(true),
                ShowDownloadProgress: kreuzberg.BoolPtr(true),
            },
        },
    }

    result, err := kreuzberg.ExtractFileSync("document.pdf", cfg)
    if err != nil {
        log.Fatalf("extract failed: %v", err)
    }
    log.Println("content length:", len(result.Content))
}
Java
import dev.kreuzberg.config.ChunkingConfig;
import dev.kreuzberg.config.EmbeddingConfig;
import dev.kreuzberg.config.EmbeddingModelType;
import dev.kreuzberg.config.ExtractionConfig;

ExtractionConfig config = ExtractionConfig.builder()
    .chunking(ChunkingConfig.builder()
        .maxChars(1000)
        .embedding(EmbeddingConfig.builder()
            .model(EmbeddingModelType.builder()
                .type("preset")
                .name("all-mpnet-base-v2")
                .build())
            .batchSize(16)
            .normalize(true)
            .showDownloadProgress(true)
            .build())
        .build())
    .build();
Python
from kreuzberg import ExtractionConfig, ChunkingConfig, EmbeddingConfig, EmbeddingModelType

config = ExtractionConfig(
    chunking=ChunkingConfig(
        max_chars=1000,
        embedding=EmbeddingConfig(
            model=EmbeddingModelType.preset("all-mpnet-base-v2"),
            batch_size=16,
            normalize=True,
            show_download_progress=True
        )
    )
)
Ruby
require 'kreuzberg'

config = Kreuzberg::Config::Extraction.new(
  chunking: Kreuzberg::Config::Chunking.new(
    max_chars: 1000,
    embedding: Kreuzberg::Config::Embedding.new(
      model: Kreuzberg::EmbeddingModelType.new(
        type: 'preset',
        name: 'all-mpnet-base-v2'
      ),
      batch_size: 16,
      normalize: true,
      show_download_progress: true
    )
  )
)
Rust
use kreuzberg::{ChunkingConfig, EmbeddingConfig, EmbeddingModelType, ExtractionConfig};

/// Builds an `ExtractionConfig` whose chunks (at most 1000 characters each)
/// are embedded in batches of 16 with normalization and download progress
/// enabled, then prints the chunking settings.
fn main() {
    let config = ExtractionConfig {
        chunking: Some(ChunkingConfig {
            // Plain integer, as in the complete Rust example on this page.
            max_chars: 1000,
            embedding: Some(EmbeddingConfig {
                // NOTE(review): the shape of the model value is kept as
                // originally written — confirm against the crate's API.
                model: Some(EmbeddingModelType {
                    r#type: "preset".to_string(),
                    name: Some("all-mpnet-base-v2".to_string()),
                    ..Default::default()
                }),
                // The field table lists these as non-optional int/bool values,
                // so they are set directly rather than wrapped in `Some`.
                batch_size: 16,
                normalize: true,
                show_download_progress: true,
                ..Default::default()
            }),
            ..Default::default()
        }),
        ..Default::default()
    };
    println!("{:?}", config.chunking);
}
TypeScript
import { extractFile } from '@kreuzberg/node';

const config = {
    chunking: {
        maxChars: 1000,
        embedding: {
            preset: 'quality',
        },
    },
};

const result = await extractFile('document.pdf', null, config);
if (result.chunks && result.chunks.length > 0) {
    console.log(`Chunk embeddings: ${result.chunks[0].embedding?.length ?? 0} dimensions`);
}

TokenReductionConfig

Configuration for reducing token count in extracted text.

Field Type Default Description
mode str "off" Reduction mode: "off", "moderate", "aggressive"
preserve_important_words bool true Preserve important words during reduction

Example

C#
using Kreuzberg;

var config = new ExtractionConfig
{
    TokenReduction = new TokenReductionConfig
    {
        Mode = "moderate",
        PreserveImportantWords = true
    }
};

var result = await KreuzbergClient.ExtractFileAsync("document.pdf", config);
Console.WriteLine($"Content length: {result.Content.Length}");
Go
package main

import (
    "log"

    "github.com/kreuzberg-dev/kreuzberg/packages/go/kreuzberg"
)

func main() {
    preserve := true
    result, err := kreuzberg.ExtractFileSync("document.pdf", &kreuzberg.ExtractionConfig{
        TokenReduction: &kreuzberg.TokenReductionConfig{
            Mode:                  "moderate",
            PreserveImportantWords: &preserve,
        },
    })
    if err != nil {
        log.Fatalf("extract failed: %v", err)
    }

    log.Println("content length:", len(result.Content))
}
Java
import dev.kreuzberg.config.ExtractionConfig;
import dev.kreuzberg.config.TokenReductionConfig;

ExtractionConfig config = ExtractionConfig.builder()
    .tokenReduction(TokenReductionConfig.builder()
        .mode("moderate")
        .preserveImportantWords(true)
        .build())
    .build();
Python
import asyncio
from kreuzberg import ExtractionConfig, TokenReductionConfig, extract_file

async def main() -> None:
    config: ExtractionConfig = ExtractionConfig(
        token_reduction=TokenReductionConfig(
            mode="moderate", preserve_important_words=True
        )
    )
    result = await extract_file("document.pdf", config=config)
    print(f"Content length: {len(result.content)}")

asyncio.run(main())
Ruby
require 'kreuzberg'

config = Kreuzberg::Config::Extraction.new(
  token_reduction: Kreuzberg::Config::TokenReduction.new(
    mode: 'moderate',
    preserve_important_words: true
  )
)
Rust
use kreuzberg::{ExtractionConfig, TokenReductionConfig};

/// Builds an `ExtractionConfig` with moderate token reduction that preserves
/// important words, then prints the token-reduction settings.
fn main() {
    let config = ExtractionConfig {
        token_reduction: Some(TokenReductionConfig {
            // Both fields are set directly (not wrapped in `Some`), as in the
            // complete Rust example on this page.
            mode: "moderate".to_string(),
            preserve_important_words: true,
        }),
        ..Default::default()
    };
    println!("{:?}", config.token_reduction);
}
TypeScript
import { extractFile } from '@kreuzberg/node';

const config = {
    tokenReduction: {
        mode: 'moderate',
        preserveImportantWords: true,
    },
};

const result = await extractFile('document.pdf', null, config);
console.log(result.content);

LanguageDetectionConfig

Configuration for automatic language detection.

Field Type Default Description
enabled bool true Enable language detection
min_confidence float 0.8 Minimum confidence threshold (0.0-1.0)
detect_multiple bool false Detect multiple languages (vs. dominant only)

Example

C#
using Kreuzberg;

// Detect every language in the document whose confidence is at least 0.9.
var config = new ExtractionConfig
{
    LanguageDetection = new LanguageDetectionConfig
    {
        Enabled = true,
        // min_confidence is documented as a float; an `m` suffix would make
        // this a decimal literal, which does not convert implicitly to a
        // floating-point type.
        MinConfidence = 0.9,
        DetectMultiple = true
    }
};

var result = await KreuzbergClient.ExtractFileAsync("document.pdf", config);
// DetectedLanguages may be null, so fall back to an empty list before joining.
Console.WriteLine($"Languages: {string.Join(", ", result.DetectedLanguages ?? new List<string>())}");
Go
package main

import (
    "log"

    "github.com/kreuzberg-dev/kreuzberg/packages/go/kreuzberg"
)

func main() {
    minConfidence := 0.9
    detectMultiple := true
    result, err := kreuzberg.ExtractFileSync("document.pdf", &kreuzberg.ExtractionConfig{
        LanguageDetection: &kreuzberg.LanguageDetectionConfig{
            Enabled:        kreuzberg.BoolPtr(true),
            MinConfidence:  &minConfidence,
            DetectMultiple: &detectMultiple,
        },
    })
    if err != nil {
        log.Fatalf("extract failed: %v", err)
    }

    log.Println("content length:", len(result.Content))
}
Java
import dev.kreuzberg.config.ExtractionConfig;
import dev.kreuzberg.config.LanguageDetectionConfig;

ExtractionConfig config = ExtractionConfig.builder()
    .languageDetection(LanguageDetectionConfig.builder()
        .enabled(true)
        .minConfidence(0.9)
        .detectMultiple(true)
        .build())
    .build();
Python
import asyncio
from kreuzberg import ExtractionConfig, LanguageDetectionConfig, extract_file

async def main() -> None:
    config: ExtractionConfig = ExtractionConfig(
        language_detection=LanguageDetectionConfig(
            enabled=True, min_confidence=0.9, detect_multiple=True
        )
    )
    result = await extract_file("document.pdf", config=config)
    print(f"Languages: {result.detected_languages}")

asyncio.run(main())
Ruby
require 'kreuzberg'

config = Kreuzberg::Config::Extraction.new(
  language_detection: Kreuzberg::Config::LanguageDetection.new(
    enabled: true,
    min_confidence: 0.9,
    detect_multiple: true
  )
)
Rust
use kreuzberg::{ExtractionConfig, LanguageDetectionConfig};

/// Builds an `ExtractionConfig` that detects multiple languages with a
/// minimum confidence of 0.9, then prints the language-detection settings.
fn main() {
    let config = ExtractionConfig {
        language_detection: Some(LanguageDetectionConfig {
            // Fields are set directly (not wrapped in `Some`), as in the
            // complete Rust example on this page.
            enabled: true,
            min_confidence: 0.9,
            detect_multiple: true,
        }),
        ..Default::default()
    };
    println!("{:?}", config.language_detection);
}
TypeScript
import { extractFile } from '@kreuzberg/node';

const config = {
    languageDetection: {
        enabled: true,
        minConfidence: 0.9,
        detectMultiple: true,
    },
};

const result = await extractFile('document.pdf', null, config);
console.log(result.content);

PostProcessorConfig

Configuration for post-processing pipeline.

Field Type Default Description
enabled bool true Enable post-processing pipeline
enabled_processors list[str]? None Specific processors to enable (if None, all enabled)
disabled_processors list[str]? None Specific processors to disable

Example

C#
using Kreuzberg;

var config = new ExtractionConfig
{
    Postprocessor = new PostProcessorConfig
    {
        Enabled = true,
        EnabledProcessors = new List<string> { "deduplication" }
    }
};

var result = await KreuzbergClient.ExtractFileAsync("document.pdf", config);
Console.WriteLine($"Content: {result.Content[..Math.Min(100, result.Content.Length)]}");
Go
package main

import "github.com/kreuzberg-dev/kreuzberg/packages/go/kreuzberg"

func main() {
    enabled := true
    cfg := &kreuzberg.ExtractionConfig{
        Postprocessor: &kreuzberg.PostProcessorConfig{
            Enabled:            &enabled,
            EnabledProcessors:  []string{"deduplication", "whitespace_normalization"},
            DisabledProcessors: []string{"mojibake_fix"},
        },
    }

    _ = cfg
}
Java
import dev.kreuzberg.config.ExtractionConfig;
import dev.kreuzberg.config.PostProcessorConfig;
import java.util.Arrays;

ExtractionConfig config = ExtractionConfig.builder()
    .postprocessor(PostProcessorConfig.builder()
        .enabled(true)
        .enabledProcessors(Arrays.asList("deduplication", "whitespace_normalization"))
        .disabledProcessors(Arrays.asList("mojibake_fix"))
        .build())
    .build();
Python
import asyncio
from kreuzberg import ExtractionConfig, PostProcessorConfig, extract_file

async def main() -> None:
    config: ExtractionConfig = ExtractionConfig(
        postprocessor=PostProcessorConfig(
            enabled=True,
            enabled_processors=["deduplication"],
        )
    )
    result = await extract_file("document.pdf", config=config)
    print(f"Content: {result.content[:100]}")

asyncio.run(main())
Ruby
require 'kreuzberg'

config = Kreuzberg::Config::Extraction.new(
  postprocessor: Kreuzberg::Config::PostProcessor.new(
    enabled: true,
    enabled_processors: ['deduplication', 'whitespace_normalization'],
    disabled_processors: ['mojibake_fix']
  )
)
Rust
use kreuzberg::{ExtractionConfig, PostProcessorConfig};

/// Builds an `ExtractionConfig` with the post-processing pipeline enabled,
/// an explicit allow-list and deny-list of processors, and prints it.
fn main() {
    let config = ExtractionConfig {
        postprocessor: Some(PostProcessorConfig {
            // `enabled` is a plain bool, as in the complete Rust example on
            // this page; the processor lists are optional and stay in `Some`.
            enabled: true,
            enabled_processors: Some(vec![
                "deduplication".to_string(),
                "whitespace_normalization".to_string(),
            ]),
            disabled_processors: Some(vec!["mojibake_fix".to_string()]),
        }),
        ..Default::default()
    };
    println!("{:?}", config.postprocessor);
}
TypeScript
import { extractFile } from '@kreuzberg/node';

const config = {
    postprocessor: {
        enabled: true,
        enabledProcessors: ['deduplication', 'whitespace_normalization'],
        disabledProcessors: ['mojibake_fix'],
    },
};

const result = await extractFile('document.pdf', null, config);
console.log(result.content);

Complete Example

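Here's a complete example that combines the main configuration sections (OCR, PDF, images, chunking, token reduction, language detection, and post-processing) in a single configuration. The exact set of options shown varies slightly between languages: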

C#
using Kreuzberg;

var config = new ExtractionConfig
{
    UseCache = true,
    EnableQualityProcessing = true,
    Ocr = new OcrConfig
    {
        Backend = "tesseract",
        Language = "eng+fra",
        TesseractConfig = new TesseractConfig { Psm = 3 }
    },
    PdfOptions = new PdfConfig { ExtractImages = true },
    Chunking = new ChunkingConfig
    {
        MaxChars = 1000,
        MaxOverlap = 200,
        Embedding = new EmbeddingConfig
        {
            Model = EmbeddingModelType.Preset("all-MiniLM-L6-v2")
        }
    }
};

var result = await KreuzbergClient.ExtractFileAsync("document.pdf", config);
Console.WriteLine($"Content: {result.Content[..Math.Min(100, result.Content.Length)]}");
Go
package main

import (
    "fmt"
    "log"

    "github.com/kreuzberg-dev/kreuzberg/packages/go/kreuzberg"
)

func main() {
    config := &kreuzberg.ExtractionConfig{
        UseCache:                kreuzberg.BoolPtr(true),
        EnableQualityProcessing: kreuzberg.BoolPtr(true),
        ForceOCR:                kreuzberg.BoolPtr(false),
        OCR: &kreuzberg.OCRConfig{
            Backend:   "tesseract",
            Language:  kreuzberg.StringPtr("eng+fra"),
            Tesseract: &kreuzberg.TesseractConfig{
                PSM:                  kreuzberg.IntPtr(3),
                OEM:                  kreuzberg.IntPtr(3),
                MinConfidence:        kreuzberg.FloatPtr(0.8),
                EnableTableDetection: kreuzberg.BoolPtr(true),
            },
        },
        PdfOptions: &kreuzberg.PdfConfig{
            ExtractImages:   kreuzberg.BoolPtr(true),
            ExtractMetadata: kreuzberg.BoolPtr(true),
        },
        Images: &kreuzberg.ImageExtractionConfig{
            ExtractImages:     kreuzberg.BoolPtr(true),
            TargetDPI:         kreuzberg.IntPtr(150),
            MaxImageDimension: kreuzberg.IntPtr(4096),
        },
        Chunking: &kreuzberg.ChunkingConfig{
            MaxChars:   kreuzberg.IntPtr(1000),
            MaxOverlap: kreuzberg.IntPtr(200),
        },
        TokenReduction: &kreuzberg.TokenReductionConfig{
            Mode:                   "moderate",
            PreserveImportantWords: kreuzberg.BoolPtr(true),
        },
        LanguageDetection: &kreuzberg.LanguageDetectionConfig{
            Enabled:        kreuzberg.BoolPtr(true),
            MinConfidence:  kreuzberg.FloatPtr(0.8),
            DetectMultiple: kreuzberg.BoolPtr(false),
        },
    }

    result, err := kreuzberg.ExtractFileSync("document.pdf", config)
    if err != nil {
        log.Fatalf("extract failed: %v", err)
    }

    fmt.Printf("Extracted content length: %d\n", len(result.Content))
}
Java
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.config.*;

ExtractionConfig config = ExtractionConfig.builder()
    .useCache(true)
    .enableQualityProcessing(true)
    .forceOcr(false)
    .ocr(OcrConfig.builder()
        .backend("tesseract")
        .language("eng+fra")
        .build())
    .pdfOptions(PdfConfig.builder()
        .extractImages(true)
        .extractMetadata(true)
        .build())
    .imageExtraction(ImageExtractionConfig.builder()
        .extractImages(true)
        .targetDpi(150)
        .maxImageDimension(4096)
        .build())
    .imagePreprocessing(ImagePreprocessingConfig.builder()
        .targetDpi(300)
        .denoise(true)
        .deskew(true)
        .contrastEnhance(true)
        .build())
    .chunking(ChunkingConfig.builder()
        .maxChars(1000)
        .maxOverlap(200)
        .build())
    .tokenReduction(TokenReductionConfig.builder()
        .mode("moderate")
        .preserveImportantWords(true)
        .build())
    .languageDetection(LanguageDetectionConfig.builder()
        .enabled(true)
        .minConfidence(0.8)
        .build())
    .postprocessor(PostProcessorConfig.builder()
        .enabled(true)
        .build())
    .build();

ExtractionResult result = Kreuzberg.extractFile("document.pdf", config);
Python
import asyncio
from kreuzberg import (
    extract_file,
    ExtractionConfig,
    OcrConfig,
    TesseractConfig,
    PdfConfig,
    ChunkingConfig,
    EmbeddingConfig,
    EmbeddingModelType,
)

async def main() -> None:
    config: ExtractionConfig = ExtractionConfig(
        use_cache=True,
        enable_quality_processing=True,
        ocr=OcrConfig(
            backend="tesseract",
            language="eng+fra",
            tesseract_config=TesseractConfig(psm=3),
        ),
        pdf_options=PdfConfig(extract_images=True),
        chunking=ChunkingConfig(
            max_chars=1000,
            max_overlap=200,
            embedding=EmbeddingConfig(
                model=EmbeddingModelType.preset("all-MiniLM-L6-v2")
            ),
        ),
    )
    result = await extract_file("document.pdf", config=config)
    print(f"Content: {result.content[:100]}")

asyncio.run(main())
Ruby
require 'kreuzberg'

config = Kreuzberg::Config::Extraction.new(
  use_cache: true,
  enable_quality_processing: true,
  force_ocr: false,
  ocr: Kreuzberg::Config::OCR.new(
    backend: 'tesseract',
    language: 'eng+fra'
  ),
  pdf_options: Kreuzberg::Config::PDF.new(
    extract_images: true,
    extract_metadata: true
  ),
  image_extraction: Kreuzberg::Config::ImageExtraction.new(
    extract_images: true,
    target_dpi: 150,
    max_image_dimension: 4096
  ),
  chunking: Kreuzberg::Config::Chunking.new(
    max_chars: 1000,
    max_overlap: 200
  ),
  token_reduction: Kreuzberg::Config::TokenReduction.new(mode: 'moderate'),
  language_detection: Kreuzberg::Config::LanguageDetection.new(
    enabled: true,
    min_confidence: 0.8
  ),
  postprocessor: Kreuzberg::Config::PostProcessor.new(enabled: true)
)

result = Kreuzberg.extract_file_sync('document.pdf', config: config)
puts "Content length: #{result.content.length}"
Rust
use kreuzberg::{
    extract_file, ExtractionConfig, OcrConfig, TesseractConfig, ImagePreprocessingConfig,
    PdfConfig, ImageExtractionConfig, ChunkingConfig, TokenReductionConfig,
    LanguageDetectionConfig, PostProcessorConfig,
};

#[tokio::main]
async fn main() -> kreuzberg::Result<()> {
    let config = ExtractionConfig {
        use_cache: true,
        enable_quality_processing: true,
        force_ocr: false,
        ocr: Some(OcrConfig {
            backend: "tesseract".to_string(),
            language: "eng+fra".to_string(),
            tesseract_config: Some(TesseractConfig {
                psm: 3,
                oem: 3,
                min_confidence: 0.8,
                preprocessing: Some(ImagePreprocessingConfig {
                    target_dpi: 300,
                    denoise: true,
                    deskew: true,
                    contrast_enhance: true,
                    ..Default::default()
                }),
                enable_table_detection: true,
                ..Default::default()
            }),
        }),
        pdf_options: Some(PdfConfig {
            extract_images: true,
            extract_metadata: true,
            ..Default::default()
        }),
        images: Some(ImageExtractionConfig {
            extract_images: true,
            target_dpi: 150,
            max_image_dimension: 4096,
            ..Default::default()
        }),
        chunking: Some(ChunkingConfig {
            max_chars: 1000,
            max_overlap: 200,
            ..Default::default()
        }),
        token_reduction: Some(TokenReductionConfig {
            mode: "moderate".to_string(),
            preserve_important_words: true,
        }),
        language_detection: Some(LanguageDetectionConfig {
            enabled: true,
            min_confidence: 0.8,
            detect_multiple: false,
        }),
        postprocessor: Some(PostProcessorConfig {
            enabled: true,
            ..Default::default()
        }),
        ..Default::default()
    };

    let result = extract_file("document.pdf", None, &config).await?;
    println!("Extracted content length: {}", result.content.len());
    Ok(())
}
kreuzberg.toml
use_cache = true
enable_quality_processing = true
force_ocr = false

[ocr]
backend = "tesseract"
language = "eng+fra"

[ocr.tesseract_config]
psm = 3
oem = 3
min_confidence = 0.8
enable_table_detection = true

[ocr.tesseract_config.preprocessing]
target_dpi = 300
denoise = true
deskew = true
contrast_enhance = true

[pdf_options]
extract_images = true
extract_metadata = true

[images]
extract_images = true
target_dpi = 150
max_image_dimension = 4096

[chunking]
max_chars = 1000
max_overlap = 200

[chunking.embedding]
batch_size = 32

[token_reduction]
mode = "moderate"
preserve_important_words = true

[language_detection]
enabled = true
min_confidence = 0.8
detect_multiple = false

[postprocessor]
enabled = true
TypeScript
import { extractFile } from '@kreuzberg/node';

// Full configuration mirroring the other language tabs of this example:
// caching and quality processing on, Tesseract OCR for English + French,
// PDF/image extraction, chunking with embeddings, moderate token reduction,
// language detection, and the post-processing pipeline.
const config = {
    useCache: true,
    enableQualityProcessing: true,
    forceOcr: false,
    ocr: {
        backend: 'tesseract',
        language: 'eng+fra',
        tesseractConfig: {
            psm: 3,
            enableTableDetection: true,
        },
    },
    pdfOptions: {
        extractImages: true,
        extractMetadata: true,
    },
    images: {
        extractImages: true,
        targetDpi: 150,
        // 4096 matches the value used in the other tabs of this example.
        maxImageDimension: 4096,
    },
    chunking: {
        maxChars: 1000,
        maxOverlap: 200,
        embedding: {
            preset: 'balanced',
        },
    },
    tokenReduction: {
        mode: 'moderate',
        preserveImportantWords: true,
    },
    languageDetection: {
        enabled: true,
        minConfidence: 0.8,
        detectMultiple: false,
    },
    postprocessor: {
        enabled: true,
    },
};

const result = await extractFile('document.pdf', null, config);
console.log(`Extracted content length: ${result.content.length}`);