Quick Start¶
Get up and running with Kreuzberg in minutes.
Choosing Your TypeScript Package
Kreuzberg provides two TypeScript packages for different runtimes:
- @kreuzberg/node – Use for Node.js servers and CLI tools (native performance, 100% speed)
- @kreuzberg/wasm – Use for browsers, Cloudflare Workers, Deno, Bun, and serverless (60-80% speed, cross-platform)
The examples below show both. Pick the one matching your runtime. See Platform Overview for detailed guidance.
Basic Extraction¶
Extract text from any supported document format:
package main
import (
"fmt"
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/kreuzberg"
)
func main() {
result, err := kreuzberg.ExtractFileSync("document.pdf", nil)
if err != nil {
log.Fatalf("extract failed: %v", err)
}
fmt.Println(result.Content)
fmt.Printf("Tables: %d\n", len(result.Tables))
fmt.Printf("Metadata: %+v\n", result.Metadata)
}
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.KreuzbergException;
import java.io.IOException;
try {
ExtractionResult result = Kreuzberg.extractFile("document.pdf");
System.out.println(result.getContent());
System.out.println("Tables: " + result.getTables().size());
System.out.println("Metadata: " + result.getMetadata());
} catch (IOException | KreuzbergException e) {
e.printStackTrace();
}
from kreuzberg import extract_file_sync, ExtractionConfig
config: ExtractionConfig = ExtractionConfig()
result = extract_file_sync("document.pdf", config=config)
content: str = result.content
table_count: int = len(result.tables)
metadata: dict = result.metadata
print(f"Content length: {len(content)} characters")
print(f"Tables: {table_count}")
print(f"Metadata keys: {list(metadata.keys())}")
use kreuzberg::{extract_file_sync, ExtractionConfig};
fn main() -> kreuzberg::Result<()> {
let result = extract_file_sync("document.pdf", None, &ExtractionConfig::default())?;
println!("{}", result.content);
println!("Tables: {}", result.tables.len());
println!("Metadata: {:?}", result.metadata);
Ok(())
}
import { extractFromFile, initWasm } from '@kreuzberg/wasm';
await initWasm();
const fileInput = document.getElementById('file') as HTMLInputElement;
const file = fileInput.files?.[0];
if (file) {
const result = await extractFromFile(file);
console.log(result.content);
console.log(`Tables: ${result.tables.length}`);
console.log(`Metadata: ${JSON.stringify(result.metadata)}`);
}
Async Extraction¶
For better performance with I/O-bound operations:
package main
import (
"context"
"fmt"
"log"
"time"
"github.com/kreuzberg-dev/kreuzberg/packages/go/kreuzberg"
)
func main() {
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
result, err := kreuzberg.ExtractFile(ctx, "document.pdf", nil)
if err != nil {
log.Fatalf("extract failed: %v", err)
}
fmt.Println(result.Content)
}
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import java.nio.file.Path;
import java.util.concurrent.CompletableFuture;
public class Example {
public static void main(String[] args) {
CompletableFuture<ExtractionResult> future =
Kreuzberg.extractFileAsync(Path.of("document.pdf"), null);
future.thenAccept(result -> {
System.out.println(result.getContent());
System.out.println("Tables: " + result.getTables().size());
}).join();
}
}
import asyncio
from kreuzberg import extract_file, ExtractionConfig
async def main() -> None:
config: ExtractionConfig = ExtractionConfig()
result = await extract_file("document.pdf", config=config)
content: str = result.content
table_count: int = len(result.tables)
print(f"Content length: {len(content)} characters")
print(f"Tables: {table_count}")
asyncio.run(main())
import { extractFromFile, initWasm } from '@kreuzberg/wasm';
await initWasm();
const fileInput = document.getElementById('file') as HTMLInputElement;
const file = fileInput.files?.[0];
if (file) {
const result = await extractFromFile(file);
const content = result.content;
const tableCount = result.tables.length;
console.log(`Content length: ${content.length} characters`);
console.log(`Tables: ${tableCount}`);
}
Not Applicable
Async extraction is an API-level feature. The CLI operates synchronously. Use language-specific bindings (Python, TypeScript, Rust, WASM) for async operations.
OCR Extraction¶
Extract text from images and scanned documents:
package main
import (
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/kreuzberg"
)
func main() {
lang := "eng"
cfg := &kreuzberg.ExtractionConfig{
OCR: &kreuzberg.OCRConfig{
Backend: "tesseract",
Language: &lang,
},
}
result, err := kreuzberg.ExtractFileSync("scanned.pdf", cfg)
if err != nil {
log.Fatalf("extract failed: %v", err)
}
log.Println(len(result.Content))
}
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.KreuzbergException;
import dev.kreuzberg.config.ExtractionConfig;
import dev.kreuzberg.config.OcrConfig;
import java.io.IOException;
public class Main {
public static void main(String[] args) {
try {
ExtractionConfig config = ExtractionConfig.builder()
.ocr(OcrConfig.builder()
.backend("tesseract")
.language("eng")
.build())
.build();
ExtractionResult result = Kreuzberg.extractFile("scanned.pdf", config);
System.out.println(result.getContent());
} catch (IOException | KreuzbergException e) {
System.err.println("Extraction failed: " + e.getMessage());
}
}
}
from kreuzberg import extract_file_sync, ExtractionConfig, OcrConfig
config: ExtractionConfig = ExtractionConfig(
ocr=OcrConfig(backend="tesseract", language="eng")
)
result = extract_file_sync("scanned.pdf", config=config)
content: str = result.content
preview: str = content[:100]
total_length: int = len(content)
print(f"Extracted content (preview): {preview}")
print(f"Total characters: {total_length}")
use kreuzberg::{extract_file_sync, ExtractionConfig, OcrConfig};
fn main() -> kreuzberg::Result<()> {
let config = ExtractionConfig {
ocr: Some(OcrConfig {
backend: "tesseract".to_string(),
language: Some("eng".to_string()),
..Default::default()
}),
..Default::default()
};
let result = extract_file_sync("scanned.pdf", None, &config)?;
println!("{}", result.content);
Ok(())
}
import { enableOcr, extractFromFile, initWasm } from '@kreuzberg/wasm';
await initWasm();
await enableOcr();
const fileInput = document.getElementById('file') as HTMLInputElement;
const file = fileInput.files?.[0];
if (file) {
const result = await extractFromFile(file, file.type, {
ocr: {
backend: 'tesseract-wasm',
language: 'eng',
},
});
console.log(result.content);
}
Batch Processing¶
Process multiple files concurrently:
package main
import (
"fmt"
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/kreuzberg"
)
func main() {
files := []string{"doc1.pdf", "doc2.docx", "doc3.pptx"}
results, err := kreuzberg.BatchExtractFilesSync(files, nil)
if err != nil {
log.Fatalf("batch extract failed: %v", err)
}
for i, result := range results {
if result == nil {
continue
}
fmt.Printf("File %d: %d characters\n", i+1, len(result.Content))
}
}
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.KreuzbergException;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;
try {
List<String> files = Arrays.asList("doc1.pdf", "doc2.docx", "doc3.pptx");
List<ExtractionResult> results = Kreuzberg.batchExtractFiles(files, null);
for (int i = 0; i < results.size(); i++) {
ExtractionResult result = results.get(i);
System.out.println("File " + (i + 1) + ": " + result.getContent().length() + " characters");
}
} catch (IOException | KreuzbergException e) {
e.printStackTrace();
}
from kreuzberg import batch_extract_files_sync, ExtractionConfig
files: list[str] = ["doc1.pdf", "doc2.docx", "doc3.pptx"]
config: ExtractionConfig = ExtractionConfig()
results = batch_extract_files_sync(files, config=config)
for i, result in enumerate(results):
char_count: int = len(result.content)
print(f"File {i + 1}: {char_count} characters")
use kreuzberg::{batch_extract_file_sync, ExtractionConfig};
/// Extracts three documents with a single batch call and prints the length
/// of each result's content, numbered from 1.
fn main() -> kreuzberg::Result<()> {
    let paths = vec!["doc1.pdf", "doc2.docx", "doc3.pptx"];
    let extraction_config = ExtractionConfig::default();

    let extracted = batch_extract_file_sync(&paths, None, &extraction_config)?;

    // Pair every result with its 1-based position instead of adding 1 to an index.
    for (item, number) in extracted.iter().zip(1..) {
        let length = item.content.len();
        println!("File {}: {} characters", number, length);
    }

    Ok(())
}
import { extractFromFile, initWasm } from '@kreuzberg/wasm';
await initWasm();
const fileInputs = document.getElementById('files') as HTMLInputElement;
const files = Array.from(fileInputs.files || []);
const results = await Promise.all(
files.map((file) => extractFromFile(file))
);
results.forEach((result, i) => {
console.log(`File ${i + 1}: ${result.content.length} characters`);
});
Extract from Bytes¶
When you already have file content in memory:
package main
import (
"log"
"os"
"github.com/kreuzberg-dev/kreuzberg/packages/go/kreuzberg"
)
func main() {
data, err := os.ReadFile("document.pdf")
if err != nil {
log.Fatalf("read file: %v", err)
}
result, err := kreuzberg.ExtractBytesSync(data, "application/pdf", nil)
if err != nil {
log.Fatalf("extract failed: %v", err)
}
log.Println(result.Content)
}
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.KreuzbergException;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
try {
byte[] data = Files.readAllBytes(Paths.get("document.pdf"));
ExtractionResult result = Kreuzberg.extractBytes(
data,
"application/pdf",
null
);
System.out.println(result.getContent());
} catch (IOException | KreuzbergException e) {
e.printStackTrace();
}
Not Applicable
The CLI operates on files from disk. For in-memory data processing, use language-specific bindings.
However, you can use CLI with pipes and temporary files:
Advanced Configuration¶
Customize extraction behavior:
using Kreuzberg;
var config = new ExtractionConfig
{
Ocr = new OcrConfig { Backend = "tesseract", Language = "eng+deu" },
Chunking = new ChunkingConfig { MaxChars = 1000, MaxOverlap = 100 },
TokenReduction = new TokenReductionConfig { Enabled = true },
LanguageDetection = new LanguageDetectionConfig
{
Enabled = true,
DetectMultiple = true
},
UseCache = true,
EnableQualityProcessing = true
};
var result = KreuzbergClient.ExtractFileSync("document.pdf", config);
foreach (var chunk in result.Chunks)
{
Console.WriteLine($"Chunk: {chunk.Content[..Math.Min(100, chunk.Content.Length)]}");
}
if (result.DetectedLanguages?.Count > 0)
{
Console.WriteLine($"Languages: {string.Join(", ", result.DetectedLanguages)}");
}
package main
import (
"fmt"
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/kreuzberg"
)
// main extracts document.pdf with OCR, chunking, language detection, caching,
// and quality processing enabled, then prints a preview of the first chunk
// and any detected languages.
func main() {
	lang := "eng+deu" // Multiple languages
	chunkSize := 1000
	chunkOverlap := 100
	detectLanguage := true
	detectMultiple := true
	useCache := true
	enableQuality := true

	config := &kreuzberg.ExtractionConfig{
		OCR: &kreuzberg.OCRConfig{
			Backend:  "tesseract",
			Language: &lang,
		},
		Chunking: &kreuzberg.ChunkingConfig{
			ChunkSize:    &chunkSize,
			ChunkOverlap: &chunkOverlap,
		},
		LanguageDetection: &kreuzberg.LanguageDetectionConfig{
			// Uses its own flag. Pointing this at useCache would make language
			// detection silently follow the cache setting.
			Enabled:        &detectLanguage,
			DetectMultiple: &detectMultiple,
		},
		UseCache:                &useCache,
		EnableQualityProcessing: &enableQuality,
	}

	result, err := kreuzberg.ExtractFileSync("document.pdf", config)
	if err != nil {
		log.Fatalf("extract failed: %v", err)
	}

	// Access chunks
	if len(result.Chunks) > 0 {
		snippet := result.Chunks[0].Content
		if len(snippet) > 100 {
			snippet = snippet[:100]
		}
		fmt.Printf("First chunk: %s...\n", snippet)
	}

	// Access detected languages
	if len(result.DetectedLanguages) > 0 {
		fmt.Printf("Languages: %v\n", result.DetectedLanguages)
	}
}
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.KreuzbergException;
import dev.kreuzberg.config.*;
import java.io.IOException;
public class Main {
public static void main(String[] args) {
try {
ExtractionConfig config = ExtractionConfig.builder()
.ocr(OcrConfig.builder()
.backend("tesseract")
.language("eng+deu")
.build())
.chunking(ChunkingConfig.builder()
.maxChars(1000)
.maxOverlap(100)
.build())
.tokenReduction(TokenReductionConfig.builder()
.mode("moderate")
.preserveImportantWords(true)
.build())
.languageDetection(LanguageDetectionConfig.builder()
.enabled(true)
.build())
.useCache(true)
.enableQualityProcessing(true)
.build();
ExtractionResult result = Kreuzberg.extractFile("document.pdf", config);
if (!result.getDetectedLanguages().isEmpty()) {
System.out.println("Languages: " + result.getDetectedLanguages());
}
} catch (IOException | KreuzbergException e) {
System.err.println("Extraction failed: " + e.getMessage());
}
}
}
from kreuzberg import (
extract_file_sync,
ExtractionConfig,
OcrConfig,
ChunkingConfig,
TokenReductionConfig,
LanguageDetectionConfig,
)
config = ExtractionConfig(
ocr=OcrConfig(backend="tesseract", language="eng+deu"),
chunking=ChunkingConfig(max_chars=1000, max_overlap=100),
token_reduction=TokenReductionConfig(enabled=True),
language_detection=LanguageDetectionConfig(
enabled=True, detect_multiple=True
),
use_cache=True,
enable_quality_processing=True,
)
result = extract_file_sync("document.pdf", config=config)
for chunk in result.chunks:
print(f"Chunk: {chunk.content[:100]}")
if result.detected_languages:
print(f"Languages: {result.detected_languages}")
require 'kreuzberg'
# Enable OCR (English + German), chunking, language detection, caching, and
# quality processing for the extraction.
config = Kreuzberg::Config::Extraction.new(
  ocr: Kreuzberg::Config::OCR.new(
    backend: 'tesseract',
    language: 'eng+deu'
  ),
  chunking: Kreuzberg::Config::Chunking.new(
    max_chars: 1000,
    max_overlap: 100
  ),
  language_detection: Kreuzberg::Config::LanguageDetection.new,
  use_cache: true,
  enable_quality_processing: true
)

result = Kreuzberg.extract_file_sync('document.pdf', config: config)

# Preview at most 100 characters of each chunk's text. The exclusive range
# `0...100` stops at index 99, whereas `0..100` would return 101 characters.
result.chunks&.each { |chunk| puts chunk.content[0...100] }
puts "Languages: #{result.detected_languages.inspect}"
use kreuzberg::{
extract_file_sync, ChunkingConfig, ExtractionConfig, LanguageDetectionConfig, OcrConfig,
};
/// Extracts `document.pdf` with OCR, chunking, language detection, caching,
/// and quality processing enabled, then prints a preview of every chunk and
/// the detected languages.
///
/// # Errors
///
/// Returns the error produced by `extract_file_sync` if extraction fails.
fn main() -> kreuzberg::Result<()> {
    let config = ExtractionConfig {
        ocr: Some(OcrConfig {
            backend: "tesseract".to_string(),
            language: Some("eng+deu".to_string()),
            ..Default::default()
        }),
        chunking: Some(ChunkingConfig {
            max_chars: 1000,
            max_overlap: 100,
            ..Default::default()
        }),
        language_detection: Some(LanguageDetectionConfig {
            enabled: true,
            detect_multiple: true,
            ..Default::default()
        }),
        use_cache: true,
        enable_quality_processing: true,
        ..Default::default()
    };

    let result = extract_file_sync("document.pdf", None, &config)?;

    if let Some(chunks) = result.chunks {
        for chunk in chunks {
            // Take the first 100 characters, not the first 100 bytes: slicing a
            // `str` at a byte offset panics when the offset falls inside a
            // multi-byte UTF-8 sequence.
            let preview: String = chunk.content.chars().take(100).collect();
            println!("Chunk: {}...", preview);
        }
    }

    if let Some(languages) = result.detected_languages {
        println!("Languages: {:?}", languages);
    }

    Ok(())
}
import { extractFileSync } from '@kreuzberg/node';
const config = {
ocr: {
backend: 'tesseract',
language: 'eng+deu',
},
chunking: {
maxChars: 1000,
maxOverlap: 100,
},
tokenReduction: {
mode: 'aggressive',
},
languageDetection: {
enabled: true,
detectMultiple: true,
},
useCache: true,
enableQualityProcessing: true,
};
const result = extractFileSync('document.pdf', null, config);
if (result.chunks) {
for (const chunk of result.chunks) {
console.log(`Chunk: ${chunk.content.substring(0, 100)}...`);
}
}
if (result.detectedLanguages) {
console.log(`Languages: ${result.detectedLanguages.join(', ')}`);
}
import { extractFromFile, initWasm } from '@kreuzberg/wasm';
await initWasm();
const config = {
ocr: {
backend: 'tesseract-wasm',
language: 'eng',
},
chunking: {
maxChars: 1000,
chunkOverlap: 100,
},
enable_language_detection: true,
enable_quality: true,
};
const fileInput = document.getElementById('file') as HTMLInputElement;
const file = fileInput.files?.[0];
if (file) {
const result = await extractFromFile(file, file.type, config);
if (result.chunks) {
for (const chunk of result.chunks) {
console.log(`Chunk: ${chunk.content.substring(0, 100)}...`);
}
}
if (result.detectedLanguages) {
console.log(`Languages: ${result.detectedLanguages.join(', ')}`);
}
}
Configure extraction behavior via command-line flags or config files:
# Using command-line flags
kreuzberg extract document.pdf \
--ocr \
--chunk --chunk-size 1000 --chunk-overlap 100 \
--detect-language \
--quality
# Using config file
kreuzberg extract document.pdf --config kreuzberg.toml
kreuzberg.toml:
# Top-level options must come before the first [table] header. Keys written
# after a header belong to that table, not to the document root.
use_cache = true
enable_quality_processing = true

[ocr]
backend = "tesseract"
language = "eng"

[chunking]
max_chunk_size = 1000
overlap = 100

[language_detection]
enabled = true
detect_multiple = true
kreuzberg.yaml:
Working with Metadata¶
Access format-specific metadata from extracted documents:
using Kreuzberg;
var config = new ExtractionConfig
{
PdfOptions = new PdfConfig { ExtractMetadata = true }
};
var result = KreuzbergClient.ExtractFileSync("document.pdf", config);
if (result.Metadata?.Format.Pdf != null)
{
var pdfMeta = result.Metadata.Format.Pdf;
Console.WriteLine($"Pages: {pdfMeta.PageCount}");
Console.WriteLine($"Author: {pdfMeta.Author}");
Console.WriteLine($"Title: {pdfMeta.Title}");
}
var htmlResult = KreuzbergClient.ExtractFileSync("page.html", config);
if (htmlResult.Metadata?.Format.Html != null)
{
var htmlMeta = htmlResult.Metadata.Format.Html;
Console.WriteLine($"Title: {htmlMeta.Title}");
Console.WriteLine($"Description: {htmlMeta.Description}");
}
package main
import (
"fmt"
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/kreuzberg"
)
func main() {
result, err := kreuzberg.ExtractFileSync("document.pdf", nil)
if err != nil {
log.Fatalf("extract pdf: %v", err)
}
// Access PDF metadata
if pdf, ok := result.Metadata.PdfMetadata(); ok {
if pdf.PageCount != nil {
fmt.Printf("Pages: %d\n", *pdf.PageCount)
}
if pdf.Author != nil {
fmt.Printf("Author: %s\n", *pdf.Author)
}
if pdf.Title != nil {
fmt.Printf("Title: %s\n", *pdf.Title)
}
}
// Access HTML metadata
htmlResult, err := kreuzberg.ExtractFileSync("page.html", nil)
if err != nil {
log.Fatalf("extract html: %v", err)
}
if html, ok := htmlResult.Metadata.HTMLMetadata(); ok {
if html.Title != nil {
fmt.Printf("Title: %s\n", *html.Title)
}
if html.Description != nil {
fmt.Printf("Description: %s\n", *html.Description)
}
if html.OGImage != nil {
fmt.Printf("Open Graph Image: %s\n", *html.OGImage)
}
}
}
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.KreuzbergException;
import java.io.IOException;
import java.util.Map;
/**
 * Prints format-specific metadata for a PDF and an HTML document.
 *
 * <p>Metadata is read as a nested map: the {@code "pdf"} and {@code "html"}
 * entries hold the fields for the respective format and are absent (null)
 * when the document is of another type.
 */
public class Main {
    public static void main(String[] args) {
        try {
            // Blocking extraction uses extractFile, matching the other Java examples.
            ExtractionResult result = Kreuzberg.extractFile("document.pdf");

            // Access PDF metadata
            @SuppressWarnings("unchecked")
            Map<String, Object> pdfMeta = (Map<String, Object>) result.getMetadata().get("pdf");
            if (pdfMeta != null) {
                System.out.println("Pages: " + pdfMeta.get("page_count"));
                System.out.println("Author: " + pdfMeta.get("author"));
                System.out.println("Title: " + pdfMeta.get("title"));
            }

            // Access HTML metadata
            ExtractionResult htmlResult = Kreuzberg.extractFile("page.html");
            @SuppressWarnings("unchecked")
            Map<String, Object> htmlMeta = (Map<String, Object>) htmlResult.getMetadata().get("html");
            if (htmlMeta != null) {
                System.out.println("Title: " + htmlMeta.get("title"));
                System.out.println("Description: " + htmlMeta.get("description"));
                System.out.println("Open Graph Image: " + htmlMeta.get("og_image"));
            }
        } catch (IOException | KreuzbergException e) {
            System.err.println("Extraction failed: " + e.getMessage());
        }
    }
}
from kreuzberg import extract_file_sync, ExtractionConfig
result = extract_file_sync("document.pdf", config=ExtractionConfig())
pdf_meta: dict = result.metadata.get("pdf", {})
if pdf_meta:
print(f"Pages: {pdf_meta.get('page_count')}")
print(f"Author: {pdf_meta.get('author')}")
print(f"Title: {pdf_meta.get('title')}")
result = extract_file_sync("page.html", config=ExtractionConfig())
html_meta: dict = result.metadata.get("html", {})
if html_meta:
print(f"Title: {html_meta.get('title')}")
print(f"Description: {html_meta.get('description')}")
require 'kreuzberg'
result = Kreuzberg.extract_file_sync('document.pdf')
# Access PDF metadata
if result.metadata['pdf']
pdf_meta = result.metadata['pdf']
puts "Pages: #{pdf_meta['page_count']}"
puts "Author: #{pdf_meta['author']}"
puts "Title: #{pdf_meta['title']}"
end
# Access HTML metadata
html_result = Kreuzberg.extract_file_sync('page.html')
if html_result.metadata['html']
html_meta = html_result.metadata['html']
puts "Title: #{html_meta['title']}"
puts "Description: #{html_meta['description']}"
puts "Open Graph Image: #{html_meta['og_image']}"
end
use kreuzberg::{extract_file_sync, ExtractionConfig};
/// Prints the PDF metadata of `document.pdf` and the HTML metadata of
/// `page.html`, skipping every field that is not present.
fn main() -> kreuzberg::Result<()> {
    // Both extractions use the default configuration.
    let defaults = ExtractionConfig::default();

    let pdf_doc = extract_file_sync("document.pdf", None, &defaults)?;
    if let Some(pdf) = pdf_doc.metadata.pdf {
        if let Some(page_count) = pdf.page_count {
            println!("Pages: {page_count}");
        }
        if let Some(author) = pdf.author {
            println!("Author: {author}");
        }
        if let Some(title) = pdf.title {
            println!("Title: {title}");
        }
    }

    let html_doc = extract_file_sync("page.html", None, &defaults)?;
    if let Some(html) = html_doc.metadata.html {
        if let Some(title) = html.title {
            println!("Title: {title}");
        }
        if let Some(description) = html.description {
            println!("Description: {description}");
        }
        if let Some(og_image) = html.og_image {
            println!("Open Graph Image: {og_image}");
        }
    }

    Ok(())
}
import { extractFileSync } from '@kreuzberg/node';
const result = extractFileSync('document.pdf');
console.log(`Metadata: ${JSON.stringify(result.metadata)}`);
if (result.metadata.page_count) {
console.log(`Pages: ${result.metadata.page_count}`);
}
const htmlResult = extractFileSync('page.html');
console.log(`HTML Metadata: ${JSON.stringify(htmlResult.metadata)}`);
if (htmlResult.metadata.title) {
console.log(`Title: ${htmlResult.metadata.title}`);
}
import { extractFromFile, initWasm } from '@kreuzberg/wasm';
await initWasm();
const fileInput = document.getElementById('file') as HTMLInputElement;
const file = fileInput.files?.[0];
if (file) {
const result = await extractFromFile(file);
console.log(`Metadata: ${JSON.stringify(result.metadata)}`);
if (result.metadata.page_count) {
console.log(`Pages: ${result.metadata.page_count}`);
}
if (result.metadata.title) {
console.log(`Title: ${result.metadata.title}`);
}
}
Extract and parse metadata using JSON output:
# Extract with metadata
kreuzberg extract document.pdf --metadata --format json --pretty
# Save to file and parse metadata
kreuzberg extract document.pdf --metadata --format json > result.json
# Extract PDF metadata
cat result.json | jq '.metadata.pdf'
# Extract HTML metadata
kreuzberg extract page.html --metadata --format json | jq '.metadata.html'
# Get specific fields
kreuzberg extract document.pdf --metadata --format json | \
jq '.metadata | {page_count, author, title}'
# Process multiple files
kreuzberg batch documents/*.pdf --metadata --format json > all_metadata.json
JSON Output Structure:
Kreuzberg extracts format-specific metadata for:
- PDF: page count, title, author, subject, keywords, dates
- HTML: 21 fields including SEO meta tags, Open Graph, Twitter Card
- Excel: sheet count, sheet names
- Email: from, to, CC, BCC, message ID, attachments
- PowerPoint: title, author, description, fonts
- Images: dimensions, format, EXIF data
- Archives: format, file count, file list, sizes
- XML: element count, unique elements
- Text/Markdown: word count, line count, headers, links
See Types Reference for complete metadata reference.
Working with Tables¶
Extract and process tables from documents:
using Kreuzberg;
var result = KreuzbergClient.ExtractFileSync("document.pdf", new ExtractionConfig());
foreach (var table in result.Tables)
{
Console.WriteLine($"Table with {table.Cells.Count} rows");
Console.WriteLine(table.Markdown);
foreach (var row in table.Cells)
{
Console.WriteLine(string.Join(" | ", row));
}
}
package main
import (
"fmt"
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/kreuzberg"
)
func main() {
result, err := kreuzberg.ExtractFileSync("document.pdf", nil)
if err != nil {
log.Fatalf("extract failed: %v", err)
}
// Iterate over tables
for _, table := range result.Tables {
fmt.Printf("Table with %d rows\n", len(table.Cells))
fmt.Println(table.Markdown) // Markdown representation
// Access cells
for _, row := range table.Cells {
fmt.Println(row)
}
}
}
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.KreuzbergException;
import dev.kreuzberg.Table;
import java.io.IOException;
import java.util.List;
public class Main {
public static void main(String[] args) {
try {
ExtractionResult result = Kreuzberg.extractFile("document.pdf");
for (Table table : result.getTables()) {
System.out.println("Table with " + table.cells().size() + " rows");
System.out.println(table.markdown());
for (List<String> row : table.cells()) {
System.out.println(row);
}
}
} catch (IOException | KreuzbergException e) {
System.err.println("Extraction failed: " + e.getMessage());
}
}
}
from kreuzberg import extract_file_sync, ExtractionConfig, ExtractedTable
result = extract_file_sync("document.pdf", config=ExtractionConfig())
for table in result.tables:
row_count: int = len(table.cells)
print(f"Table with {row_count} rows")
print(table.markdown)
for row in table.cells:
print(row)
use kreuzberg::{extract_file_sync, ExtractionConfig};
/// Extracts `document.pdf` and, for every table found, prints its row count,
/// its Markdown rendering, and each row of cells.
fn main() -> kreuzberg::Result<()> {
    let defaults = ExtractionConfig::default();
    let extraction = extract_file_sync("document.pdf", None, &defaults)?;

    for tbl in extraction.tables.iter() {
        let row_count = tbl.cells.len();
        println!("Table with {row_count} rows");
        println!("{}", tbl.markdown);

        for cells in tbl.cells.iter() {
            println!("{cells:?}");
        }
    }

    Ok(())
}
import { extractFromFile, initWasm } from '@kreuzberg/wasm';
await initWasm();
const fileInput = document.getElementById('file') as HTMLInputElement;
const file = fileInput.files?.[0];
if (file) {
const result = await extractFromFile(file);
for (const table of result.tables) {
console.log(`Table with ${table.cells.length} rows`);
console.log(`Page: ${table.pageNumber}`);
console.log(table.markdown);
}
}
Extract and process tables from documents:
# Extract tables
kreuzberg extract document.pdf --tables --format json --pretty
# Save tables to JSON
kreuzberg extract spreadsheet.xlsx --tables --format json > tables.json
# Extract and parse table markdown
kreuzberg extract document.pdf --tables --format json | \
jq '.tables[] | .markdown'
# Get table cells
kreuzberg extract document.pdf --tables --format json | \
jq '.tables[] | .cells'
# Batch extract tables from multiple files
kreuzberg batch documents/**/*.pdf --tables --format json > all_tables.json
JSON Table Structure:
Error Handling¶
Handle extraction errors gracefully:
using Kreuzberg;
try
{
var result = KreuzbergClient.ExtractFileSync("missing.pdf");
Console.WriteLine(result.Content);
}
catch (KreuzbergValidationException ex)
{
Console.Error.WriteLine($"Validation error: {ex.Message}");
}
catch (KreuzbergIOException ex)
{
Console.Error.WriteLine($"IO error: {ex.Message}");
throw;
}
catch (KreuzbergException ex)
{
Console.Error.WriteLine($"Extraction failed: {ex.Message}");
throw;
}
package main
import (
"errors"
"fmt"
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/kreuzberg"
)
func main() {
result, err := kreuzberg.ExtractFileSync("document.pdf", nil)
if err != nil {
switch {
case errors.As(err, new(*kreuzberg.ValidationError)):
log.Fatalf("invalid configuration: %v", err)
case errors.As(err, new(*kreuzberg.ParsingError)):
log.Fatalf("failed to parse document: %v", err)
case errors.As(err, new(*kreuzberg.OCRError)):
log.Fatalf("OCR processing failed: %v", err)
case errors.As(err, new(*kreuzberg.MissingDependencyError)):
log.Fatalf("missing dependency: %v", err)
default:
log.Fatalf("extraction error: %v", err)
}
}
fmt.Println(result.Content)
}
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.KreuzbergException;
import java.io.IOException;
try {
ExtractionResult result = Kreuzberg.extractFile("document.pdf");
System.out.println("Extracted: " + result.getContent()
.substring(0, Math.min(100, result.getContent().length())));
} catch (IOException e) {
System.err.println("File not found: " + e.getMessage());
} catch (KreuzbergException e) {
System.err.println("Extraction failed: " + e.getMessage());
}
try {
byte[] pdfBytes = new byte[] { };
ExtractionResult result = Kreuzberg.extractBytes(pdfBytes, "application/pdf", null);
System.out.println("Extracted " + result.getContent().length() + " characters");
} catch (KreuzbergException e) {
System.err.println("Extraction failed: " + e.getMessage());
}
from kreuzberg import extract_file_sync, extract_bytes_sync, ExtractionConfig
from kreuzberg import (
KreuzbergError,
ParsingError,
OCRError,
ValidationError,
)
try:
result = extract_file_sync("document.pdf")
print(f"Extracted {len(result.content)} characters")
except ParsingError as e:
print(f"Failed to parse document: {e}")
except OCRError as e:
print(f"OCR processing failed: {e}")
except KreuzbergError as e:
print(f"Extraction error: {e}")
try:
config: ExtractionConfig = ExtractionConfig()
pdf_bytes: bytes = b"%PDF-1.4\n"
result = extract_bytes_sync(pdf_bytes, "application/pdf", config)
print(f"Extracted: {result.content[:100]}")
except ValidationError as e:
print(f"Invalid configuration: {e}")
except OCRError as e:
print(f"OCR failed: {e}")
except KreuzbergError as e:
print(f"Extraction failed: {e}")
require 'kreuzberg'
begin
result = Kreuzberg.extract_file_sync('document.pdf')
puts result.content
rescue Kreuzberg::ValidationError => e
puts "Invalid configuration: #{e.message}"
rescue Kreuzberg::ParsingError => e
puts "Failed to parse document: #{e.message}"
rescue Kreuzberg::OCRError => e
puts "OCR processing failed: #{e.message}"
rescue Kreuzberg::MissingDependencyError => e
puts "Missing dependency: #{e.message}"
rescue Kreuzberg::Error => e
puts "Extraction error: #{e.message}"
rescue StandardError => e
puts "System error: #{e.message}"
end
use kreuzberg::{extract_file_sync, extract_bytes_sync, ExtractionConfig, KreuzbergError};
/// Demonstrates matching on specific `KreuzbergError` variants for both
/// file-based and in-memory extraction.
///
/// Failures of the file extraction are only reported on stderr. Failures of
/// the byte extraction are reported and then returned to the caller.
///
/// # Errors
///
/// Returns the error produced by `extract_bytes_sync`, unchanged.
fn main() -> kreuzberg::Result<()> {
    match extract_file_sync("document.pdf", None, &ExtractionConfig::default()) {
        Ok(result) => {
            println!("Extracted {} characters", result.content.len());
        }
        Err(KreuzbergError::Parsing { message, .. }) => {
            eprintln!("Failed to parse document: {}", message);
        }
        Err(KreuzbergError::Ocr { message, .. }) => {
            eprintln!("OCR processing failed: {}", message);
        }
        Err(KreuzbergError::MissingDependency { message, .. }) => {
            eprintln!("Missing dependency: {}", message);
        }
        Err(e) => {
            eprintln!("Extraction failed: {}", e);
        }
    }

    let pdf_bytes = b"%PDF-1.4\n...";
    match extract_bytes_sync(pdf_bytes, "application/pdf", None, &ExtractionConfig::default()) {
        Ok(result) => {
            // Preview by characters, not bytes: slicing a `str` at a byte offset
            // panics when the offset falls inside a multi-byte UTF-8 sequence.
            let preview: String = result.content.chars().take(100).collect();
            println!("Extracted: {}", preview);
            Ok(())
        }
        Err(err) => {
            // Inspect the error by reference so the original value, including
            // its underlying `source`, is returned intact rather than rebuilt.
            match &err {
                KreuzbergError::Validation { message, .. } => {
                    eprintln!("Invalid configuration: {}", message);
                }
                KreuzbergError::Ocr { message, .. } => {
                    eprintln!("OCR failed: {}", message);
                }
                other => {
                    eprintln!("Extraction failed: {}", other);
                }
            }
            Err(err)
        }
    }
}
import { extractFromFile, initWasm } from '@kreuzberg/wasm';
await initWasm();
const fileInput = document.getElementById('file') as HTMLInputElement;
const file = fileInput.files?.[0];
if (file) {
try {
const result = await extractFromFile(file);
console.log(result.content);
} catch (error) {
if (error instanceof Error) {
console.error(`Extraction error: ${error.message}`);
} else {
throw error;
}
}
}
Next Steps¶
- Contributing - Learn how to contribute to Kreuzberg