Skip to content

Extractor Registry

The ExtractorRegistry manages document extractors and allows custom extractor registration.

kreuzberg.ExtractorRegistry

Registry for managing document extractors.

This class maintains a registry of extractors for different file types and provides functionality to get the appropriate extractor for a given MIME type, as well as add or remove custom extractors.

Source code in kreuzberg/_registry.py
class ExtractorRegistry:
    """Central lookup that maps MIME types to extractor classes.

    Two lists are consulted: custom extractors registered at runtime via
    ``add_extractor`` and the built-in defaults. Custom extractors are checked
    first, so they win over a built-in extractor that supports the same MIME type.
    """

    # Built-in extractors, consulted in this order after any custom ones.
    _default_extractors: ClassVar[list[type[Extractor]]] = [
        PDFExtractor,
        OfficeDocumentExtractor,
        PresentationExtractor,
        SpreadSheetExtractor,
        HTMLExtractor,
        EmailExtractor,
        StructuredDataExtractor,
        MarkdownExtractor,
        ImageExtractor,
        BibliographyExtractor,
        EbookExtractor,
        LaTeXExtractor,
        MiscFormatExtractor,
        StructuredTextExtractor,
        TabularDataExtractor,
        XMLBasedExtractor,
    ]
    # Custom extractors added at runtime; a single list shared at class level.
    _registered_extractors: ClassVar[list[type[Extractor]]] = []

    @classmethod
    @lru_cache
    def get_extractor(cls, mime_type: str | None, config: ExtractionConfig) -> Extractor | None:
        """Look up and instantiate the first extractor that supports a MIME type.

        Results are memoized by ``lru_cache``; ``add_extractor`` and
        ``remove_extractor`` clear that cache whenever the registry changes.

        Args:
            mime_type: MIME type to resolve. ``None`` or an empty string never matches.
            config: Extraction configuration handed to the extractor constructor.

        Returns:
            A new instance of the first matching extractor class, or None when
            no extractor supports the MIME type.
        """
        if not mime_type:
            return None

        # Custom extractors come first so they take precedence over defaults.
        candidates = (*cls._registered_extractors, *cls._default_extractors)
        matching = next(
            (candidate for candidate in candidates if candidate.supports_mimetype(mime_type)),
            None,
        )
        if matching is None:
            return None
        return matching(mime_type=mime_type, config=config)

    @classmethod
    def add_extractor(cls, extractor: type[Extractor]) -> None:
        """Register a custom extractor class.

        Args:
            extractor: Extractor class to append to the custom extractor list.
        """
        registered = cls._registered_extractors
        registered.append(extractor)
        # Cached lookups were computed against the old registry contents.
        cls.get_extractor.cache_clear()

    @classmethod
    def remove_extractor(cls, extractor: type[Extractor]) -> None:
        """Unregister a custom extractor class.

        Removing a class that was never registered is a silent no-op.

        Args:
            extractor: Extractor class to drop from the custom extractor list.
        """
        try:
            cls._registered_extractors.remove(extractor)
        except ValueError:
            # Not registered: the registry is unchanged, so the cache stays valid.
            return
        cls.get_extractor.cache_clear()

Functions

add_extractor(extractor: type[Extractor]) -> None classmethod

Add a custom extractor to the registry.

PARAMETER DESCRIPTION
extractor

The extractor class to add to the registry.

TYPE: type[Extractor]

Source code in kreuzberg/_registry.py
@classmethod
def add_extractor(cls, extractor: type[Extractor]) -> None:
    """Register a custom extractor class.

    Args:
        extractor: Extractor class to append to the custom extractor list.
    """
    registered = cls._registered_extractors
    registered.append(extractor)
    # Cached lookups were computed against the old registry contents.
    cls.get_extractor.cache_clear()

get_extractor(mime_type: str | None, config: ExtractionConfig) -> Extractor | None cached classmethod

Get an appropriate extractor for the given MIME type.

PARAMETER DESCRIPTION
mime_type

The MIME type to find an extractor for.

TYPE: str | None

config

The extraction configuration.

TYPE: ExtractionConfig

RETURNS DESCRIPTION
Extractor | None

An extractor instance if one supports the MIME type, None otherwise.

Source code in kreuzberg/_registry.py
@classmethod
@lru_cache
def get_extractor(cls, mime_type: str | None, config: ExtractionConfig) -> Extractor | None:
    """Look up and instantiate the first extractor that supports a MIME type.

    Results are memoized by ``lru_cache``.

    Args:
        mime_type: MIME type to resolve. ``None`` or an empty string never matches.
        config: Extraction configuration handed to the extractor constructor.

    Returns:
        A new instance of the first matching extractor class, or None when
        no extractor supports the MIME type.
    """
    if not mime_type:
        return None

    # Custom extractors come first so they take precedence over defaults.
    candidates = (*cls._registered_extractors, *cls._default_extractors)
    matching = next(
        (candidate for candidate in candidates if candidate.supports_mimetype(mime_type)),
        None,
    )
    if matching is None:
        return None
    return matching(mime_type=mime_type, config=config)

remove_extractor(extractor: type[Extractor]) -> None classmethod

Remove a custom extractor from the registry.

PARAMETER DESCRIPTION
extractor

The extractor class to remove from the registry.

TYPE: type[Extractor]

Source code in kreuzberg/_registry.py
@classmethod
def remove_extractor(cls, extractor: type[Extractor]) -> None:
    """Unregister a custom extractor class.

    Removing a class that was never registered is a silent no-op.

    Args:
        extractor: Extractor class to drop from the custom extractor list.
    """
    try:
        cls._registered_extractors.remove(extractor)
    except ValueError:
        # Not registered: the registry is unchanged, so the cache stays valid.
        return
    cls.get_extractor.cache_clear()