Skip to content

PDF loader

PdfLoader

Bases: BaseTextLoader

Source code in griptape/loaders/pdf_loader.py
@define
class PdfLoader(BaseTextLoader):
    """Text loader that reads a PDF supplied as raw bytes and converts its text into artifacts.

    Both fields are keyword-only:

    * ``chunker`` defaults to a ``PdfChunker`` built from this loader's own
      ``tokenizer`` and ``max_tokens``.
    * ``encoding`` is typed as ``None`` and defaults to ``None``.
    """

    # The factory receives the instance being built (takes_self=True) so the
    # default chunker can share the loader's tokenizer and token limit.
    chunker: PdfChunker = field(
        kw_only=True,
        default=Factory(
            lambda loader: PdfChunker(tokenizer=loader.tokenizer, max_tokens=loader.max_tokens),
            takes_self=True,
        ),
    )
    encoding: None = field(kw_only=True, default=None)

    def load(
        self, source: bytes, password: Optional[str] = None, *args, **kwargs
    ) -> ErrorArtifact | list[TextArtifact]:
        """Extract the text of every page in a PDF and hand it to ``_text_to_artifacts``.

        Args:
            source: Raw bytes of the PDF document.
            password: Optional password forwarded to the pypdf reader.
            *args: Accepted but not used by this method.
            **kwargs: Accepted but not used by this method.

        Returns:
            Whatever ``self._text_to_artifacts`` returns for the page texts
            joined with newline characters.
        """
        # pypdf is an optional dependency, so it is resolved at call time.
        pypdf = import_optional_dependency("pypdf")
        document = pypdf.PdfReader(BytesIO(source), strict=True, password=password)
        page_texts = [page.extract_text() for page in document.pages]
        return self._text_to_artifacts("\n".join(page_texts))

    def load_collection(self, sources: list[bytes], *args, **kwargs) -> dict[str, ErrorArtifact | list[TextArtifact]]:
        """Load several PDFs by delegating to the parent class's ``load_collection``.

        Args:
            sources: One ``bytes`` object per PDF document.
            *args: Forwarded unchanged to the parent implementation.
            **kwargs: Forwarded unchanged to the parent implementation.

        Returns:
            The parent's result, passed through ``cast`` to the declared
            dictionary type.
        """
        result_type = dict[str, Union[ErrorArtifact, list[TextArtifact]]]
        loaded = super().load_collection(sources, *args, **kwargs)
        return cast(result_type, loaded)

chunker: PdfChunker = field(default=Factory(lambda self: PdfChunker(tokenizer=self.tokenizer, max_tokens=self.max_tokens), takes_self=True), kw_only=True) class-attribute instance-attribute

encoding: None = field(default=None, kw_only=True) class-attribute instance-attribute

load(source, password=None, *args, **kwargs)

Source code in griptape/loaders/pdf_loader.py
def load(
    self, source: bytes, password: Optional[str] = None, *args, **kwargs
) -> ErrorArtifact | list[TextArtifact]:
    """Extract the text of every page in a PDF and hand it to ``_text_to_artifacts``.

    Args:
        source: Raw bytes of the PDF document.
        password: Optional password forwarded to the pypdf reader.
        *args: Accepted but not used by this method.
        **kwargs: Accepted but not used by this method.

    Returns:
        Whatever ``self._text_to_artifacts`` returns for the page texts
        joined with newline characters.
    """
    # pypdf is an optional dependency, so it is resolved at call time.
    pypdf = import_optional_dependency("pypdf")
    document = pypdf.PdfReader(BytesIO(source), strict=True, password=password)
    page_texts = [page.extract_text() for page in document.pages]
    return self._text_to_artifacts("\n".join(page_texts))

load_collection(sources, *args, **kwargs)

Source code in griptape/loaders/pdf_loader.py
def load_collection(self, sources: list[bytes], *args, **kwargs) -> dict[str, ErrorArtifact | list[TextArtifact]]:
    """Load several PDFs by delegating to the parent class's ``load_collection``.

    Args:
        sources: One ``bytes`` object per PDF document.
        *args: Forwarded unchanged to the parent implementation.
        **kwargs: Forwarded unchanged to the parent implementation.

    Returns:
        The parent's result, passed through ``cast`` to the declared
        dictionary type.
    """
    result_type = dict[str, Union[ErrorArtifact, list[TextArtifact]]]
    loaded = super().load_collection(sources, *args, **kwargs)
    return cast(result_type, loaded)