Skip to content

PDF loader

PdfLoader

Bases: TextLoader

Source code in griptape/griptape/loaders/pdf_loader.py
@define
class PdfLoader(TextLoader):
    """Loader that reads the text of PDF documents and turns it into text artifacts.

    Each PDF is opened with ``PdfReader``; the text extracted from every page
    is joined with newlines and passed to ``text_to_artifacts``.
    """

    # Defaults to a PdfChunker built from this loader's own tokenizer and
    # max_tokens, which is why the factory needs access to ``self``.
    chunker: PdfChunker = field(
        default=Factory(
            lambda self: PdfChunker(tokenizer=self.tokenizer, max_tokens=self.max_tokens),
            takes_self=True,
        ),
        kw_only=True,
    )

    def load(self, stream: str | IO | Path, password: str | None = None) -> list[TextArtifact]:
        """Load a single PDF.

        Args:
            stream: Path or stream handed to ``PdfReader``.
            password: Optional password forwarded to ``PdfReader``.

        Returns:
            The artifacts produced by ``text_to_artifacts`` for the document text.
        """
        artifacts = self._load_pdf(stream, password)
        return artifacts

    def load_collection(
        self, streams: list[str | IO | Path], password: str | None = None
    ) -> dict[str, list[TextArtifact]]:
        """Load several PDFs through ``self.futures_executor``.

        Args:
            streams: Sources to load; each one is submitted as its own task.
            password: Optional password applied to every source.

        Returns:
            Whatever ``execute_futures_dict`` returns for the submitted tasks,
            keyed by ``str_to_hash`` of each source's text form.
        """
        pending = {}
        for source in streams:
            # bytes are decoded to text; everything else is keyed by its str() form.
            key_text = source.decode() if isinstance(source, bytes) else str(source)
            # Hash first, then submit, mirroring key-before-value evaluation.
            key = str_to_hash(key_text)
            pending[key] = self.futures_executor.submit(self._load_pdf, source, password)

        return execute_futures_dict(pending)

    def _load_pdf(self, stream: str | IO | Path, password: str | None) -> list[TextArtifact]:
        """Read every page of one PDF and convert the combined text to artifacts."""
        reader = PdfReader(stream, strict=True, password=password)
        page_texts = [page.extract_text() for page in reader.pages]
        document_text = "\n".join(page_texts)

        return self.text_to_artifacts(document_text)

chunker: PdfChunker = field(default=Factory(lambda self: PdfChunker(tokenizer=self.tokenizer, max_tokens=self.max_tokens), takes_self=True), kw_only=True) class-attribute instance-attribute

load(stream, password=None)

Source code in griptape/griptape/loaders/pdf_loader.py
def load(self, stream: str | IO | Path, password: str | None = None) -> list[TextArtifact]:
    """Load a single PDF by delegating to ``self._load_pdf``.

    Args:
        stream: Source of the PDF, passed through unchanged.
        password: Optional password, passed through unchanged.

    Returns:
        Whatever ``self._load_pdf(stream, password)`` returns.
    """
    artifacts = self._load_pdf(stream, password)
    return artifacts

load_collection(streams, password=None)

Source code in griptape/griptape/loaders/pdf_loader.py
def load_collection(
    self, streams: list[str | IO | Path], password: str | None = None
) -> dict[str, list[TextArtifact]]:
    """Load several PDFs through ``self.futures_executor``.

    Args:
        streams: Sources to load; each one is submitted as its own task.
        password: Optional password applied to every source.

    Returns:
        Whatever ``execute_futures_dict`` returns for the submitted tasks,
        keyed by ``str_to_hash`` of each source's text form.
    """
    pending = {}
    for source in streams:
        # bytes are decoded to text; everything else is keyed by its str() form.
        key_text = source.decode() if isinstance(source, bytes) else str(source)
        # Hash first, then submit, mirroring key-before-value evaluation.
        key = str_to_hash(key_text)
        pending[key] = self.futures_executor.submit(self._load_pdf, source, password)

    return execute_futures_dict(pending)