Skip to content

Text loader

TextLoader

Bases: BaseLoader

Source code in griptape/griptape/loaders/text_loader.py
@define
class TextLoader(BaseLoader):
    """Loads raw text (a string or a file path) into a list of ``TextArtifact``s.

    The text is optionally split into chunks sized against the tokenizer's
    context window, and each chunk can be embedded via ``embedding_driver``.
    """

    # Fraction of the tokenizer's context window allowed per chunk.
    MAX_TOKEN_RATIO = 0.5

    # Tokenizer used to measure chunk sizes; defaults to the GPT-3 chat model.
    tokenizer: OpenAiTokenizer = field(
        default=Factory(lambda: OpenAiTokenizer(model=OpenAiTokenizer.DEFAULT_OPENAI_GPT_3_CHAT_MODEL)), kw_only=True
    )
    # Per-chunk token budget, derived from the tokenizer's maximum.
    max_tokens: int = field(
        default=Factory(lambda self: round(self.tokenizer.max_tokens * self.MAX_TOKEN_RATIO), takes_self=True),
        kw_only=True,
    )
    # Chunker wired to the tokenizer and token budget above.
    chunker: TextChunker = field(
        default=Factory(
            lambda self: TextChunker(tokenizer=self.tokenizer, max_tokens=self.max_tokens), takes_self=True
        ),
        kw_only=True,
    )
    # Optional driver; when set, every chunk gets an embedding generated.
    embedding_driver: Optional[BaseEmbeddingDriver] = field(default=None, kw_only=True)
    # Encoding used both to read files and tagged onto each artifact.
    encoding: str = field(default="utf-8", kw_only=True)

    def load(self, text: str | Path) -> list[TextArtifact]:
        """Convert one text source into artifacts."""
        return self.text_to_artifacts(text)

    def load_collection(self, texts: list[str | Path]) -> dict[str, list[TextArtifact]]:
        """Load many sources concurrently; results are keyed by source hash."""
        futures = {
            utils.str_to_hash(str(source)): self.futures_executor.submit(self.text_to_artifacts, source)
            for source in texts
        }
        return utils.execute_futures_dict(futures)

    def text_to_artifacts(self, text: str | Path) -> list[TextArtifact]:
        """Read, chunk, optionally embed, and encoding-tag the given text."""
        # A Path is read from disk; a plain string is used as-is.
        body = Path(text).read_text(encoding=self.encoding) if isinstance(text, Path) else text

        # Without a chunker, the whole body becomes a single artifact.
        chunks = self.chunker.chunk(body) if self.chunker else [TextArtifact(body)]

        if self.embedding_driver:
            for chunk in chunks:
                chunk.generate_embedding(self.embedding_driver)

        artifacts = list(chunks)
        for artifact in artifacts:
            artifact.encoding = self.encoding

        return artifacts

MAX_TOKEN_RATIO = 0.5 class-attribute instance-attribute

chunker: TextChunker = field(default=Factory(lambda self: TextChunker(tokenizer=self.tokenizer, max_tokens=self.max_tokens), takes_self=True), kw_only=True) class-attribute instance-attribute

embedding_driver: Optional[BaseEmbeddingDriver] = field(default=None, kw_only=True) class-attribute instance-attribute

encoding: str = field(default='utf-8', kw_only=True) class-attribute instance-attribute

max_tokens: int = field(default=Factory(lambda self: round(self.tokenizer.max_tokens * self.MAX_TOKEN_RATIO), takes_self=True), kw_only=True) class-attribute instance-attribute

tokenizer: OpenAiTokenizer = field(default=Factory(lambda: OpenAiTokenizer(model=OpenAiTokenizer.DEFAULT_OPENAI_GPT_3_CHAT_MODEL)), kw_only=True) class-attribute instance-attribute

load(text)

Source code in griptape/griptape/loaders/text_loader.py
def load(self, text: str | Path) -> list[TextArtifact]:
    """Load one text source (a string or a file path) into a list of TextArtifacts."""
    return self.text_to_artifacts(text)

load_collection(texts)

Source code in griptape/griptape/loaders/text_loader.py
def load_collection(self, texts: list[str | Path]) -> dict[str, list[TextArtifact]]:
    """Load several text sources concurrently.

    Returns a dict mapping the hash of each source (as a string) to its
    resulting list of TextArtifacts.
    """
    return utils.execute_futures_dict(
        # One future per source; the key is the hash of the source's string form.
        {utils.str_to_hash(str(text)): self.futures_executor.submit(self.text_to_artifacts, text) for text in texts}
    )

text_to_artifacts(text)

Source code in griptape/griptape/loaders/text_loader.py
def text_to_artifacts(self, text: str | Path) -> list[TextArtifact]:
    """Turn *text* into TextArtifacts: read it if it is a file path, chunk it,
    optionally generate embeddings, and tag each artifact with the loader's encoding.
    """
    artifacts = []

    # A Path is read from disk with the configured encoding; a str is used directly.
    if isinstance(text, Path):
        with open(text, "r", encoding=self.encoding) as file:
            body = file.read()
    else:
        body = text

    # Without a chunker, the entire body becomes a single artifact.
    if self.chunker:
        chunks = self.chunker.chunk(body)
    else:
        chunks = [TextArtifact(body)]

    # Embeddings are generated only when a driver is configured.
    if self.embedding_driver:
        for chunk in chunks:
            chunk.generate_embedding(self.embedding_driver)

    # Propagate the loader's encoding onto every artifact returned.
    for chunk in chunks:
        chunk.encoding = self.encoding
        artifacts.append(chunk)

    return artifacts