Tokenizers
__all__ = ['BaseTokenizer', 'OpenAiTokenizer', 'CohereTokenizer', 'HuggingFaceTokenizer', 'AnthropicTokenizer', 'BedrockTitanTokenizer', 'BedrockCohereTokenizer', 'BedrockJurassicTokenizer', 'BedrockClaudeTokenizer', 'BedrockLlamaTokenizer', 'GoogleTokenizer', 'VoyageAiTokenizer', 'SimpleTokenizer', 'DummyTokenizer']
module-attribute
AnthropicTokenizer
Bases: BaseTokenizer
Source code in griptape/tokenizers/anthropic_tokenizer.py
MODEL_PREFIXES_TO_MAX_INPUT_TOKENS = {'claude-3': 200000, 'claude-2.1': 200000, 'claude': 100000}
class-attribute
instance-attribute
MODEL_PREFIXES_TO_MAX_OUTPUT_TOKENS = {'claude': 4096}
class-attribute
instance-attribute
client: Anthropic = field(default=Factory(lambda: import_optional_dependency('anthropic').Anthropic()), kw_only=True)
class-attribute
instance-attribute
BaseTokenizer
Bases: ABC
Source code in griptape/tokenizers/base_tokenizer.py
MODEL_PREFIXES_TO_MAX_INPUT_TOKENS = {}
class-attribute
instance-attribute
MODEL_PREFIXES_TO_MAX_OUTPUT_TOKENS = {}
class-attribute
instance-attribute
max_input_tokens: int = field(kw_only=True, default=None)
class-attribute
instance-attribute
max_output_tokens: int = field(kw_only=True, default=None)
class-attribute
instance-attribute
model: str = field(kw_only=True)
class-attribute
instance-attribute
stop_sequences: list[str] = field(default=Factory(lambda: [utils.constants.RESPONSE_STOP_SEQUENCE]), kw_only=True)
class-attribute
instance-attribute
__attrs_post_init__()
count_input_tokens_left(text)
count_output_tokens_left(text)
BedrockClaudeTokenizer
Bases: AnthropicTokenizer
Source code in griptape/tokenizers/bedrock_claude_tokenizer.py
MODEL_PREFIXES_TO_MAX_INPUT_TOKENS = {'anthropic.claude-3': 200000, 'anthropic.claude-v2:1': 200000, 'anthropic.claude': 100000}
class-attribute
instance-attribute
MODEL_PREFIXES_TO_MAX_OUTPUT_TOKENS = {'anthropic.claude': 4096}
class-attribute
instance-attribute
BedrockCohereTokenizer
Bases: SimpleTokenizer
Source code in griptape/tokenizers/bedrock_cohere_tokenizer.py
DEFAULT_CHARACTERS_PER_TOKEN = 4
class-attribute
instance-attribute
MODEL_PREFIXES_TO_MAX_INPUT_TOKENS = {'cohere': 1024}
class-attribute
instance-attribute
MODEL_PREFIXES_TO_MAX_OUTPUT_TOKENS = {'cohere': 4096}
class-attribute
instance-attribute
characters_per_token: int = field(default=DEFAULT_CHARACTERS_PER_TOKEN, kw_only=True)
class-attribute
instance-attribute
model: str = field(kw_only=True)
class-attribute
instance-attribute
BedrockJurassicTokenizer
Bases: SimpleTokenizer
Source code in griptape/tokenizers/bedrock_jurassic_tokenizer.py
DEFAULT_CHARACTERS_PER_TOKEN = 6
class-attribute
instance-attribute
MODEL_PREFIXES_TO_MAX_INPUT_TOKENS = {'ai21': 8192}
class-attribute
instance-attribute
MODEL_PREFIXES_TO_MAX_OUTPUT_TOKENS = {'ai21.j2-mid-v1': 8191, 'ai21.j2-ultra-v1': 8191, 'ai21.j2-large-v1': 8191, 'ai21': 2048}
class-attribute
instance-attribute
characters_per_token: int = field(default=Factory(lambda self: self.DEFAULT_CHARACTERS_PER_TOKEN, takes_self=True), kw_only=True)
class-attribute
instance-attribute
model: str = field(kw_only=True)
class-attribute
instance-attribute
BedrockLlamaTokenizer
Bases: SimpleTokenizer
Source code in griptape/tokenizers/bedrock_llama_tokenizer.py
DEFAULT_CHARACTERS_PER_TOKEN = 6
class-attribute
instance-attribute
MODEL_PREFIXES_TO_MAX_INPUT_TOKENS = {'meta': 2048}
class-attribute
instance-attribute
MODEL_PREFIXES_TO_MAX_OUTPUT_TOKENS = {'meta': 2048}
class-attribute
instance-attribute
characters_per_token: int = field(default=DEFAULT_CHARACTERS_PER_TOKEN, kw_only=True)
class-attribute
instance-attribute
model: str = field(kw_only=True)
class-attribute
instance-attribute
stop_sequences: list[str] = field(factory=list, kw_only=True)
class-attribute
instance-attribute
BedrockTitanTokenizer
Bases: SimpleTokenizer
Source code in griptape/tokenizers/bedrock_titan_tokenizer.py
DEFAULT_CHARACTERS_PER_TOKEN = 6
class-attribute
instance-attribute
MODEL_PREFIXES_TO_MAX_INPUT_TOKENS = {'amazon': 4096}
class-attribute
instance-attribute
MODEL_PREFIXES_TO_MAX_OUTPUT_TOKENS = {'amazon': 8000}
class-attribute
instance-attribute
characters_per_token: int = field(default=DEFAULT_CHARACTERS_PER_TOKEN, kw_only=True)
class-attribute
instance-attribute
model: str = field(kw_only=True)
class-attribute
instance-attribute
stop_sequences: list[str] = field(default=Factory(lambda: ['User:']), kw_only=True)
class-attribute
instance-attribute
CohereTokenizer
Bases: BaseTokenizer
Source code in griptape/tokenizers/cohere_tokenizer.py
MODEL_PREFIXES_TO_MAX_INPUT_TOKENS = {'command': 4096}
class-attribute
instance-attribute
MODEL_PREFIXES_TO_MAX_OUTPUT_TOKENS = {'command': 4096}
class-attribute
instance-attribute
client: Client = field(kw_only=True)
class-attribute
instance-attribute
DummyTokenizer
Bases: BaseTokenizer
Source code in griptape/tokenizers/dummy_tokenizer.py
max_input_tokens: int = field(default=0, kw_only=True)
class-attribute
instance-attribute
max_output_tokens: int = field(default=0, kw_only=True)
class-attribute
instance-attribute
model: str = field(init=False)
class-attribute
instance-attribute
GoogleTokenizer
Bases: BaseTokenizer
Source code in griptape/tokenizers/google_tokenizer.py
MODEL_PREFIXES_TO_MAX_INPUT_TOKENS = {'gemini': 30720}
class-attribute
instance-attribute
MODEL_PREFIXES_TO_MAX_OUTPUT_TOKENS = {'gemini': 2048}
class-attribute
instance-attribute
api_key: str = field(kw_only=True, metadata={'serializable': True})
class-attribute
instance-attribute
model_client: GenerativeModel = field(default=Factory(lambda self: self._default_model_client(), takes_self=True), kw_only=True)
class-attribute
instance-attribute
count_tokens(text)
HuggingFaceTokenizer
Bases: BaseTokenizer
Source code in griptape/tokenizers/huggingface_tokenizer.py
max_input_tokens: int = field(default=Factory(lambda self: self.tokenizer.model_max_length, takes_self=True), kw_only=True)
class-attribute
instance-attribute
max_output_tokens: int = field(kw_only=True)
class-attribute
instance-attribute
model: str = field(init=False, kw_only=True)
class-attribute
instance-attribute
tokenizer: PreTrainedTokenizerBase = field(kw_only=True)
class-attribute
instance-attribute
OpenAiTokenizer
Bases: BaseTokenizer
Source code in griptape/tokenizers/openai_tokenizer.py
(source lines 9–123)
DEFAULT_ENCODING = 'cl100k_base'
class-attribute
instance-attribute
DEFAULT_MAX_OUTPUT_TOKENS = 4096
class-attribute
instance-attribute
DEFAULT_MAX_TOKENS = 2049
class-attribute
instance-attribute
DEFAULT_OPENAI_GPT_3_CHAT_MODEL = 'gpt-3.5-turbo'
class-attribute
instance-attribute
DEFAULT_OPENAI_GPT_3_COMPLETION_MODEL = 'gpt-3.5-turbo-instruct'
class-attribute
instance-attribute
DEFAULT_OPENAI_GPT_4_MODEL = 'gpt-4'
class-attribute
instance-attribute
EMBEDDING_MODELS = ['text-embedding-ada-002', 'text-embedding-ada-001', 'text-embedding-3-small', 'text-embedding-3-large']
class-attribute
instance-attribute
MODEL_PREFIXES_TO_MAX_INPUT_TOKENS = {'gpt-4-1106': 128000, 'gpt-4-32k': 32768, 'gpt-4': 8192, 'gpt-3.5-turbo-16k': 16384, 'gpt-3.5-turbo': 4096, 'gpt-35-turbo-16k': 16384, 'gpt-35-turbo': 4096, 'text-embedding-ada-002': 8191, 'text-embedding-ada-001': 2046, 'text-embedding-3-small': 8191, 'text-embedding-3-large': 8191}
class-attribute
instance-attribute
MODEL_PREFIXES_TO_MAX_OUTPUT_TOKENS = {'gpt': 4096}
class-attribute
instance-attribute
TOKEN_OFFSET = 8
class-attribute
instance-attribute
encoding: tiktoken.Encoding
property
count_tokens(text, model=None)
Handles the special case of ChatML. Implementation adopted from the official OpenAI notebook: https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb
Source code in griptape/tokenizers/openai_tokenizer.py
SimpleTokenizer
Bases: BaseTokenizer
Source code in griptape/tokenizers/simple_tokenizer.py
characters_per_token: int = field(kw_only=True)
class-attribute
instance-attribute
model: str = field(kw_only=True, init=False)
class-attribute
instance-attribute
count_tokens(text)
Source code in griptape/tokenizers/simple_tokenizer.py
VoyageAiTokenizer
Bases: BaseTokenizer