tokenizers
__all__ = ['AmazonBedrockTokenizer', 'AnthropicTokenizer', 'BaseTokenizer', 'CohereTokenizer', 'DummyTokenizer', 'GoogleTokenizer', 'GrokTokenizer', 'HuggingFaceTokenizer', 'OpenAiTokenizer', 'SimpleTokenizer', 'VoyageAiTokenizer']
module-attribute
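All of these names are importable directly from griptape.tokenizers. A minimal sketch using two of them; the model string is illustrative and tiktoken is assumed to be installed for OpenAiTokenizer:

```python
from griptape.tokenizers import OpenAiTokenizer, SimpleTokenizer

# Both expose the BaseTokenizer interface documented below.
openai_tokenizer = OpenAiTokenizer(model="gpt-4o")
char_tokenizer = SimpleTokenizer(characters_per_token=4)

print(openai_tokenizer.count_tokens("Hello, world"))  # exact count via the model's encoding
print(char_tokenizer.count_tokens("Hello, world"))    # rough character-based estimate
```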
AmazonBedrockTokenizer
Bases: BaseTokenizer
Source code in griptape/tokenizers/amazon_bedrock_tokenizer.py
MODEL_PREFIXES_TO_MAX_INPUT_TOKENS = {'anthropic.claude-3': 200000, 'anthropic.claude-v2:1': 200000, 'anthropic.claude': 100000, 'cohere.command-r': 128000, 'cohere.embed': 512, 'cohere.command': 4000, 'cohere': 1024, 'ai21': 8192, 'meta.llama3-8b-instruct': 8000, 'meta.llama3-70b-instruct': 8000, 'meta.llama3-2-1b-instruct': 131000, 'meta.llama3-2-3b-instruct': 131000, 'meta.llama3': 128000, 'mistral.large-2407': 128000, 'mistral.mistral': 32000, 'mistral.mixtral': 32000, 'amazon.nova-micro-v1': 128000, 'amazon.nova': 300000, 'amazon.titan-embed-image': 128000, 'amazon.titan-embed-text': 8000, 'amazon.titan-text-express-v1': 8000, 'amazon.titan-text-lite-v1': 4000, 'amazon.titan-text-premier-v1': 32000}
class-attribute
instance-attribute
MODEL_PREFIXES_TO_MAX_OUTPUT_TOKENS = {'anthropic.claude-3-7': 8192, 'anthropic.claude-3-5': 8192, 'anthropic.claude': 4096, 'cohere': 4096, 'ai21.j2': 8191, 'meta': 2048, 'amazon.titan-text-lite': 4096, 'amazon.titan-text-express': 8192, 'amazon.titan-text-premier': 3072, 'amazon.nova': 5000, 'mistral.mistral': 8192, 'mistral.mixtral': 4096}
class-attribute
instance-attribute
characters_per_token = field(default=4, kw_only=True)
class-attribute
instance-attribute
model = field(kw_only=True)
class-attribute
instance-attribute
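No count_tokens override is listed for this class, so it presumably estimates counts from characters_per_token (4 characters per token by default), while the prefix tables above supply per-model input and output limits. A usage sketch; the Bedrock model id is illustrative:

```python
from griptape.tokenizers import AmazonBedrockTokenizer

# model is required; its prefix is matched against the tables above.
tokenizer = AmazonBedrockTokenizer(model="anthropic.claude-3-sonnet-20240229-v1:0")

prompt = "Summarize the quarterly report in three bullet points."
print(tokenizer.count_tokens(prompt))             # estimated from characters_per_token
print(tokenizer.count_input_tokens_left(prompt))  # remaining input budget for this model
```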
AnthropicTokenizer
Bases: BaseTokenizer
Source code in griptape/tokenizers/anthropic_tokenizer.py
MODEL_PREFIXES_TO_MAX_INPUT_TOKENS = {'claude-3': 200000, 'claude-2.1': 200000, 'claude': 100000}
class-attribute
instance-attribute
MODEL_PREFIXES_TO_MAX_OUTPUT_TOKENS = {'claude': 4096}
class-attribute
instance-attribute
client = field(default=Factory(lambda: import_optional_dependency('anthropic').Anthropic()), kw_only=True)
class-attribute
instance-attribute
count_tokens(text)
Source code in griptape/tokenizers/anthropic_tokenizer.py
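A usage sketch; the client defaults to anthropic.Anthropic(), so the anthropic package and an Anthropic API key are assumed, and the model string is illustrative:

```python
from griptape.tokenizers import AnthropicTokenizer

tokenizer = AnthropicTokenizer(model="claude-3-opus-20240229")

text = "Explain the difference between tokens and characters."
print(tokenizer.count_tokens(text))               # counted via the Anthropic client
print(tokenizer.count_output_tokens_left(text))   # presumably the 4096 cap minus that count
```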
BaseTokenizer
Bases: ABC, SerializableMixin
Source code in griptape/tokenizers/base_tokenizer.py
DEFAULT_MAX_INPUT_TOKENS = 4096
class-attribute
instance-attribute
DEFAULT_MAX_OUTPUT_TOKENS = 1000
class-attribute
instance-attribute
MODEL_PREFIXES_TO_MAX_INPUT_TOKENS = {}
class-attribute
instance-attribute
MODEL_PREFIXES_TO_MAX_OUTPUT_TOKENS = {}
class-attribute
instance-attribute
_max_input_tokens = field(kw_only=True, default=None, alias='max_input_tokens', metadata={'serializable': True})
class-attribute
instance-attribute
_max_output_tokens = field(kw_only=True, default=None, alias='max_output_tokens', metadata={'serializable': True})
class-attribute
instance-attribute
model = field(kw_only=True, metadata={'serializable': True})
class-attribute
instance-attribute
stop_sequences = field(default=Factory(list), kw_only=True, metadata={'serializable': True})
class-attribute
instance-attribute
__attrs_post_init__()
Source code in griptape/tokenizers/base_tokenizer.py
_default_max_input_tokens()
Source code in griptape/tokenizers/base_tokenizer.py
_default_max_output_tokens()
Source code in griptape/tokenizers/base_tokenizer.py
count_input_tokens_left(text)
count_output_tokens_left(text)
count_tokens(text)
abstractmethod
max_input_tokens()
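count_tokens is the only abstract member, so a custom tokenizer only needs to implement it; the count_*_tokens_left helpers presumably subtract that count from the configured limits. A minimal sketch, assuming BaseTokenizer can be subclassed as a regular attrs class:

```python
from attrs import define

from griptape.tokenizers import BaseTokenizer


@define
class WordTokenizer(BaseTokenizer):
    """Toy tokenizer that treats each whitespace-separated word as one token."""

    def count_tokens(self, text: str) -> int:
        return len(text.split())


tokenizer = WordTokenizer(model="toy", max_input_tokens=4096, max_output_tokens=1000)
print(tokenizer.count_tokens("one two three"))             # 3
print(tokenizer.count_input_tokens_left("one two three"))  # presumably 4096 - 3
```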
CohereTokenizer
Bases: BaseTokenizer
Source code in griptape/tokenizers/cohere_tokenizer.py
MODEL_PREFIXES_TO_MAX_INPUT_TOKENS = {'command-r': 128000, 'command': 4096, 'embed': 512}
class-attribute
instance-attribute
MODEL_PREFIXES_TO_MAX_OUTPUT_TOKENS = {'command': 4096, 'embed': 512}
class-attribute
instance-attribute
client = field(kw_only=True)
class-attribute
instance-attribute
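client has no default, so a Cohere SDK client must be supplied. A sketch assuming cohere.Client and a COHERE_API_KEY environment variable:

```python
import os

import cohere

from griptape.tokenizers import CohereTokenizer

tokenizer = CohereTokenizer(
    model="command-r",
    client=cohere.Client(api_key=os.environ["COHERE_API_KEY"]),
)
print(tokenizer.count_tokens("How many tokens is this sentence?"))
```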
DummyTokenizer
Bases: BaseTokenizer
Source code in griptape/tokenizers/dummy_tokenizer.py
_max_input_tokens = field(init=False, default=0, kw_only=True, alias='max_input_tokens')
class-attribute
instance-attribute
_max_output_tokens = field(init=False, default=0, kw_only=True, alias='max_output_tokens')
class-attribute
instance-attribute
model = field(default=None, kw_only=True)
class-attribute
instance-attribute
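With model defaulting to None and both limits pinned to 0, DummyTokenizer appears to be a placeholder for configurations that need a tokenizer slot filled but never count tokens. A minimal instantiation sketch:

```python
from griptape.tokenizers import DummyTokenizer

# No model, credentials, or limits required.
tokenizer = DummyTokenizer()
```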
GoogleTokenizer
Bases: BaseTokenizer
Source code in griptape/tokenizers/google_tokenizer.py
MODEL_PREFIXES_TO_MAX_INPUT_TOKENS = {'gemini-1.5-pro': 2097152, 'gemini': 1048576}
class-attribute
instance-attribute
MODEL_PREFIXES_TO_MAX_OUTPUT_TOKENS = {'gemini': 8192}
class-attribute
instance-attribute
_client = field(default=None, kw_only=True, alias='client', metadata={'serializable': False})
class-attribute
instance-attribute
api_key = field(kw_only=True, metadata={'serializable': True})
class-attribute
instance-attribute
client()
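api_key is required and the underlying client is created lazily. A sketch with an illustrative model name, assuming Google's client library is installed and a GOOGLE_API_KEY environment variable is set:

```python
import os

from griptape.tokenizers import GoogleTokenizer

tokenizer = GoogleTokenizer(
    model="gemini-1.5-pro",
    api_key=os.environ["GOOGLE_API_KEY"],
)

prompt = "Count the tokens in this prompt."
print(tokenizer.count_tokens(prompt))
print(tokenizer.count_input_tokens_left(prompt))
```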
GrokTokenizer
Bases: BaseTokenizer
Source code in griptape/tokenizers/grok_tokenizer.py
MODEL_PREFIXES_TO_MAX_INPUT_TOKENS = {'grok-2-vision': 32768, 'grok-2': 131072, 'grok-vision-beta': 8192, 'grok-beta': 131072}
class-attribute
instance-attribute
MODEL_PREFIXES_TO_MAX_OUTPUT_TOKENS = {'grok': 4096}
class-attribute
instance-attribute
api_key = field(kw_only=True, default=None)
class-attribute
instance-attribute
base_url = field(default='https://api.x.ai', kw_only=True, metadata={'serializable': True})
class-attribute
instance-attribute
headers = field(default=Factory(lambda self: {'Authorization': f'Bearer {self.api_key}'}, takes_self=True), kw_only=True)
class-attribute
instance-attribute
count_tokens(text)
Source code in griptape/tokenizers/grok_tokenizer.py
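A usage sketch; count_tokens presumably calls the xAI API at base_url using the bearer header above, so a real API key is assumed, and the model string is illustrative:

```python
import os

from griptape.tokenizers import GrokTokenizer

tokenizer = GrokTokenizer(
    model="grok-2",
    api_key=os.environ["XAI_API_KEY"],
)

prompt = "How long is this prompt in Grok tokens?"
print(tokenizer.count_tokens(prompt))
print(tokenizer.count_input_tokens_left(prompt))  # against the 131072 'grok-2' limit above
```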
HuggingFaceTokenizer
Bases: BaseTokenizer
Source code in griptape/tokenizers/huggingface_tokenizer.py
_max_input_tokens = field(default=Factory(lambda self: self.tokenizer.model_max_length, takes_self=True), kw_only=True, alias='max_input_tokens')
class-attribute
instance-attribute
_max_output_tokens = field(default=4096, kw_only=True, alias='max_output_tokens')
class-attribute
instance-attribute
tokenizer = field(default=Factory(lambda self: import_optional_dependency('transformers').AutoTokenizer.from_pretrained(self.model), takes_self=True), kw_only=True)
class-attribute
instance-attribute
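The transformers tokenizer is built from the model name by default, and max_input_tokens falls back to that tokenizer's model_max_length. A sketch with an illustrative Hugging Face model id; the transformers package is assumed:

```python
from griptape.tokenizers import HuggingFaceTokenizer

# Downloads the tokenizer files for the given model id on first use.
tokenizer = HuggingFaceTokenizer(
    model="HuggingFaceH4/zephyr-7b-beta",
    max_output_tokens=512,
)
print(tokenizer.count_tokens("Tokenize this sentence with the model's own vocabulary."))
```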
OpenAiTokenizer
Bases: BaseTokenizer
Source code in griptape/tokenizers/openai_tokenizer.py
DEFAULT_ENCODING = 'cl100k_base'
class-attribute
instance-attribute
DEFAULT_MAX_OUTPUT_TOKENS = 4096
class-attribute
instance-attribute
DEFAULT_MAX_TOKENS = 2049
class-attribute
instance-attribute
DEFAULT_OPENAI_GPT_3_CHAT_MODEL = 'gpt-3.5-turbo'
class-attribute
instance-attribute
DEFAULT_OPENAI_GPT_3_COMPLETION_MODEL = 'gpt-3.5-turbo-instruct'
class-attribute
instance-attribute
DEFAULT_OPENAI_GPT_4_MODEL = 'gpt-4o'
class-attribute
instance-attribute
EMBEDDING_MODELS = ['text-embedding-ada-002', 'text-embedding-ada-001', 'text-embedding-3-small', 'text-embedding-3-large']
class-attribute
instance-attribute
MODEL_PREFIXES_TO_MAX_INPUT_TOKENS = {'gpt-4.1': 1000000, 'gpt-4o': 128000, 'gpt-4-1106': 128000, 'gpt-4-32k': 32768, 'gpt-4': 8192, 'gpt-3.5-turbo-16k': 16384, 'gpt-3.5-turbo': 4096, 'gpt-35-turbo-16k': 16384, 'gpt-35-turbo': 4096, 'text-embedding-ada-002': 8191, 'text-embedding-ada-001': 2046, 'text-embedding-3-small': 8191, 'text-embedding-3-large': 8191}
class-attribute
instance-attribute
MODEL_PREFIXES_TO_MAX_OUTPUT_TOKENS = {'gpt': 4096}
class-attribute
instance-attribute
TOKEN_OFFSET = 8
class-attribute
instance-attribute
_max_input_tokens = field(kw_only=True, default=Factory(lambda self: self._default_max_input_tokens(), takes_self=True), alias='max_input_tokens')
class-attribute
instance-attribute
_max_output_tokens = field(kw_only=True, default=Factory(lambda self: self._default_max_output_tokens(), takes_self=True), alias='max_output_tokens')
class-attribute
instance-attribute
encoding
property
_default_max_input_tokens()
Source code in griptape/tokenizers/openai_tokenizer.py
_default_max_output_tokens()
Source code in griptape/tokenizers/openai_tokenizer.py
count_tokens(text, model=None)
Handles the special case of ChatML.
Implementation adapted from the official OpenAI notebook: https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb.
Source code in griptape/tokenizers/openai_tokenizer.py
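A sketch of both call styles, assuming, per the ChatML note above, that count_tokens also accepts a list of chat messages and includes the per-message formatting overhead described in the OpenAI cookbook; tiktoken is assumed to be installed:

```python
from griptape.tokenizers import OpenAiTokenizer

tokenizer = OpenAiTokenizer(model="gpt-4o")

# Plain string: encoded directly with the model's encoding.
print(tokenizer.count_tokens("Hello there!"))

# ChatML-style messages: per-message formatting tokens are included in the count.
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello there!"},
]
print(tokenizer.count_tokens(messages))
```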
SimpleTokenizer
Bases: BaseTokenizer
Source code in griptape/tokenizers/simple_tokenizer.py
characters_per_token = field(kw_only=True)
class-attribute
instance-attribute
model = field(init=False, default=None, kw_only=True)
class-attribute
instance-attribute
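SimpleTokenizer has no model and no external dependencies; it presumably estimates counts purely from characters_per_token. A minimal sketch:

```python
from griptape.tokenizers import SimpleTokenizer

tokenizer = SimpleTokenizer(
    characters_per_token=4,
    max_input_tokens=1024,
    max_output_tokens=1024,
)

text = "A rough character-based token estimate."
print(tokenizer.count_tokens(text))             # roughly len(text) / 4
print(tokenizer.count_input_tokens_left(text))
```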
VoyageAiTokenizer
Bases: BaseTokenizer