tokenizers
__all__ = ['BaseTokenizer', 'OpenAiTokenizer', 'CohereTokenizer', 'HuggingFaceTokenizer', 'AnthropicTokenizer', 'GoogleTokenizer', 'VoyageAiTokenizer', 'SimpleTokenizer', 'DummyTokenizer', 'AmazonBedrockTokenizer']
module-attribute
AmazonBedrockTokenizer
Bases: BaseTokenizer
Source code in griptape/tokenizers/amazon_bedrock_tokenizer.py
MODEL_PREFIXES_TO_MAX_INPUT_TOKENS = {'anthropic.claude-3': 200000, 'anthropic.claude-v2:1': 200000, 'anthropic.claude': 100000, 'cohere.command-r': 128000, 'cohere.embed': 512, 'cohere.command': 4000, 'cohere': 1024, 'ai21': 8192, 'meta-llama3': 8000, 'meta-llama2': 4096, 'mistral': 32000, 'amazon': 4096}
class-attribute
instance-attribute
MODEL_PREFIXES_TO_MAX_OUTPUT_TOKENS = {'anthropic.claude': 4096, 'cohere': 4096, 'ai21.j2': 8191, 'meta': 2048, 'amazon.titan-text-lite': 4096, 'amazon.titan-text-express': 8192, 'amazon.titan-text-premier': 3072, 'amazon': 4096, 'mistral': 8192}
class-attribute
instance-attribute
characters_per_token: int = field(default=4, kw_only=True)
class-attribute
instance-attribute
model: str = field(kw_only=True)
class-attribute
instance-attribute
AnthropicTokenizer
Bases: BaseTokenizer
Source code in griptape/tokenizers/anthropic_tokenizer.py
MODEL_PREFIXES_TO_MAX_INPUT_TOKENS = {'claude-3': 200000, 'claude-2.1': 200000, 'claude': 100000}
class-attribute
instance-attribute
MODEL_PREFIXES_TO_MAX_OUTPUT_TOKENS = {'claude': 4096}
class-attribute
instance-attribute
client: Anthropic = field(default=Factory(lambda: import_optional_dependency('anthropic').Anthropic()), kw_only=True)
class-attribute
instance-attribute
BaseTokenizer
Bases: ABC
Source code in griptape/tokenizers/base_tokenizer.py
DEFAULT_MAX_INPUT_TOKENS = 4096
class-attribute
instance-attribute
DEFAULT_MAX_OUTPUT_TOKENS = 1000
class-attribute
instance-attribute
MODEL_PREFIXES_TO_MAX_INPUT_TOKENS = {}
class-attribute
instance-attribute
MODEL_PREFIXES_TO_MAX_OUTPUT_TOKENS = {}
class-attribute
instance-attribute
max_input_tokens: int = field(kw_only=True, default=None)
class-attribute
instance-attribute
max_output_tokens: int = field(kw_only=True, default=None)
class-attribute
instance-attribute
model: str = field(kw_only=True)
class-attribute
instance-attribute
stop_sequences: list[str] = field(default=Factory(list), kw_only=True)
class-attribute
instance-attribute
__attrs_post_init__()
Source code in griptape/tokenizers/base_tokenizer.py
count_input_tokens_left(text)
count_output_tokens_left(text)
CohereTokenizer
Bases: BaseTokenizer
Source code in griptape/tokenizers/cohere_tokenizer.py
MODEL_PREFIXES_TO_MAX_INPUT_TOKENS = {'command-r': 128000, 'command': 4096, 'embed': 512}
class-attribute
instance-attribute
MODEL_PREFIXES_TO_MAX_OUTPUT_TOKENS = {'command': 4096, 'embed': 512}
class-attribute
instance-attribute
client: Client = field(kw_only=True)
class-attribute
instance-attribute
DummyTokenizer
Bases: BaseTokenizer
Source code in griptape/tokenizers/dummy_tokenizer.py
max_input_tokens: int = field(init=False, default=0, kw_only=True)
class-attribute
instance-attribute
max_output_tokens: int = field(init=False, default=0, kw_only=True)
class-attribute
instance-attribute
model: Optional[str] = field(default=None, kw_only=True)
class-attribute
instance-attribute
GoogleTokenizer
Bases: BaseTokenizer
Source code in griptape/tokenizers/google_tokenizer.py
MODEL_PREFIXES_TO_MAX_INPUT_TOKENS = {'gemini': 30720}
class-attribute
instance-attribute
MODEL_PREFIXES_TO_MAX_OUTPUT_TOKENS = {'gemini': 2048}
class-attribute
instance-attribute
api_key: str = field(kw_only=True, metadata={'serializable': True})
class-attribute
instance-attribute
model_client: GenerativeModel = field(default=Factory(lambda self: self._default_model_client(), takes_self=True), kw_only=True)
class-attribute
instance-attribute
HuggingFaceTokenizer
Bases: BaseTokenizer
Source code in griptape/tokenizers/huggingface_tokenizer.py
max_input_tokens: int = field(default=Factory(lambda self: self.tokenizer.model_max_length, takes_self=True), kw_only=True)
class-attribute
instance-attribute
max_output_tokens: int = field(default=4096, kw_only=True)
class-attribute
instance-attribute
tokenizer: PreTrainedTokenizerBase = field(default=Factory(lambda self: import_optional_dependency('transformers').AutoTokenizer.from_pretrained(self.model), takes_self=True), kw_only=True)
class-attribute
instance-attribute
OpenAiTokenizer
Bases: BaseTokenizer
Source code in griptape/tokenizers/openai_tokenizer.py
(Source listing, lines 12–142 of griptape/tokenizers/openai_tokenizer.py — code body not included in this extract.)
DEFAULT_ENCODING = 'cl100k_base'
class-attribute
instance-attribute
DEFAULT_MAX_OUTPUT_TOKENS = 4096
class-attribute
instance-attribute
DEFAULT_MAX_TOKENS = 2049
class-attribute
instance-attribute
DEFAULT_OPENAI_GPT_3_CHAT_MODEL = 'gpt-3.5-turbo'
class-attribute
instance-attribute
DEFAULT_OPENAI_GPT_3_COMPLETION_MODEL = 'gpt-3.5-turbo-instruct'
class-attribute
instance-attribute
DEFAULT_OPENAI_GPT_4_MODEL = 'gpt-4o'
class-attribute
instance-attribute
EMBEDDING_MODELS = ['text-embedding-ada-002', 'text-embedding-ada-001', 'text-embedding-3-small', 'text-embedding-3-large']
class-attribute
instance-attribute
MODEL_PREFIXES_TO_MAX_INPUT_TOKENS = {'gpt-4o': 128000, 'gpt-4-1106': 128000, 'gpt-4-32k': 32768, 'gpt-4': 8192, 'gpt-3.5-turbo-16k': 16384, 'gpt-3.5-turbo': 4096, 'gpt-35-turbo-16k': 16384, 'gpt-35-turbo': 4096, 'text-embedding-ada-002': 8191, 'text-embedding-ada-001': 2046, 'text-embedding-3-small': 8191, 'text-embedding-3-large': 8191}
class-attribute
instance-attribute
MODEL_PREFIXES_TO_MAX_OUTPUT_TOKENS = {'gpt': 4096}
class-attribute
instance-attribute
TOKEN_OFFSET = 8
class-attribute
instance-attribute
encoding: tiktoken.Encoding
property
max_input_tokens: int = field(kw_only=True, default=Factory(lambda self: self._default_max_input_tokens(), takes_self=True))
class-attribute
instance-attribute
max_output_tokens: int = field(kw_only=True, default=Factory(lambda self: self._default_max_output_tokens(), takes_self=True))
class-attribute
instance-attribute
count_tokens(text, model=None)
Handles the special case of ChatML.
Implementation adapted from the official OpenAI notebook: https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb.
Source code in griptape/tokenizers/openai_tokenizer.py
SimpleTokenizer
Bases: BaseTokenizer
Source code in griptape/tokenizers/simple_tokenizer.py
characters_per_token: int = field(kw_only=True)
class-attribute
instance-attribute
model: str = field(init=False, kw_only=True)
class-attribute
instance-attribute
VoyageAiTokenizer
Bases: BaseTokenizer