From 171ed11e3600231e0067ad8423495b56f588e3b8 Mon Sep 17 00:00:00 2001 From: Harrison Chase Date: Sat, 24 Jun 2023 16:55:43 -0700 Subject: [PATCH 1/3] tiktoken override --- langchain/chat_models/openai.py | 1 + langchain/embeddings/openai.py | 7 +++++-- langchain/llms/openai.py | 4 +++- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/langchain/chat_models/openai.py b/langchain/chat_models/openai.py index 83e43ae0ff11b..56cdfa091429e 100644 --- a/langchain/chat_models/openai.py +++ b/langchain/chat_models/openai.py @@ -184,6 +184,7 @@ def lc_serializable(self) -> bool: """Number of chat completions to generate for each prompt.""" max_tokens: Optional[int] = None """Maximum number of tokens to generate.""" + tiktoken_model_name: Optional[str] = None class Config: """Configuration for this pydantic object.""" diff --git a/langchain/embeddings/openai.py b/langchain/embeddings/openai.py index 9c23323075242..4defeaa85f529 100644 --- a/langchain/embeddings/openai.py +++ b/langchain/embeddings/openai.py @@ -170,6 +170,7 @@ class OpenAIEmbeddings(BaseModel, Embeddings): request_timeout: Optional[Union[float, Tuple[float, float]]] = None """Timeout in seconds for the OpenAPI request.""" headers: Any = None + tiktoken_model_name: Optional[str] = None class Config: """Configuration for this pydantic object.""" @@ -265,7 +266,8 @@ def _get_len_safe_embeddings( tokens = [] indices = [] - encoding = tiktoken.model.encoding_for_model(self.model) + model_name = self.tiktoken_model_name or self.model + encoding = tiktoken.model.encoding_for_model(model_name) for i, text in enumerate(texts): if self.model.endswith("001"): # See: https://github.com/openai/openai-python/issues/418#issuecomment-1525939500 @@ -329,7 +331,8 @@ async def _aget_len_safe_embeddings( tokens = [] indices = [] - encoding = tiktoken.model.encoding_for_model(self.model) + model_name = self.tiktoken_model_name or self.model + encoding = tiktoken.model.encoding_for_model(model_name) for i, text in 
enumerate(texts): if self.model.endswith("001"): # See: https://github.com/openai/openai-python/issues/418#issuecomment-1525939500 diff --git a/langchain/llms/openai.py b/langchain/llms/openai.py index 8fd350cbff574..3685c920d6c44 100644 --- a/langchain/llms/openai.py +++ b/langchain/llms/openai.py @@ -171,6 +171,7 @@ def lc_serializable(self) -> bool: """Set of special tokens that are allowed。""" disallowed_special: Union[Literal["all"], Collection[str]] = "all" """Set of special tokens that are not allowed。""" + tiktoken_model_name: Optional[str] = None def __new__(cls, **data: Any) -> Union[OpenAIChat, BaseOpenAI]: # type: ignore """Initialize the OpenAI object.""" @@ -491,7 +492,8 @@ def get_token_ids(self, text: str) -> List[int]: "Please install it with `pip install tiktoken`." ) - enc = tiktoken.encoding_for_model(self.model_name) + model_name = self.tiktoken_model_name or self.model_name + enc = tiktoken.model.encoding_for_model(model_name) return enc.encode( text, From bb9680856b2d7a4e68e843871b18dbf79bf50452 Mon Sep 17 00:00:00 2001 From: Harrison Chase Date: Sun, 25 Jun 2023 12:03:23 -0700 Subject: [PATCH 2/3] cr --- langchain/chat_models/openai.py | 30 +++++++++++++++++++++--------- langchain/embeddings/openai.py | 16 +++++++++++++++- langchain/llms/openai.py | 16 +++++++++++++++- 3 files changed, 51 insertions(+), 11 deletions(-) diff --git a/langchain/chat_models/openai.py b/langchain/chat_models/openai.py index 56cdfa091429e..f1725b83313c3 100644 --- a/langchain/chat_models/openai.py +++ b/langchain/chat_models/openai.py @@ -185,6 +185,15 @@ def lc_serializable(self) -> bool: max_tokens: Optional[int] = None """Maximum number of tokens to generate.""" tiktoken_model_name: Optional[str] = None + """The model name to pass to tiktoken when using this class. + Tiktoken is used to count the number of tokens in documents to constrain + them to be under a certain limit. By default, when set to None, this will + be the same as the chat model name. 
However, there are some cases + where you may want to use this chat model class with a model name not + supported by tiktoken. This can include when using Azure OpenAI deployments or + when using one of the many model providers that expose an OpenAI-like + API but with different models. In those cases, in order to avoid erroring + when tiktoken is called, you can specify a model name to use here.""" class Config: """Configuration for this pydantic object.""" @@ -449,15 +458,18 @@ def _llm_type(self) -> str: def _get_encoding_model(self) -> Tuple[str, tiktoken.Encoding]: tiktoken_ = _import_tiktoken() - model = self.model_name - if model == "gpt-3.5-turbo": - # gpt-3.5-turbo may change over time. - # Returning num tokens assuming gpt-3.5-turbo-0301. - model = "gpt-3.5-turbo-0301" - elif model == "gpt-4": - # gpt-4 may change over time. - # Returning num tokens assuming gpt-4-0314. - model = "gpt-4-0314" + if self.tiktoken_model_name is not None: + model = self.tiktoken_model_name + else: + model = self.model_name + if model == "gpt-3.5-turbo": + # gpt-3.5-turbo may change over time. + # Returning num tokens assuming gpt-3.5-turbo-0301. + model = "gpt-3.5-turbo-0301" + elif model == "gpt-4": + # gpt-4 may change over time. + # Returning num tokens assuming gpt-4-0314. + model = "gpt-4-0314" # Returns the number of tokens used by a list of messages. try: encoding = tiktoken_.encoding_for_model(model) diff --git a/langchain/embeddings/openai.py b/langchain/embeddings/openai.py index 4defeaa85f529..bcc19fed05f96 100644 --- a/langchain/embeddings/openai.py +++ b/langchain/embeddings/openai.py @@ -171,6 +171,15 @@ class OpenAIEmbeddings(BaseModel, Embeddings): """Timeout in seconds for the OpenAPI request.""" headers: Any = None tiktoken_model_name: Optional[str] = None + """The model name to pass to tiktoken when using this class. + Tiktoken is used to count the number of tokens in documents to constrain + them to be under a certain limit. 
By default, when set to None, this will + be the same as the embedding model name. However, there are some cases + where you may want to use this Embedding class with a model name not + supported by tiktoken. This can include when using Azure embeddings or + when using one of the many model providers that expose an OpenAI-like + API but with different models. In those cases, in order to avoid erroring + when tiktoken is called, you can specify a model name to use here.""" class Config: """Configuration for this pydantic object.""" @@ -267,7 +276,12 @@ def _get_len_safe_embeddings( tokens = [] indices = [] model_name = self.tiktoken_model_name or self.model - encoding = tiktoken.model.encoding_for_model(model_name) + try: + encoding = tiktoken.encoding_for_model(model_name) + except KeyError: + logger.warning("Warning: model not found. Using cl100k_base encoding.") + model = "cl100k_base" + encoding = tiktoken.get_encoding(model) for i, text in enumerate(texts): if self.model.endswith("001"): # See: https://github.com/openai/openai-python/issues/418#issuecomment-1525939500 diff --git a/langchain/llms/openai.py b/langchain/llms/openai.py index 3685c920d6c44..19bce8bddcf79 100644 --- a/langchain/llms/openai.py +++ b/langchain/llms/openai.py @@ -172,6 +172,15 @@ def lc_serializable(self) -> bool: disallowed_special: Union[Literal["all"], Collection[str]] = "all" """Set of special tokens that are not allowed。""" tiktoken_model_name: Optional[str] = None + """The model name to pass to tiktoken when using this class. + Tiktoken is used to count the number of tokens in documents to constrain + them to be under a certain limit. By default, when set to None, this will + be the same as the completion model name. However, there are some cases + where you may want to use this LLM class with a model name not + supported by tiktoken. 
This can include when using Azure OpenAI deployments or + when using one of the many model providers that expose an OpenAI-like + API but with different models. In those cases, in order to avoid erroring + when tiktoken is called, you can specify a model name to use here.""" def __new__(cls, **data: Any) -> Union[OpenAIChat, BaseOpenAI]: # type: ignore """Initialize the OpenAI object.""" @@ -493,7 +502,12 @@ def get_token_ids(self, text: str) -> List[int]: ) model_name = self.tiktoken_model_name or self.model_name - enc = tiktoken.model.encoding_for_model(model_name) + try: + enc = tiktoken.encoding_for_model(model_name) + except KeyError: + logger.warning("Warning: model not found. Using cl100k_base encoding.") + model = "cl100k_base" + enc = tiktoken.get_encoding(model) return enc.encode( text, From fb701eb584e7e4820cdf9e5dbb97b738c81c83e0 Mon Sep 17 00:00:00 2001 From: Harrison Chase Date: Sun, 25 Jun 2023 12:03:54 -0700 Subject: [PATCH 3/3] cr --- langchain/embeddings/openai.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/langchain/embeddings/openai.py b/langchain/embeddings/openai.py index bcc19fed05f96..f3cb66547b7ed 100644 --- a/langchain/embeddings/openai.py +++ b/langchain/embeddings/openai.py @@ -346,7 +346,12 @@ async def _aget_len_safe_embeddings( tokens = [] indices = [] model_name = self.tiktoken_model_name or self.model - encoding = tiktoken.model.encoding_for_model(model_name) + try: + encoding = tiktoken.encoding_for_model(model_name) + except KeyError: + logger.warning("Warning: model not found. Using cl100k_base encoding.") + model = "cl100k_base" + encoding = tiktoken.get_encoding(model) for i, text in enumerate(texts): if self.model.endswith("001"): # See: https://github.com/openai/openai-python/issues/418#issuecomment-1525939500