From 171ed11e3600231e0067ad8423495b56f588e3b8 Mon Sep 17 00:00:00 2001
From: Harrison Chase <hw.chase.17@gmail.com>
Date: Sat, 24 Jun 2023 16:55:43 -0700
Subject: [PATCH 1/3] tiktoken override

---
 langchain/chat_models/openai.py | 1 +
 langchain/embeddings/openai.py  | 7 +++++--
 langchain/llms/openai.py        | 4 +++-
 3 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/langchain/chat_models/openai.py b/langchain/chat_models/openai.py
index 83e43ae0ff11b..56cdfa091429e 100644
--- a/langchain/chat_models/openai.py
+++ b/langchain/chat_models/openai.py
@@ -184,6 +184,7 @@ def lc_serializable(self) -> bool:
     """Number of chat completions to generate for each prompt."""
     max_tokens: Optional[int] = None
     """Maximum number of tokens to generate."""
+    tiktoken_model_name: Optional[str] = None
 
     class Config:
         """Configuration for this pydantic object."""
diff --git a/langchain/embeddings/openai.py b/langchain/embeddings/openai.py
index 9c23323075242..4defeaa85f529 100644
--- a/langchain/embeddings/openai.py
+++ b/langchain/embeddings/openai.py
@@ -170,6 +170,7 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
     request_timeout: Optional[Union[float, Tuple[float, float]]] = None
     """Timeout in seconds for the OpenAPI request."""
     headers: Any = None
+    tiktoken_model_name: Optional[str] = None
 
     class Config:
         """Configuration for this pydantic object."""
@@ -265,7 +266,8 @@ def _get_len_safe_embeddings(
 
         tokens = []
         indices = []
-        encoding = tiktoken.model.encoding_for_model(self.model)
+        model_name = self.tiktoken_model_name or self.model
+        encoding = tiktoken.model.encoding_for_model(model_name)
         for i, text in enumerate(texts):
             if self.model.endswith("001"):
                 # See: https://github.com/openai/openai-python/issues/418#issuecomment-1525939500
@@ -329,7 +331,8 @@ async def _aget_len_safe_embeddings(
 
         tokens = []
         indices = []
-        encoding = tiktoken.model.encoding_for_model(self.model)
+        model_name = self.tiktoken_model_name or self.model
+        encoding = tiktoken.model.encoding_for_model(model_name)
         for i, text in enumerate(texts):
             if self.model.endswith("001"):
                 # See: https://github.com/openai/openai-python/issues/418#issuecomment-1525939500
diff --git a/langchain/llms/openai.py b/langchain/llms/openai.py
index 8fd350cbff574..3685c920d6c44 100644
--- a/langchain/llms/openai.py
+++ b/langchain/llms/openai.py
@@ -171,6 +171,7 @@ def lc_serializable(self) -> bool:
     """Set of special tokens that are allowed。"""
     disallowed_special: Union[Literal["all"], Collection[str]] = "all"
     """Set of special tokens that are not allowed。"""
+    tiktoken_model_name: Optional[str] = None
 
     def __new__(cls, **data: Any) -> Union[OpenAIChat, BaseOpenAI]:  # type: ignore
         """Initialize the OpenAI object."""
@@ -491,7 +492,8 @@ def get_token_ids(self, text: str) -> List[int]:
                 "Please install it with `pip install tiktoken`."
             )
 
-        enc = tiktoken.encoding_for_model(self.model_name)
+        model_name = self.tiktoken_model_name or self.model_name
+        enc = tiktoken.model.encoding_for_model(model_name)
 
         return enc.encode(
             text,

From bb9680856b2d7a4e68e843871b18dbf79bf50452 Mon Sep 17 00:00:00 2001
From: Harrison Chase <hw.chase.17@gmail.com>
Date: Sun, 25 Jun 2023 12:03:23 -0700
Subject: [PATCH 2/3] cr

---
 langchain/chat_models/openai.py | 30 +++++++++++++++++++++---------
 langchain/embeddings/openai.py  | 16 +++++++++++++++-
 langchain/llms/openai.py        | 16 +++++++++++++++-
 3 files changed, 51 insertions(+), 11 deletions(-)

diff --git a/langchain/chat_models/openai.py b/langchain/chat_models/openai.py
index 56cdfa091429e..f1725b83313c3 100644
--- a/langchain/chat_models/openai.py
+++ b/langchain/chat_models/openai.py
@@ -185,6 +185,15 @@ def lc_serializable(self) -> bool:
     max_tokens: Optional[int] = None
     """Maximum number of tokens to generate."""
     tiktoken_model_name: Optional[str] = None
+    """The model name to pass to tiktoken when using this class.
+    Tiktoken is used to count the number of tokens used by a list of
+    messages. By default, when set to None, the name passed to tiktoken
+    is derived from the chat model name. However, there are some cases
+    where you may want to use this chat model class with a model name not
+    supported by tiktoken. This can include when using Azure OpenAI or
+    when using one of the many model providers that expose an OpenAI-like
+    API but with different models. In those cases, in order to avoid erroring
+    when tiktoken is called, you can specify a model name to use here."""
 
     class Config:
         """Configuration for this pydantic object."""
@@ -449,15 +458,18 @@ def _llm_type(self) -> str:
 
     def _get_encoding_model(self) -> Tuple[str, tiktoken.Encoding]:
         tiktoken_ = _import_tiktoken()
-        model = self.model_name
-        if model == "gpt-3.5-turbo":
-            # gpt-3.5-turbo may change over time.
-            # Returning num tokens assuming gpt-3.5-turbo-0301.
-            model = "gpt-3.5-turbo-0301"
-        elif model == "gpt-4":
-            # gpt-4 may change over time.
-            # Returning num tokens assuming gpt-4-0314.
-            model = "gpt-4-0314"
+        if self.tiktoken_model_name is not None:
+            model = self.tiktoken_model_name
+        else:
+            model = self.model_name
+            if model == "gpt-3.5-turbo":
+                # gpt-3.5-turbo may change over time.
+                # Returning num tokens assuming gpt-3.5-turbo-0301.
+                model = "gpt-3.5-turbo-0301"
+            elif model == "gpt-4":
+                # gpt-4 may change over time.
+                # Returning num tokens assuming gpt-4-0314.
+                model = "gpt-4-0314"
         # Returns the number of tokens used by a list of messages.
         try:
             encoding = tiktoken_.encoding_for_model(model)
diff --git a/langchain/embeddings/openai.py b/langchain/embeddings/openai.py
index 4defeaa85f529..bcc19fed05f96 100644
--- a/langchain/embeddings/openai.py
+++ b/langchain/embeddings/openai.py
@@ -171,6 +171,15 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
     """Timeout in seconds for the OpenAPI request."""
     headers: Any = None
     tiktoken_model_name: Optional[str] = None
+    """The model name to pass to tiktoken when using this class. 
+    Tiktoken is used to count the number of tokens in documents to constrain 
+    them to be under a certain limit. By default, when set to None, this will 
+    be the same as the embedding model name. However, there are some cases 
+    where you may want to use this Embedding class with a model name not 
+    supported by tiktoken. This can include when using Azure embeddings or 
+    when using one of the many model providers that expose an OpenAI-like 
+    API but with different models. In those cases, in order to avoid erroring 
+    when tiktoken is called, you can specify a model name to use here."""
 
     class Config:
         """Configuration for this pydantic object."""
@@ -267,7 +276,12 @@ def _get_len_safe_embeddings(
         tokens = []
         indices = []
         model_name = self.tiktoken_model_name or self.model
-        encoding = tiktoken.model.encoding_for_model(model_name)
+        try:
+            encoding = tiktoken.encoding_for_model(model_name)
+        except KeyError:
+            logger.warning("Warning: model not found. Using cl100k_base encoding.")
+            model = "cl100k_base"
+            encoding = tiktoken.get_encoding(model)
         for i, text in enumerate(texts):
             if self.model.endswith("001"):
                 # See: https://github.com/openai/openai-python/issues/418#issuecomment-1525939500
diff --git a/langchain/llms/openai.py b/langchain/llms/openai.py
index 3685c920d6c44..19bce8bddcf79 100644
--- a/langchain/llms/openai.py
+++ b/langchain/llms/openai.py
@@ -172,6 +172,15 @@ def lc_serializable(self) -> bool:
     disallowed_special: Union[Literal["all"], Collection[str]] = "all"
     """Set of special tokens that are not allowed。"""
     tiktoken_model_name: Optional[str] = None
+    """The model name to pass to tiktoken when using this class.
+    Tiktoken is used to convert text into token ids (see
+    ``get_token_ids``). By default, when set to None, this will
+    be the same as the LLM model name. However, there are some cases
+    where you may want to use this LLM class with a model name not
+    supported by tiktoken. This can include when using Azure OpenAI or
+    when using one of the many model providers that expose an OpenAI-like
+    API but with different models. In those cases, in order to avoid erroring
+    when tiktoken is called, you can specify a model name to use here."""
 
     def __new__(cls, **data: Any) -> Union[OpenAIChat, BaseOpenAI]:  # type: ignore
         """Initialize the OpenAI object."""
@@ -493,7 +502,12 @@ def get_token_ids(self, text: str) -> List[int]:
             )
 
         model_name = self.tiktoken_model_name or self.model_name
-        enc = tiktoken.model.encoding_for_model(model_name)
+        try:
+            enc = tiktoken.encoding_for_model(model_name)
+        except KeyError:
+            logger.warning("Warning: model not found. Using cl100k_base encoding.")
+            model = "cl100k_base"
+            enc = tiktoken.get_encoding(model)
 
         return enc.encode(
             text,

From fb701eb584e7e4820cdf9e5dbb97b738c81c83e0 Mon Sep 17 00:00:00 2001
From: Harrison Chase <hw.chase.17@gmail.com>
Date: Sun, 25 Jun 2023 12:03:54 -0700
Subject: [PATCH 3/3] cr

---
 langchain/embeddings/openai.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/langchain/embeddings/openai.py b/langchain/embeddings/openai.py
index bcc19fed05f96..f3cb66547b7ed 100644
--- a/langchain/embeddings/openai.py
+++ b/langchain/embeddings/openai.py
@@ -346,7 +346,12 @@ async def _aget_len_safe_embeddings(
         tokens = []
         indices = []
         model_name = self.tiktoken_model_name or self.model
-        encoding = tiktoken.model.encoding_for_model(model_name)
+        try:
+            encoding = tiktoken.encoding_for_model(model_name)
+        except KeyError:
+            logger.warning("Warning: model not found. Using cl100k_base encoding.")
+            model = "cl100k_base"
+            encoding = tiktoken.get_encoding(model)
         for i, text in enumerate(texts):
             if self.model.endswith("001"):
                 # See: https://github.com/openai/openai-python/issues/418#issuecomment-1525939500