
Commit cfc0dff

Lin authored and gitee-org committed
!1087 Out-of-bounds error message for tokenizer decode index
Merge pull request !1087 from zyw_hw/tokenizer_index
2 parents 81c48ca + 781fd8f commit cfc0dff


3 files changed: +15 -30 lines


mindformers/models/base_tokenizer.py

Lines changed: 7 additions & 0 deletions
@@ -4238,6 +4238,9 @@ def convert_ids_to_tokens(
         if isinstance(ids, int):
             if ids in self.added_tokens_decoder:
                 return self.added_tokens_decoder[ids]
+            if ids >= self.vocab_size:
+                raise IndexError(f"The token id {ids} is out of the size of vocabulary, please check your tokenizer "
+                                 f"and corresponding vocabulary files.")
             return self._convert_id_to_token(ids)
         tokens = []
         for index in ids:
@@ -4247,6 +4250,10 @@ def convert_ids_to_tokens(
             if index in self.added_tokens_decoder:
                 tokens.append(self.added_tokens_decoder[index])
             else:
+                if index >= self.vocab_size:
+                    raise IndexError(
+                        f"The token id {index} is out of the size of vocabulary, please check your tokenizer "
+                        f"and corresponding vocabulary files.")
                 tokens.append(self._convert_id_to_token(index))
         return tokens
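
With this check in the shared base tokenizer, decoding an id that falls outside the vocabulary fails fast with a readable message. A minimal usage sketch, assuming MindFormers' AutoTokenizer entry point and the "glm2_6b" model alias (both are illustrative, not part of this diff):

# Minimal sketch: any tokenizer inheriting the patched base class should now raise
# IndexError for out-of-range ids. AutoTokenizer and "glm2_6b" are assumed names.
from mindformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("glm2_6b")  # placeholder model alias
bad_id = tokenizer.vocab_size + 10                    # any id outside the vocabulary

try:
    tokenizer.convert_ids_to_tokens(bad_id)
except IndexError as err:
    print(err)  # "The token id ... is out of the size of vocabulary, please check your tokenizer ..."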

mindformers/models/glm/chatglm_6b_tokenizer.py

Lines changed: 4 additions & 30 deletions
@@ -302,6 +302,10 @@ def _decode(self, token_ids, skip_special_tokens=False, clean_up_tokenization_sp
             token_ids = [token_ids]
         if self.pad_token_id in token_ids:  # remove pad
             token_ids = list(filter(self.pad_token_id.__ne__, token_ids))
+        for token_id in token_ids:
+            if token_id not in self.added_tokens_decoder and token_id >= self.vocab_size:
+                raise IndexError(f"The token id {token_id} is out of the size of vocabulary, please check "
+                                 f"your tokenizer and corresponding vocabulary files.")
         return self.sp_tokenizer.decode(token_ids)

     # pylint:disable=arguments-differ
@@ -358,36 +362,6 @@ def _convert_token_to_id(self, token):
             return self.added_tokens_encoder[token]
         return self.sp_tokenizer[token]

-    # pylint:disable=arguments-differ
-    def convert_ids_to_tokens(self, ids: Union[int, List[int]], skip_special_tokens: bool = False):
-        """
-        Converts a single index or a sequence of indices in a token or a sequence of tokens, using the vocabulary and
-        added tokens.
-
-        Args:
-            ids (`int` or `List[int]`):
-                The token id (or token ids) to convert to tokens.
-            skip_special_tokens (`bool`, *optional*, defaults to `False`):
-                Whether or not to remove special tokens in the decoding.
-
-        Returns:
-            `str` or `List[str]`: The decoded token(s).
-        """
-        if isinstance(ids, int):
-            if ids in self.added_tokens_decoder:
-                return self.added_tokens_decoder[ids]
-            return self._convert_id_to_token(ids)
-        tokens = []
-        for index in ids:
-            index = int(index)
-            if skip_special_tokens and index in self.all_special_ids:
-                continue
-            if index in self.added_tokens_decoder:
-                tokens.append(self.added_tokens_decoder[index])
-            else:
-                tokens.append(self._convert_id_to_token(index))
-        return tokens
-
     def _convert_id_to_token(self, index):
         """Converts an index (integer) in a token (str) using the vocab."""
         return self.sp_tokenizer[index]
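
The guard added to _decode above can also be read in isolation. A standalone sketch of the same pattern (the helper name is hypothetical, not part of the MindFormers API):

# Hypothetical helper mirroring the guard this commit adds to _decode():
# reject any id that is neither an added token nor inside the vocabulary.
from typing import Dict, List


def check_ids_in_vocab(ids: List[int], vocab_size: int, added_tokens_decoder: Dict[int, str]) -> None:
    for token_id in ids:
        if token_id not in added_tokens_decoder and token_id >= vocab_size:
            raise IndexError(f"The token id {token_id} is out of the size of vocabulary, please check "
                             f"your tokenizer and corresponding vocabulary files.")


check_ids_in_vocab([3, 7], vocab_size=10, added_tokens_decoder={})  # passes silently
# check_ids_in_vocab([12], vocab_size=10, added_tokens_decoder={})  # would raise IndexError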

mindformers/models/glm2/glm2_tokenizer.py

Lines changed: 4 additions & 0 deletions
@@ -218,6 +218,10 @@ def _decode(self,

     def _convert_id_to_token(self, index):
         """Converts an index (integer) in a token (str) using the vocab."""
+        if index >= self.vocab_size:
+            raise IndexError(
+                f"The token id {index} is out of the size of vocabulary, please check your tokenizer "
+                f"and corresponding vocabulary files.")
         return self.tokenizer.convert_id_to_token(index)

     def convert_tokens_to_string(self, tokens: List[str]) -> str:
