@@ -302,6 +302,10 @@ def _decode(self, token_ids, skip_special_tokens=False, clean_up_tokenization_sp
             token_ids = [token_ids]
         if self.pad_token_id in token_ids:  # remove pad
             token_ids = list(filter(self.pad_token_id.__ne__, token_ids))
+        for token_id in token_ids:
+            if token_id not in self.added_tokens_decoder and token_id >= self.vocab_size:
+                raise IndexError(f"The token id {token_id} is out of the size of vocabulary, please check "
+                                 f"your tokenizer and corresponding vocabulary files.")
         return self.sp_tokenizer.decode(token_ids)
 
     # pylint:disable=arguments-differ
@@ -358,36 +362,6 @@ def _convert_token_to_id(self, token):
             return self.added_tokens_encoder[token]
         return self.sp_tokenizer[token]
 
-    # pylint:disable=arguments-differ
-    def convert_ids_to_tokens(self, ids: Union[int, List[int]], skip_special_tokens: bool = False):
-        """
-        Converts a single index or a sequence of indices in a token or a sequence of tokens, using the vocabulary and
-        added tokens.
-
-        Args:
-            ids (`int` or `List[int]`):
-                The token id (or token ids) to convert to tokens.
-            skip_special_tokens (`bool`, *optional*, defaults to `False`):
-                Whether or not to remove special tokens in the decoding.
-
-        Returns:
-            `str` or `List[str]`: The decoded token(s).
-        """
-        if isinstance(ids, int):
-            if ids in self.added_tokens_decoder:
-                return self.added_tokens_decoder[ids]
-            return self._convert_id_to_token(ids)
-        tokens = []
-        for index in ids:
-            index = int(index)
-            if skip_special_tokens and index in self.all_special_ids:
-                continue
-            if index in self.added_tokens_decoder:
-                tokens.append(self.added_tokens_decoder[index])
-            else:
-                tokens.append(self._convert_id_to_token(index))
-        return tokens
-
     def _convert_id_to_token(self, index):
         """Converts an index (integer) in a token (str) using the vocab."""
         return self.sp_tokenizer[index]
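
For reference, a minimal standalone sketch of the check this hunk adds to `_decode`: ids that are neither registered added tokens nor within the vocabulary range should raise `IndexError` before SentencePiece decoding is attempted. The `VOCAB_SIZE`, `ADDED_TOKENS_DECODER`, and `PAD_TOKEN_ID` values below are made-up stand-ins for illustration, not the tokenizer's real configuration.

```python
# Illustrative stand-ins; the real tokenizer reads these from its vocab files.
VOCAB_SIZE = 130344                            # assumed SentencePiece vocab size
ADDED_TOKENS_DECODER = {130344: "<extra_0>"}   # hypothetical added token
PAD_TOKEN_ID = 3

def check_token_ids(token_ids):
    """Mirror of the validation loop added to _decode in this change."""
    if isinstance(token_ids, int):
        token_ids = [token_ids]
    # remove pad, as the surrounding code already does
    token_ids = list(filter(PAD_TOKEN_ID.__ne__, token_ids))
    for token_id in token_ids:
        if token_id not in ADDED_TOKENS_DECODER and token_id >= VOCAB_SIZE:
            raise IndexError(f"The token id {token_id} is out of the size of vocabulary, "
                             f"please check your tokenizer and corresponding vocabulary files.")
    return token_ids

print(check_token_ids([5, 3, 130344]))   # pad dropped, added token accepted -> [5, 130344]
try:
    check_token_ids([5, 999999])          # well past the vocab -> IndexError
except IndexError as err:
    print(err)
```

Raising before `sp_tokenizer.decode` is called surfaces a clear error for mismatched vocabulary files instead of deferring to whatever the underlying SentencePiece model would do with an out-of-range id.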