2525from tokenizers .models import BPE
2626
2727from .. import AddedToken
28- from ..convert_slow_tokenizer import LlamaConverter
28+ from ..convert_slow_tokenizer import LlamaConverter , Qwen2Converter
2929from ..utils import logging
3030from ..utils .logging import tqdm
3131
101101 "output.weight" : "lm_head.weight" ,
102102 "output_norm" : "model.norm" ,
103103 },
104+ "qwen2" : {
105+ "token_embd" : "model.embed_tokens" ,
106+ "blk" : "model.layers" ,
107+ "ffn_up" : "mlp.up_proj" ,
108+ "ffn_down" : "mlp.down_proj" ,
109+ "ffn_gate" : "mlp.gate_proj" ,
110+ "ffn_norm" : "post_attention_layernorm" ,
111+ "attn_norm" : "input_layernorm" ,
112+ "attn_q" : "self_attn.q_proj" ,
113+ "attn_v" : "self_attn.v_proj" ,
114+ "attn_k" : "self_attn.k_proj" ,
115+ "attn_output" : "self_attn.o_proj" ,
116+ "output.weight" : "lm_head.weight" ,
117+ "output_norm" : "model.norm" ,
118+ },
104119}
105120
106121
133148 "attention.layer_norm_rms_epsilon" : "rms_norm_eps" ,
134149 "vocab_size" : "vocab_size" ,
135150 },
151+ "qwen2" : {
152+ "context_length" : "max_position_embeddings" ,
153+ "block_count" : "num_hidden_layers" ,
154+ "feed_forward_length" : "intermediate_size" ,
155+ "embedding_length" : "hidden_size" ,
156+ "rope.dimension_count" : None ,
157+ "rope.freq_base" : "rope_theta" ,
158+ "attention.head_count" : "num_attention_heads" ,
159+ "attention.head_count_kv" : "num_key_value_heads" ,
160+ "attention.layer_norm_rms_epsilon" : "rms_norm_eps" ,
161+ "vocab_size" : "vocab_size" ,
162+ },
136163 "tokenizer" : {
137- "ggml.model" : "model_type" ,
138164 "ggml.bos_token_id" : "bos_token_id" ,
139165 "ggml.eos_token_id" : "eos_token_id" ,
140166 "ggml.unknown_token_id" : "unk_token_id" ,
@@ -490,14 +516,15 @@ def __init__(self, dict_):
490516 for k , v in dict_ .items ():
491517 setattr (self , k , v )
492518
493- if not hasattr (self , "tokens" ) or not hasattr (self , "scores" ):
494- raise ValueError ("tokens and scores need to be passed for a LLaMa tokenizer to be instantiated." )
495- else :
519+ if not hasattr (self , "merges" ):
520+ if not hasattr (self , "tokens" ) or not hasattr (self , "scores" ):
521+ raise ValueError (
522+ "tokens and scores need to be passed for a LLaMa tokenizer without merges to be instantiated."
523+ )
496524 tokens = self .tokens
497525 scores = self .scores
498526 vocab = {t : scores [i ] for i , t in enumerate (tokens )}
499527
500- if not hasattr (self , "merges" ):
501528 logger .warning ("Merges were not in checkpoint, building merges on the fly." )
502529 merges = []
503530 for merge , piece_score in tqdm (vocab .items ()):
@@ -562,16 +589,37 @@ def decoder(self, replacement, add_prefix_space):
562589 return decoders .Sequence (sequence )
563590
564591
class GGUFQwen2Converter(Qwen2Converter):
    """Rebuild a fast Qwen2 tokenizer from the tokenizer payload of a GGUF file."""

    def __init__(self, tokenizer_dict):
        # Wrap the raw GGUF tokenizer dict so it exposes the attribute-style
        # interface (.tokens, .merges, ...) the slow-converter machinery expects.
        self.original_tokenizer = GGUFTokenizerSkeleton(tokenizer_dict)

    def converted(self) -> Tokenizer:
        skeleton = self.original_tokenizer
        # Token rank in the GGUF token list is its vocabulary id.
        vocab = dict(zip(skeleton.tokens, range(len(skeleton.tokens))))
        tokenizer = super().converted(vocab, skeleton.merges)

        # Register Qwen2's control tokens so they are treated as special
        # (unsplittable, un-normalized) by the fast tokenizer.
        special_tokens = [
            AddedToken(text, normalized=False, special=True)
            for text in ("<|endoftext|>", "<|im_start|>", "<|im_end|>")
        ]
        tokenizer.add_special_tokens(special_tokens)
        return tokenizer
610+
# Registry mapping a GGUF model architecture string to the converter class
# that rebuilds a fast `tokenizers.Tokenizer` from the GGUF tokenizer payload.
# `convert_gguf_tokenizer` looks architectures up here.
GGUF_TO_FAST_CONVERTERS = {
    "llama": GGUFLlamaConverter,
    "qwen2": GGUFQwen2Converter,
}
568615
569616
def convert_gguf_tokenizer(architecture, tokenizer_dict) -> Tokenizer:
    """
    Convert the tokenizer payload extracted from a GGUF file into a fast
    (`tokenizers`-backed) tokenizer instance.

    Args:
        architecture (`str`):
            The model architecture derived from the GGUF file (e.g. `"llama"`,
            `"qwen2"`); used as the key into `GGUF_TO_FAST_CONVERTERS` to pick
            the converter class.
        tokenizer_dict (`dict`):
            The tokenizer attributes read from the GGUF file (tokens, scores,
            merges, special token ids, ...), passed through to the converter.

    Returns:
        A instance of [`~tokenizers.Tokenizer`] to be used as the backend tokenizer of a
        [`~tokenization_utils_base.PreTrainedTokenizerFast`]

    Raises:
        KeyError: if `architecture` has no registered converter in
            `GGUF_TO_FAST_CONVERTERS`.
    """
    # The architecture name doubles as the registry key for the converter.
    converter_class = GGUF_TO_FAST_CONVERTERS[architecture]
    return converter_class(tokenizer_dict).converted()
0 commit comments