 sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
 import gguf
 
+# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
+
+
+def bytes_to_unicode():
26+ """
27+ Returns list of utf-8 byte and a corresponding list of unicode strings.
28+ The reversible bpe codes work on unicode strings.
29+ This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
30+ When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
31+ This is a significant percentage of your normal, say, 32K bpe vocab.
32+ To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
33+ And avoids mapping to whitespace/control characters the bpe code barfs on.
34+ """
+    bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
+    cs = bs[:]
+    n = 0
+    for b in range(2**8):
+        if b not in bs:
+            bs.append(b)
+            cs.append(2**8+n)
+            n += 1
+    return dict(zip(bs, (chr(n) for n in cs)))
+
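A quick sanity check of the mapping above (an illustrative sketch; byte_encoder / byte_decoder mirror the names introduced further down in this change):

    byte_encoder = bytes_to_unicode()
    byte_decoder = {v: k for k, v in byte_encoder.items()}
    # every raw byte gets a printable stand-in, e.g. the space byte 0x20 becomes 'Ġ'
    assert ''.join(byte_encoder[b] for b in b'hello world') == 'helloĠworld'
    assert bytes(byte_decoder[c] for c in 'helloĠworld') == b'hello world'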
 
 def count_model_parts(dir_model: Path) -> int:
     num_parts = 0
@@ -107,8 +130,6 @@ def parse_args() -> argparse.Namespace:
 print("gguf: get tokenizer metadata")
 
 tokens: list[bytearray] = []
-scores: list[float] = []
-toktypes: list[int] = []
 
 # gpt2 tokenizer
 gguf_writer.add_tokenizer_model("gpt2")
@@ -124,15 +145,28 @@ def parse_args() -> argparse.Namespace:
 assert max(tokenizer.vocab.values()) < vocab_size
 
 reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
+byte_encoder = bytes_to_unicode()
+byte_decoder = {v: k for k, v in byte_encoder.items()}
 
 for i in range(vocab_size):
-    tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
-    scores.append(0.0)  # dummy
-    toktypes.append(gguf.TokenType.NORMAL)
+    if i in reverse_vocab:
+        try:
+            text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
+        except KeyError:
+            text = bytearray()
+            for c in reverse_vocab[i]:
+                if ord(c) < 256:  # single byte character
+                    text.append(byte_decoder[ord(c)])
+                else:  # multibyte special token character
+                    text.extend(c.encode('utf-8'))
+    else:
+        print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
+        pad_token = f"[PAD{i}]".encode("utf8")
+        text = bytearray(pad_token)
+
+    tokens.append(text)
 
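For intuition, an illustrative example of what the loop above produces: a byte-level vocab entry decodes through byte_decoder back to raw bytes, while characters of added special tokens that fall outside the GPT-2 byte alphabet are kept as plain UTF-8 via the KeyError fallback:

    assert bytearray(byte_decoder[c] for c in 'Ġhello') == b' hello'
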
 gguf_writer.add_token_list(tokens)
-gguf_writer.add_token_scores(scores)
-gguf_writer.add_token_types(toktypes)
 
 special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
 special_vocab.add_to_gguf(gguf_writer)