@@ -130,28 +130,76 @@ def make_tensors_list() -> List[str]:
 TENSORS_SET = set(TENSORS_LIST)
 
 
+def find_n_mult(n_ff: int, n_embd: int) -> int:
+    # hardcoded magic range
+    for n_mult in range(256, 1, -1):
+        calc_ff = (((8 * n_embd) // 3 + n_mult - 1) // n_mult) * n_mult
+        if calc_ff == n_ff:
+            return n_mult
+    return 1
+
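find_n_mult inverts the LLaMA feed-forward sizing rule: n_ff is 8·n_embd/3 rounded up to a multiple of n_mult, so scanning candidate multiples from 256 downward recovers n_mult from the checkpoint's actual n_ff. A quick sanity check of that formula (my own LLaMA-7B numbers, not part of this commit):

```python
# Standalone check of the rounding formula used by find_n_mult above.
def ffn_size(n_embd: int, n_mult: int) -> int:
    # round 8*n_embd/3 up to the nearest multiple of n_mult
    return (((8 * n_embd) // 3 + n_mult - 1) // n_mult) * n_mult

# LLaMA-7B: n_embd = 4096, checkpoint n_ff = 11008
assert ffn_size(4096, 256) == 11008  # 8*4096//3 = 10922 -> next multiple of 256
# hence find_n_mult(11008, 4096) returns 256
```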
 @dataclass
 class Params:
     n_vocab: int
     n_embd: int
     n_mult: int
     n_head: int
     n_layer: int
-    file_type: GGMLFileType
 
     @staticmethod
-    def guessed(model: 'LazyModel', file_type: GGMLFileType) -> 'Params':
-        n_vocab, n_embd = model["tok_embeddings.weight"].shape
+    def guessed(model: 'LazyModel') -> 'Params':
+        # try transformer naming first
+        n_vocab, n_embd = model["model.embed_tokens.weight"].shape if "model.embed_tokens.weight" in model else model["tok_embeddings.weight"].shape
+
+        # try transformer naming first
+        if "model.layers.0.self_attn.q_proj.weight" in model:
+            n_layer = next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model)
+        else:
+            n_layer = next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model)
+
+        n_head = n_embd // 128  # guessed
 
         return Params(
             n_vocab=n_vocab,
             n_embd=n_embd,
             n_mult=256,
-            n_head=n_embd // 128,
-            n_layer=next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model),
-            file_type=file_type,
+            n_head=n_head,
+            n_layer=n_layer,
         )
 
+    @staticmethod
+    def loadHFTransformerJson(model: 'LazyModel', config_path: 'Path') -> 'Params':
+        config = json.load(open(config_path))
+
+        n_vocab = config["vocab_size"]
+        n_embd = config["hidden_size"]
+        n_head = config["num_attention_heads"]
+        n_layer = config["num_hidden_layers"]
+        n_ff = config["intermediate_size"]
+
+        n_mult = find_n_mult(n_ff, n_embd)
+
+        return Params(
+            n_vocab=n_vocab,
+            n_embd=n_embd,
+            n_mult=n_mult,
+            n_head=n_head,
+            n_layer=n_layer,
+        )
+
+    @staticmethod
+    def load(model_plus: 'ModelPlus') -> 'Params':
+        orig_config_path = model_plus.paths[0].parent / "params.json"
+        hf_transformer_config_path = model_plus.paths[0].parent / "config.json"
+
+        if hf_transformer_config_path.exists():
+            params = Params.loadHFTransformerJson(model_plus.model, hf_transformer_config_path)
+        else:
+            params = Params.guessed(model_plus.model)
+
+        print(f'params: n_vocab:{params.n_vocab} n_embd:{params.n_embd} n_mult:{params.n_mult} n_head:{params.n_head} n_layer:{params.n_layer}')
+        return params
+
 
 class SentencePieceVocab:
     def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) -> None:
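Params.load (added above) prefers a Hugging Face config.json sitting next to the checkpoint and only falls back to guessing from tensor shapes. A hedged illustration of the minimal config loadHFTransformerJson needs, using LLaMA-7B values (the file location and values are illustrative, not taken from this commit):

```python
import json
from pathlib import Path

# Minimal config.json with the five fields loadHFTransformerJson reads.
config = {
    "vocab_size": 32000,
    "hidden_size": 4096,
    "num_attention_heads": 32,
    "num_hidden_layers": 32,
    "intermediate_size": 11008,
}
Path("config.json").write_text(json.dumps(config))

# With this file next to the model weights, Params.load(model_plus) takes the
# config.json branch and derives n_mult = find_n_mult(11008, 4096) == 256
# instead of guessing n_head and n_layer from tensor names.
```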
@@ -595,18 +643,17 @@ def load() -> Tensor:
     return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}) ' + lazy_tensor.description)
 
 
-def convert_transformers_to_orig(model: LazyModel) -> LazyModel:
+def convert_transformers_to_orig(model: LazyModel, params: Params) -> LazyModel:
     out: LazyModel = {}
     out["tok_embeddings.weight"] = model["model.embed_tokens.weight"]
     out["norm.weight"] = model["model.norm.weight"]
     out["output.weight"] = model["lm_head.weight"]
 
-    n_head = model["model.layers.0.self_attn.q_proj.weight"].shape[1] // 128
     for i in itertools.count():
         if f"model.layers.{i}.self_attn.q_proj.weight" not in model:
             break
-        out[f"layers.{i}.attention.wq.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], n_head)
-        out[f"layers.{i}.attention.wk.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], n_head)
+        out[f"layers.{i}.attention.wq.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head)
+        out[f"layers.{i}.attention.wk.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head)
         out[f"layers.{i}.attention.wv.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"]
         out[f"layers.{i}.attention.wo.weight"] = model[f"model.layers.{i}.self_attn.o_proj.weight"]
 
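Passing params.n_head into permute_lazy replaces the old n_head = shape // 128 guess, which silently assumed a head dimension of 128. For intuition, a minimal NumPy sketch of the row reordering that permute()/permute_lazy() apply to the q/k projections (my reconstruction of the helper defined elsewhere in this file, not part of this hunk):

```python
import numpy as np

def permute(weights: np.ndarray, n_head: int) -> np.ndarray:
    # Regroup the rows of W_q / W_k so the two rotary half-dimensions of each
    # head are adjacent, matching the original LLaMA checkpoint layout.
    return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
                   .swapaxes(1, 2)
                   .reshape(weights.shape))

# A (4096, 4096) q_proj with 32 heads keeps its shape; only the row order changes.
w = np.arange(4096 * 4096, dtype=np.float32).reshape(4096, 4096)
assert permute(w, 32).shape == (4096, 4096)
```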
@@ -920,7 +967,7 @@ class OutputFile:
     def __init__(self, fname_out: Path) -> None:
         self.fout = open(fname_out, "wb")
 
-    def write_file_header(self, params: Params) -> None:
+    def write_file_header(self, params: Params, file_type: GGMLFileType) -> None:
         self.fout.write(b"ggjt"[::-1])  # magic
         values = [
             1,  # file version
@@ -930,7 +977,7 @@ def write_file_header(self, params: Params) -> None:
             params.n_head,
             params.n_layer,
             params.n_embd // params.n_head,  # rot (obsolete)
-            params.file_type.value,
+            file_type.value,
         ]
         self.fout.write(struct.pack("i" * len(values), *values))
 
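The ggjt header written here is the reversed 4-byte magic followed by a row of native int32 values; this change moves the final ftype field from Params onto an explicit argument. A hedged reader for that header (it assumes the three values elided between the two hunks are n_vocab, n_embd, n_mult, in that order):

```python
import struct

def read_ggjt_header(path: str) -> dict:
    # Mirrors write_file_header: reversed magic, then int32 fields packed with "i" * n.
    names = ["version", "n_vocab", "n_embd", "n_mult",
             "n_head", "n_layer", "rot", "file_type"]
    with open(path, "rb") as f:
        magic = f.read(4)
        assert magic == b"ggjt"[::-1]  # the writer stores the magic byte-reversed
        values = struct.unpack("i" * len(names), f.read(4 * len(names)))
    return dict(zip(names, values))
```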
@@ -958,10 +1005,10 @@ def write_vocab_only(fname_out: Path, vocab: Vocab) -> None:
         of.fout.close()
 
     @staticmethod
-    def write_all(fname_out: Path, params: Params, model: LazyModel, vocab: Vocab) -> None:
+    def write_all(fname_out: Path, params: Params, file_type: GGMLFileType, model: LazyModel, vocab: Vocab) -> None:
         check_vocab_size(params, vocab)
         of = OutputFile(fname_out)
-        of.write_file_header(params)
+        of.write_file_header(params, file_type)
         print("Writing vocab...")
         of.write_vocab(vocab)
 
@@ -997,11 +1044,11 @@ def pick_output_type(model: LazyModel, output_type_str: Optional[str]) -> GGMLFileType:
     raise Exception(f"Unexpected combination of types: {name_to_type}")
 
 
-def do_necessary_conversions(model: LazyModel) -> LazyModel:
+def do_necessary_conversions(model: LazyModel, params: Params) -> LazyModel:
     model = handle_quantization(model)
 
     if "lm_head.weight" in model:
-        model = convert_transformers_to_orig(model)
+        model = convert_transformers_to_orig(model, params)
     model = filter_and_sort_tensors(model)
 
     return model
@@ -1107,14 +1154,14 @@ def load_vocab(path: Path) -> SentencePieceVocab:
     return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None)
 
 
-def default_outfile(model_paths: List[Path], params: Params) -> Path:
+def default_outfile(model_paths: List[Path], file_type: GGMLFileType) -> Path:
     namestr = {
         GGMLFileType.AllF32: "f32",
         GGMLFileType.MostlyF16: "f16",
         GGMLFileType.MostlyQ4_0: "q4_0",
         GGMLFileType.MostlyQ4_1: "q4_1",
         GGMLFileType.PerLayerIsQ4_1: "q4_1",
-    }[params.file_type]
+    }[file_type]
     ret = model_paths[0].parent / f"ggml-model-{namestr}.bin"
     if ret in model_paths:
         sys.stderr.write(
@@ -1164,13 +1211,13 @@ def main(args_in: Optional[List[str]] = None) -> None:
         else:
             vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent
             vocab = load_vocab(vocab_dir)
+        params = Params.load(model_plus)
         model = model_plus.model
-        model = do_necessary_conversions(model)
+        model = do_necessary_conversions(model, params)
         output_type = pick_output_type(model, args.outtype)
         model = convert_to_output_type(model, output_type)
-        params = Params.guessed(model, output_type)
-        outfile = args.outfile or default_outfile(model_plus.paths, params)
-        OutputFile.write_all(outfile, params, model, vocab)
+        outfile = args.outfile or default_outfile(model_plus.paths, output_type)
+        OutputFile.write_all(outfile, params, output_type, model, vocab)
         print(f"Wrote {outfile}")
 
 
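Taken together, main() now resolves Params from disk before any tensor conversion and threads the GGML file type through as an explicit argument rather than a Params field. A condensed sketch of the resulting flow (function names as in convert.py; the model directory and default outtype are illustrative):

```python
from pathlib import Path

def convert(model_dir: str, outtype: str = "f16") -> Path:
    model_plus = load_some_model(Path(model_dir))        # lazy-load checkpoint(s)
    vocab = load_vocab(Path(model_dir))                  # tokenizer.model (+ added tokens)
    params = Params.load(model_plus)                     # prefer config.json, else guess from tensors
    model = do_necessary_conversions(model_plus.model, params)
    output_type = pick_output_type(model, outtype)
    model = convert_to_output_type(model, output_type)
    outfile = default_outfile(model_plus.paths, output_type)
    OutputFile.write_all(outfile, params, output_type, model, vocab)
    return outfile
```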