@@ -213,6 +213,8 @@ class ModelType(Enum):
 
     SeedOSS = 0x2B00
 
+    Apertus = 0x2C00
+
     BCE_Embedding = 0x10000100
     BCE_ReRanker = 0x10000101
     BGE_M3 = 0x10000102
@@ -620,7 +622,7 @@ def dump_state_dict(f, weight_names, model_files, ggml_type, config, state_dict_
             tensor_ggml_type = GGMLType.F16
         else:
             # 1d weight: convert it to float32
-            assert tensor.ndim == 1, f'shape of {name} = {tensor.shape}'
+            assert tensor.ndim <= 1, f'shape of {name} = {tensor.shape}'
             tensor_ggml_type = GGMLType.F32
 
         dump_tensor(f, name, tensor, tensor_ggml_type)
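Note on the relaxed assertion above: a tensor with `ndim == 0` (a scalar) now also falls through to the float32 branch, which the Apertus weights added below appear to need for the per-layer xIELU activation parameters. A minimal standalone sketch of the behaviour, assuming standard PyTorch semantics (not code from this converter):

import torch

scalar = torch.tensor(0.5)   # ndim == 0, would have failed the old `ndim == 1` check
vector = torch.ones(16)      # ndim == 1

for t in (scalar, vector):
    assert t.ndim <= 1       # both now get dumped as GGMLType.F32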
@@ -804,6 +806,14 @@ def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) ->
             tokidx = model['model']['vocab'][tok]
             all_tokens[tok] = tokidx
 
+        id_to_tok = {}
+        for tok in all_tokens:
+            i = all_tokens[tok]
+            if i in id_to_tok:
+                raise Exception(f"{i} dup: `{id_to_tok[i]}` and `{tok}`")
+            else:
+                id_to_tok[i] = tok
+
         all_ids = sorted(list(all_tokens.values()))
 
         vocab_size: int = all_ids[-1] + 1
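The new loop above is a sanity check that no two tokens map to the same id before the ids are flattened into `all_ids`. A toy, self-contained illustration of the same guard (token names are made up):

all_tokens = {'a': 0, 'b': 1, 'c': 1}   # 'b' and 'c' collide on id 1

id_to_tok = {}
for tok in all_tokens:
    i = all_tokens[tok]
    if i in id_to_tok:
        raise Exception(f"{i} dup: `{id_to_tok[i]}` and `{tok}`")
    else:
        id_to_tok[i] = tok
# raises: Exception: 1 dup: `b` and `c`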
@@ -7447,6 +7457,76 @@ def get_weight_names(config):
 
         return weight_names
 
+class ApertusConverter(BaseConverter):
+    MODEL_TYPE = ModelType.Apertus
+
+    @classmethod
+    def state_dict_pp(cls, config, state_dict):
+        r = {}
+        for k in state_dict:
+            t: torch.Tensor = state_dict[k]
+            new_k: str = k
+            if 'attention_layernorm' in new_k:
+                new_k = new_k.replace('.attention_layernorm.', '.input_layernorm.')
+            elif 'feedforward_layernorm' in new_k:
+                new_k = new_k.replace('.feedforward_layernorm.', '.post_attention_layernorm.')
+            r[new_k] = t
+        return r
+
+    @staticmethod
+    def dump_config(f, config, ggml_type):
+        assert not config.tie_word_embeddings
+        assert config.rope_scaling['rope_type'] == 'llama3'
+        assert not config.attention_bias, "attention_bias must be False"
+        assert config.hidden_act == 'xielu'
+        assert not config.mlp_bias
+        assert config.qk_norm
+
+        config.hidden_act = 'silu'
+        dump_llama_like_config(f, config, ggml_type)
+
+        config_values = [
+            config.num_key_value_heads,
+        ]
+        f.write(struct.pack("<" + "i" * len(config_values), *config_values))
+
+        config_values = [
+            config.rope_theta,
+            config.rope_scaling['original_max_position_embeddings'],
+            config.rope_scaling['factor'],
+            config.rope_scaling['low_freq_factor'],
+            config.rope_scaling['high_freq_factor'],
+        ]
+        f.write(struct.pack("<fifff", *config_values))
+
+    @staticmethod
+    def get_weight_names(config):
+        weight_names = ["model.embed_tokens.weight"]
+        for i in range(config.num_hidden_layers):
+            weight_names += [
+                f"model.layers.{i}.input_layernorm.weight",
+                f"model.layers.{i}.mlp.down_proj.weight",
+                f"model.layers.{i}.mlp.up_proj.weight",
+                f"model.layers.{i}.mlp.act_fn.alpha_n",
+                f"model.layers.{i}.mlp.act_fn.alpha_p",
+                f"model.layers.{i}.mlp.act_fn.beta",
+                f"model.layers.{i}.mlp.act_fn.eps",
+                f"model.layers.{i}.post_attention_layernorm.weight",
+                f"model.layers.{i}.self_attn.k_proj.weight",
+                f"model.layers.{i}.self_attn.k_norm.weight",
+                f"model.layers.{i}.self_attn.o_proj.weight",
+                f"model.layers.{i}.self_attn.q_proj.weight",
+                f"model.layers.{i}.self_attn.q_norm.weight",
+                f"model.layers.{i}.self_attn.v_proj.weight",
+            ]
+
+        weight_names += [
+            "model.norm.weight",
+            "lm_head.weight"
+        ]
+
+        return weight_names
+
 def convert_grok_1_base(args, vocab, ggml_type):
     def ffn_size(emb_size, widening_factor):
         _ffn_size = int(widening_factor * emb_size) * 2 // 3
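For reference, `ApertusConverter.dump_config` appends one little-endian int32 (`num_key_value_heads`) followed by a `<fifff` block (rope_theta, original_max_position_embeddings, factor, low_freq_factor, high_freq_factor) after the llama-like header. A hedged sketch of reading that tail back, assuming `f` is a binary file positioned just past the header (the function name and dict keys below are illustrative, not taken from chatllm.cpp's loader):

import struct

def read_apertus_tail(f):
    # mirrors the two struct.pack calls in dump_config above
    (num_key_value_heads,) = struct.unpack("<i", f.read(4))
    rope_theta, orig_max_pos, factor, low_freq, high_freq = struct.unpack("<fifff", f.read(20))
    return {
        "num_key_value_heads": num_key_value_heads,
        "rope_theta": rope_theta,
        "original_max_position_embeddings": orig_max_pos,
        "factor": factor,
        "low_freq_factor": low_freq,
        "high_freq_factor": high_freq,
    }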
@@ -8046,6 +8126,8 @@ def main():
         GptOssConverter.convert(config, model_files, vocab, ggml_type, args.save_path)
     elif arch == 'SeedOssForCausalLM':
         SeedOSSConverter.convert(config, model_files, vocab, ggml_type, args.save_path)
+    elif arch == 'ApertusForCausalLM':
+        ApertusConverter.convert(config, model_files, vocab, ggml_type, args.save_path)
     elif arch == 'deepseek-r1-distill-qwen3':
         QWen3Converter.MODEL_TYPE = ModelType.DeepSeek_R1_Distill_QWen3
         QWen3Converter.convert(config, model_files, vocab, ggml_type, args.save_path)
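The `arch` string matched in this chain is the checkpoint's architecture name, which for Apertus models is `ApertusForCausalLM`. An illustrative snippet of how such a dispatch key is typically read from a Hugging Face `config.json` (paths and field access here are assumptions, not copied from this script):

import json
from pathlib import Path

def read_arch(model_dir: str) -> str:
    with open(Path(model_dir) / "config.json", encoding="utf-8") as fp:
        cfg = json.load(fp)
    return cfg["architectures"][0]   # e.g. "ApertusForCausalLM"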