
Commit 72ca3d1

support Apertus
1 parent 004364a commit 72ca3d1

File tree

13 files changed (+547 lines, -2 lines)

CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -59,6 +59,7 @@ set(core_files src/backend.cpp
         models/adept.cpp
         models/allenai.cpp
         models/alphageo.cpp
+        models/apertus.cpp
         models/apriel.cpp
         models/aquila.cpp
         models/baichuan.cpp

README.md

Lines changed: 1 addition & 0 deletions
@@ -31,6 +31,7 @@ LittleAcademia[<a href="https://github.com/foldl/little-academia" style="text-
 
 **What's New:**
 
+* 2025-09-03: Apertus
 * 2025-08-22: Seed-OSS
 * 2025-08-11: GPT-OSS
 * 2025-08-05: Pangu-Embedded

convert.py

Lines changed: 83 additions & 1 deletion
@@ -213,6 +213,8 @@ class ModelType(Enum):
 
     SeedOSS = 0x2B00
 
+    Apertus = 0x2C00
+
     BCE_Embedding = 0x10000100
     BCE_ReRanker = 0x10000101
     BGE_M3 = 0x10000102
@@ -620,7 +622,7 @@ def dump_state_dict(f, weight_names, model_files, ggml_type, config, state_dict_
             tensor_ggml_type = GGMLType.F16
         else:
             # 1d weight: convert it to float32
-            assert tensor.ndim == 1, f'shape of {name} = {tensor.shape}'
+            assert tensor.ndim <= 1, f'shape of {name} = {tensor.shape}'
             tensor_ggml_type = GGMLType.F32
 
     dump_tensor(f, name, tensor, tensor_ggml_type)
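
The relaxed assertion also admits 0-dimensional (scalar) tensors, which now fall through to the F32 path; judging by the new `mlp.act_fn.alpha_n/alpha_p/beta/eps` weight names further down, the Apertus xIELU activation parameters are presumably stored as such scalars. A minimal sketch of what the check now accepts (values are illustrative, not taken from a checkpoint):

import torch

scalar = torch.tensor(0.5)    # 0-dim tensor, e.g. an xIELU parameter such as beta
vector = torch.ones(4096)     # 1-dim tensor, e.g. a layernorm weight

for t in (scalar, vector):
    assert t.ndim <= 1        # both satisfy the relaxed check and are stored as F32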
@@ -804,6 +806,14 @@ def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) ->
             tokidx = model['model']['vocab'][tok]
             all_tokens[tok] = tokidx
 
+        id_to_tok = {}
+        for tok in all_tokens:
+            i = all_tokens[tok]
+            if i in id_to_tok:
+                raise Exception(f"{i} dup: `{id_to_tok[i]}` and `{tok}`")
+            else:
+                id_to_tok[i] = tok
+
         all_ids = sorted(list(all_tokens.values()))
 
         vocab_size: int = all_ids[-1] + 1
@@ -7447,6 +7457,76 @@ def get_weight_names(config):
 
     return weight_names
 
+class ApertusConverter(BaseConverter):
+    MODEL_TYPE = ModelType.Apertus
+
+    @classmethod
+    def state_dict_pp(cls, config, state_dict):
+        r = {}
+        for k in state_dict:
+            t: torch.Tensor = state_dict[k]
+            new_k: str = k
+            if 'attention_layernorm' in new_k:
+                new_k = new_k.replace('.attention_layernorm.', '.input_layernorm.')
+            elif 'feedforward_layernorm' in new_k:
+                new_k = new_k.replace('.feedforward_layernorm.', '.post_attention_layernorm.')
+            r[new_k] = t
+        return r
+
+    @staticmethod
+    def dump_config(f, config, ggml_type):
+        assert not config.tie_word_embeddings
+        assert config.rope_scaling['rope_type'] == 'llama3'
+        assert not config.attention_bias, "attention_bias must be False"
+        assert config.hidden_act == 'xielu'
+        assert not config.mlp_bias
+        assert config.qk_norm
+
+        config.hidden_act = 'silu'
+        dump_llama_like_config(f, config, ggml_type)
+
+        config_values = [
+            config.num_key_value_heads,
+        ]
+        f.write(struct.pack("<" + "i" * len(config_values), *config_values))
+
+        config_values = [
+            config.rope_theta,
+            config.rope_scaling['original_max_position_embeddings'],
+            config.rope_scaling['factor'],
+            config.rope_scaling['low_freq_factor'],
+            config.rope_scaling['high_freq_factor'],
+        ]
+        f.write(struct.pack("<fifff", *config_values))
+
+    @staticmethod
+    def get_weight_names(config):
+        weight_names = ["model.embed_tokens.weight"]
+        for i in range(config.num_hidden_layers):
+            weight_names += [
+                f"model.layers.{i}.input_layernorm.weight",
+                f"model.layers.{i}.mlp.down_proj.weight",
+                f"model.layers.{i}.mlp.up_proj.weight",
+                f"model.layers.{i}.mlp.act_fn.alpha_n",
+                f"model.layers.{i}.mlp.act_fn.alpha_p",
+                f"model.layers.{i}.mlp.act_fn.beta",
+                f"model.layers.{i}.mlp.act_fn.eps",
+                f"model.layers.{i}.post_attention_layernorm.weight",
+                f"model.layers.{i}.self_attn.k_proj.weight",
+                f"model.layers.{i}.self_attn.k_norm.weight",
+                f"model.layers.{i}.self_attn.o_proj.weight",
+                f"model.layers.{i}.self_attn.q_proj.weight",
+                f"model.layers.{i}.self_attn.q_norm.weight",
+                f"model.layers.{i}.self_attn.v_proj.weight",
+            ]
+
+        weight_names += [
+            "model.norm.weight",
+            "lm_head.weight"
+        ]
+
+        return weight_names
+
 def convert_grok_1_base(args, vocab, ggml_type):
     def ffn_size(emb_size, widening_factor):
         _ffn_size = int(widening_factor * emb_size) * 2 // 3
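
The Apertus-specific tail written by `dump_config` follows the llama-like header and can be read back field by field in the same order it was packed. A minimal reader-side sketch, assuming only the little-endian layouts of the two `struct.pack` calls above (the function name and file-object argument are illustrative, not part of this commit):

import struct

def read_apertus_config_tail(f):
    # one int32: num_key_value_heads
    (num_key_value_heads,) = struct.unpack("<i", f.read(4))
    # "<fifff": rope_theta (f32), original_max_position_embeddings (i32),
    # then factor, low_freq_factor, high_freq_factor (f32 each) = 20 bytes total
    rope_theta, orig_max_pos, factor, low_freq, high_freq = struct.unpack("<fifff", f.read(20))
    return {
        'num_key_value_heads': num_key_value_heads,
        'rope_theta': rope_theta,
        'original_max_position_embeddings': orig_max_pos,
        'factor': factor,
        'low_freq_factor': low_freq,
        'high_freq_factor': high_freq,
    }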
@@ -8046,6 +8126,8 @@ def main():
         GptOssConverter.convert(config, model_files, vocab, ggml_type, args.save_path)
     elif arch == 'SeedOssForCausalLM':
         SeedOSSConverter.convert(config, model_files, vocab, ggml_type, args.save_path)
+    elif arch == 'ApertusForCausalLM':
+        ApertusConverter.convert(config, model_files, vocab, ggml_type, args.save_path)
     elif arch == 'deepseek-r1-distill-qwen3':
         QWen3Converter.MODEL_TYPE = ModelType.DeepSeek_R1_Distill_QWen3
         QWen3Converter.convert(config, model_files, vocab, ggml_type, args.save_path)
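
The new branch fires when the checkpoint reports the `ApertusForCausalLM` architecture. A minimal pre-check sketch, assuming `arch` is derived from the standard `architectures` field of a Hugging Face `config.json` (the helper below is illustrative and not part of convert.py):

import json
from pathlib import Path

def reports_apertus(model_dir: str) -> bool:
    # Hugging Face checkpoints name their model class in config.json's "architectures" list
    cfg = json.loads((Path(model_dir) / 'config.json').read_text())
    return 'ApertusForCausalLM' in cfg.get('architectures', [])

print(reports_apertus('./Apertus-8B-Instruct-2509'))   # path to a local clone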

docs/models.md

Lines changed: 5 additions & 0 deletions
@@ -5,6 +5,11 @@
 * Adept Persimmon (`PersimmonForCausalLM`)
     * [x] [Chat-8B](https://huggingface.co/adept/persimmon-8b-chat)
 
+* Apertus (`ApertusForCausalLM`)
+    * [x] [8B-Instruct-2509](https://huggingface.co/swiss-ai/Apertus-8B-Instruct-2509/tree/9579a2de0fd74118ba3f3714cdc13585607762a8), [70B-Instruct-2509](https://huggingface.co/swiss-ai/Apertus-70B-Instruct-2509/tree/69eeb773ba2dadf22ab9d2dbebd5771bdfdcc1e8)
+
+    Note: Use `--set enable-thinking 1` to enable thinking.
+
 * Apriel (`AprielForCausalLM`)
     * [x] [Instruct-5B](https://huggingface.co/ServiceNow-AI/Apriel-5B-Instruct/tree/a9a4831718a2fad437f25ace0d0259953fcaaa26)
 