
Commit f1cf813 (parent ffe6abf)

Support Ling & Ring 2.0.

File tree

12 files changed: +385, −25 lines


README.md

Lines changed: 1 addition & 0 deletions

@@ -31,6 +31,7 @@
 **What's New:**
 
+* 2025-09-15: Ling/Ring-mini-2.0
 * 2025-09-08: GroveMoE
 * 2025-09-03: Apertus
 * 2025-08-22: Seed-OSS

convert.py

Lines changed: 76 additions & 0 deletions

@@ -217,6 +217,8 @@ class ModelType(Enum):
     GroveMoE      = 0x2D00
 
+    BailingMoE2   = 0x2E00
+
     BCE_Embedding = 0x10000100
     BCE_ReRanker  = 0x10000101
     BGE_M3        = 0x10000102
@@ -6374,6 +6376,78 @@ def dump_config(f, config, ggml_type):
     def get_weight_names(config):
         return DeepSeekV1Converter.get_weight_names(config)
 
+class BailingMoe2Converter(BaseConverter):
+    MODEL_TYPE = ModelType.BailingMoE2
+
+    @classmethod
+    def state_dict_pp(cls, config, state_dict):
+        r = {}
+        for name in state_dict:
+            tensor: torch.Tensor = state_dict[name]
+            if name == 'model.word_embeddings.weight':
+                r['model.embed_tokens.weight'] = tensor
+            elif name == "lm_head.weight":
+                if config.norm_head:
+                    tensor = tensor / (torch.norm(tensor, p=2, dim=0, keepdim=True) + 1e-7)
+                r[name] = tensor
+            elif name.endswith('query_key_value.weight'):
+                head_dim = config.head_dim
+                num_heads = config.num_attention_heads
+                num_key_value_heads = config.num_key_value_heads
+
+                q, k, v = tensor.split([num_heads * head_dim, num_key_value_heads * head_dim, num_key_value_heads * head_dim], dim=-2)
+
+                r[name.replace('attention.query_key_value', 'self_attn.q_proj')] = q
+                r[name.replace('attention.query_key_value', 'self_attn.k_proj')] = k
+                r[name.replace('attention.query_key_value', 'self_attn.v_proj')] = v
+
+            elif name.endswith('attention.dense.weight'):
+                r[name.replace('attention.dense', 'self_attn.o_proj')] = tensor
+            elif name.endswith('attention.query_layernorm.weight'):
+                r[name.replace('attention.query_layernorm', 'self_attn.q_norm')] = tensor
+            elif name.endswith('attention.key_layernorm.weight'):
+                r[name.replace('attention.key_layernorm', 'self_attn.k_norm')] = tensor
+            else:
+                r[name] = tensor
+        return r
+
+    @staticmethod
+    def dump_config(f, config, ggml_type):
+        assert config.rope_scaling is None
+        assert config.use_qk_norm
+        assert config.moe_router_enable_expert_bias
+        assert (config.num_nextn_predict_layers is None) or (config.num_nextn_predict_layers == 0)
+        assert config.moe_shared_expert_intermediate_size == config.moe_intermediate_size
+
+        BailingMoeConverter.dump_config(f, config, ggml_type)
+
+        config_values = [
+            int(config.head_dim * config.partial_rotary_factor),
+            config.n_group,
+            config.topk_group,
+            config.routed_scaling_factor,
+        ]
+        f.write(struct.pack("<iiif", *config_values))
+
+    @staticmethod
+    def get_weight_names(config):
+        weight_names = BailingMoeConverter.get_weight_names(config)
+        for i in range(config.num_hidden_layers):
+            weight_names += [
+                f"model.layers.{i}.self_attn.k_norm.weight",
+                f"model.layers.{i}.self_attn.q_norm.weight",
+            ]
+
+            if (config.n_routed_experts is not None
+                and (i >= config.first_k_dense_replace)
+                and (i % config.moe_layer_freq == 0)):
+                weight_names += [
+                    f"model.layers.{i}.mlp.gate.expert_bias",
+                ]
+        return weight_names
+
 class DeepSeekV2Converter(BaseConverter):
     MODEL_TYPE = ModelType.DeepSeekV2Light
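For reference, the fused `query_key_value.weight` handled above stores its output rows laid out as [Q | K | V], which is why a single split along dim=-2 recovers the three projections. A minimal sketch with made-up dimensions (the values below are hypothetical, not Ling-mini-2.0's actual config):

    import torch

    head_dim, num_heads, num_kv_heads = 128, 16, 4   # hypothetical
    hidden_size = 2048
    fused = torch.randn((num_heads + 2 * num_kv_heads) * head_dim, hidden_size)

    # dim=-2 is the output-row dimension, so the split yields Q, K, V in order
    q, k, v = fused.split([num_heads * head_dim,
                           num_kv_heads * head_dim,
                           num_kv_heads * head_dim], dim=-2)
    assert q.shape == (num_heads * head_dim, hidden_size)
    assert k.shape == (num_kv_heads * head_dim, hidden_size)
    assert v.shape == (num_kv_heads * head_dim, hidden_size)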

@@ -8161,6 +8235,8 @@ def main():
         AquilaConverter.convert(config, model_files, vocab, ggml_type, args.save_path)
     elif arch == 'BailingMoeForCausalLM':
         BailingMoeConverter.convert(config, model_files, vocab, ggml_type, args.save_path)
+    elif arch == 'BailingMoeV2ForCausalLM':
+        BailingMoe2Converter.convert(config, model_files, vocab, ggml_type, args.save_path)
     elif arch == 'AprielForCausalLM':
         AprielConverter.convert(config, model_files, vocab, ggml_type, args.save_path)
     elif arch in ['Qwen3MoeForCausalLM', 'Qwen3ForCausalLM']:
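The four extra fields `dump_config` appends after the BailingMoE v1 header use the format string `"<iiif"`: three little-endian 32-bit integers (the rotary dim, `n_group`, `topk_group`) followed by one 32-bit float (`routed_scaling_factor`), 16 bytes total. A quick round-trip check with illustrative values only:

    import struct

    blob = struct.pack("<iiif", 64, 8, 4, 2.5)   # illustrative values
    assert len(blob) == 16
    assert struct.unpack("<iiif", blob) == (64, 8, 4, 2.5)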

docs/binding.md

Lines changed: 2 additions & 0 deletions

@@ -157,6 +157,8 @@ Examples:
 ![](code_highlight.png)
 
+Note [a bug](https://github.com/nim-lang/Nim/pull/25105) in the Nim standard library.
+
 ### Others
 
 * [V-lang/VB.net/C#](https://github.com/foldl/chatllm.cpp/issues/41)

docs/models.md

Lines changed: 1 addition & 0 deletions

@@ -119,6 +119,7 @@
 * Ling/Ring (`BailingMoeForCausalLM`)
     * [x] [Lite](https://huggingface.co/inclusionAI/Ling-lite/tree/a80ae6c479251f1ae33dda517ab83cdc6a312f99), [Coder-Lite](https://huggingface.co/inclusionAI/Ling-Coder-lite/tree/4a8647acf9d3855d599adaaaf4bf6ca14239d2ab)
     * [x] v1.5: [Ling-lite-1.5-2507](https://huggingface.co/inclusionAI/Ling-lite-1.5-2507/tree/6656efdc763a77102207fc66b176e4c5d07a316b), [Ring-lite-2507](https://huggingface.co/inclusionAI/Ring-lite-2507/commit/8cf0ec244871c90102b353cef3568e061fd2504f)
+    * [x] v2: [Ling-mini-2.0](https://huggingface.co/inclusionAI/Ling-mini-2.0/tree/56c261e07b78d95dad61336fcbdb21ef4fdbcabe), [Ring-mini-2.0](https://huggingface.co/inclusionAI/Ring-mini-2.0/tree/d4eac003b34b59b733f05039a876616d840a37d6)
 
 * LlaMA-like (`LlamaForCausalLM`, `Llama4ForConditionalGeneration`):
     * [x] All LlaMA-1 models

models/bailing.cpp

Lines changed: 167 additions & 1 deletion

@@ -1,4 +1,5 @@
 #include "deepseek.h"
+#include "qwen.h"
 
 namespace chatllm::bailing::moe
 {
@@ -90,6 +91,171 @@ namespace chatllm::bailing::moe
         : deepseek::v1_moe::ConditionalGeneration0<NUM_EXPERTS, EXPERTS_PER_TOK, EXPERTS_PER_TOK>(config, runtime_config, MODEL_TYPE_BAILINGMOE, config.head_dim)
     {}
 };
+}
 
-REGISTER_MODEL_LOADER(BAILINGMOE, bailing::moe, 1);
+namespace chatllm::bailing2::moe
+{
+    struct Config : public bailing::moe::Config
+    {
+        int rope_dim;
+        int n_group;
+        int topk_group;
+        float routed_scaling_factor;
+    };
+
+    typedef bailing::moe::Tokenizer Tokenizer;
+
+    const int NUM_EXPERTS = 256;
+    const int EXPERTS_PER_TOK = 8;
+
+    class BailingSparseMoE : public BaseSparseMLP
+    {
+    public:
+        BailingSparseMoE(InitContext *ctx, int hidden_size, int intermediate_size, int num_experts = NUM_EXPERTS, int experts_per_tok = EXPERTS_PER_TOK)
+            : BaseSparseMLP(ctx, hidden_size, intermediate_size, num_experts, experts_per_tok, ActFunc::SILU, true),
+              n_group(-1), topk_group(-1)
+        {
+            score_func = ScoreFunc::Sigmoid;
+            always_scaling = true;
+        }
+    protected:
+        ggml::tensor *select_experts(ComputeContext *ctx, ggml::tensor *corrected_score) override;
+
+    public:
+        int n_group;
+        int topk_group;
+    };
+
+    ggml::tensor *BailingSparseMoE::select_experts(ComputeContext *ctx, ggml::tensor *corrected_score)
+    {
+        const int n_expert = num_local_experts;
+        const int experts_per_group = n_expert / n_group;
+        CHATLLM_CHECK(ggml::get_dim(corrected_score, 2) == 1);
+
+        ggml::tensor *selected_experts = nullptr;
+
+        ggml::tensor *grouped_scores = ggml::reshape_4d(ctx, corrected_score, experts_per_group, num_experts_per_tok,
+                                                        ggml::get_dim(corrected_score, 1), ggml::get_dim(corrected_score, 2));
+        selected_experts = ggml::top_k(ctx, grouped_scores, topk_group);
+
+        ggml::tensor *selected_experts_i64 = ggml::cast_int_to_i64(ctx, selected_experts);
+
+        CHATLLM_CHECK(ggml::get_dim(grouped_scores, 3) == 1);
+        grouped_scores = ggml::reshape_4d(ctx, grouped_scores, 1, ggml::get_dim(grouped_scores, 0), ggml::get_dim(grouped_scores, 1), ggml::get_dim(grouped_scores, 2));
+        ggml::tensor *selected_group_scores = ggml::scale(ctx, grouped_scores, 0.0f);
+        grouped_scores = ggml::get_rows(ctx, grouped_scores, selected_experts);
+        selected_group_scores = ggml::set_rows(ctx, selected_group_scores, selected_experts_i64, grouped_scores);
+
+        selected_group_scores = ggml::reshape_3d(ctx, selected_group_scores,
+                                                 ggml::get_dim(corrected_score, 0), ggml::get_dim(corrected_score, 1), ggml::get_dim(corrected_score, 2));
+
+        selected_experts = ggml::top_k(ctx, selected_group_scores, num_experts_per_tok);
+
+        return selected_experts;
+    }
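In plain terms, `select_experts` keeps only the best `topk_group` scores inside each expert group, zeroes out the rest (safe here because the sigmoid gate scores are non-negative), then takes a global top-k over the masked scores. A rough PyTorch sketch of the same selection, for orientation only; it is not the actual ggml kernel, and the example sizes are hypothetical:

    import torch

    def group_limited_topk(scores, n_group, topk_group, top_k):
        # scores: (n_tokens, n_experts), non-negative gate scores
        n_tokens, n_experts = scores.shape
        grouped = scores.view(n_tokens, n_group, n_experts // n_group)
        idx = grouped.topk(topk_group, dim=-1).indices          # best scores per group
        kept = torch.zeros_like(grouped).scatter(-1, idx, grouped.gather(-1, idx))
        # global top-k over the masked, flattened scores
        return kept.view(n_tokens, n_experts).topk(top_k, dim=-1).indices

    # e.g. 256 experts in 8 groups, keep 4 per group (hypothetical), route to 8
    sel = group_limited_topk(torch.rand(2, 256), 8, 4, 8)
    assert sel.shape == (2, 8)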
+
+    class ConditionalGeneration : public BaseModelForConditionalGeneration
+    {
+    public:
+        typedef CombinedMLP<BailingSparseMoE, SiLUMLP> BailingMoEMLP;
+        typedef LMBlock1<RMSNorm, qwen::v3::QWen3SelfAttention, RMSNorm, BailingMoEMLP> BailingMoEBlock;
+        typedef BaseModelForConditionalGeneration Base;
+        typedef HeterogeneousModel ModelClass;
+    public:
+        ConditionalGeneration(const Config &config, const RuntimeConfig &runtime_config, ModelType type = MODEL_TYPE_BAILING_MOE2)
+            : BaseModelForConditionalGeneration(type, config, runtime_config, 4096 * 4),
+              config(config)
+        {
+            const size_t tensor_ovhd = ggml_tensor_overhead();
+            const int moe_layer_num = get_moe_layer_num();
+            const int dense_layer_num = config.num_hidden_layers - moe_layer_num;
+            const size_t num_tensors = 3
+                + moe_layer_num * (12 + 7)
+                + dense_layer_num * 14;
+            const size_t ctx_size = num_tensors * tensor_ovhd;
+            w_ctx_.gctx = GGMLContext({.mem_size = ctx_size, .mem_buffer = nullptr, .no_alloc = true});
+            w_ctx_.dtype = config.dtype;
+
+            CHATLLM_CHECK((NUM_EXPERTS == config.n_routed_experts)
+                          && (EXPERTS_PER_TOK == config.num_experts_per_tok))
+                << "unsupported MoE param";
+
+#define config_rope(attention) do { \
+            attention.freq_base = config.rope_theta; \
+            attention.rope_dim  = config.rope_dim;   \
+        } while (false)
+
+            auto create_layer = [&](InitContext *ctx, int layer_index) -> Block * {
+                if (is_layer_moe(layer_index))
+                {
+                    auto layer = new BailingMoEBlock(ctx, config.hidden_size, config.num_attention_heads, config.intermediate_size,
+                                                     config.moe_intermediate_size, config.moe_intermediate_size * config.n_shared_experts,
+                                                     config.num_key_value_heads,
+                                                     config.head_dim,
+                                                     config.max_length);
+                    layer->mlp.mlp1.norm_topk_prob = config.norm_topk_prob != 0;
+                    layer->mlp.mlp1.routed_scaling_factor = config.routed_scaling_factor;
+                    layer->mlp.mlp1.n_group = config.n_group;
+                    layer->mlp.mlp1.topk_group = config.topk_group;
+                    config_rope(layer->attention);
+                    return layer;
+                }
+                else
+                {
+                    auto layer = new qwen::v3::QWen3Block(ctx, config.hidden_size, config.num_attention_heads, config.intermediate_size,
+                                                          config.num_key_value_heads, config.head_dim, config.max_length);
+                    config_rope(layer->attention);
+                    return layer;
+                }
+            };
+
+            auto transformer = new ModelClass(&w_ctx_, config.num_hidden_layers, config.hidden_size,
+                                              create_embedding<Embedding>(&w_ctx_, config),
+                                              create_final_norm<RMSNorm>(&w_ctx_, config),
+                                              create_lm_head(&w_ctx_, config, false), create_layer);
+
+            Base::transformer = transformer;
+
+#undef config_rope
+
+            w_ctx_.check_used_mem_size(true);
+        }
+
+        void load(ModelLoader &loader) override
+        {
+            loader.add_tensor_name_translations({
+                {".mlp2.",                           ".shared_experts."},
+                {".mlp1.gate.",                      ".gate."},
+                {".mlp1.experts.",                   ".experts."},
+                {".mlp1.gate_score_correction_bias", ".gate.expert_bias"},
+            });
+
+            BaseModelForConditionalGeneration::load(loader);
+        }
+
+    public:
+        const Config config;
+
+        bool is_layer_moe(int layer_index)
+        {
+            return (layer_index >= config.first_k_dense_replace) && (layer_index % config.moe_layer_freq == 0);
+        }
+
+        int get_moe_layer_num()
+        {
+            int r = 0;
+            for (int i = 0; i < config.num_hidden_layers; i++)
+            {
+                if (is_layer_moe(i))
+                    r++;
+            }
+            return r;
+        }
+    };
+}
+
+namespace chatllm
+{
+    REGISTER_MODEL_LOADER(BAILINGMOE,   bailing::moe,  1);
+    REGISTER_MODEL_LOADER(BAILING_MOE2, bailing2::moe, 1);
 }
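The translation table in `load` maps the internal `CombinedMLP` member names (`mlp1` for the routed experts, `mlp2` for the shared expert) onto the tensor names emitted by convert.py. A minimal sketch of the effect, assuming plain substring substitution (the `to_file_name` helper is hypothetical; the loader's actual matching may differ):

    translations = [
        (".mlp2.",                           ".shared_experts."),
        (".mlp1.gate.",                      ".gate."),
        (".mlp1.experts.",                   ".experts."),
        (".mlp1.gate_score_correction_bias", ".gate.expert_bias"),
    ]

    def to_file_name(internal: str) -> str:
        # hypothetical helper: apply the first matching substitution
        for old, new in translations:
            if old in internal:
                return internal.replace(old, new)
        return internal

    assert to_file_name("model.layers.1.mlp.mlp1.gate_score_correction_bias") == \
           "model.layers.1.mlp.gate.expert_bias"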

models/qwen.cpp

Lines changed: 3 additions & 7 deletions (QWen3Block's definition moves to qwen.h so that bailing2::moe can reuse it; only the constructor stays here)

@@ -1334,13 +1334,9 @@ namespace chatllm::qwen::v3
 }
 
-class QWen3Block : public LMBlock1<RMSNorm, QWen3SelfAttention, RMSNorm, SiLUMLP>
-{
-public:
-    QWen3Block(InitContext *ctx, int hidden_size, int num_attention_heads, int intermediate_size, int num_kv_heads, int head_dim, int max_length)
-        : LMBlock1(ctx, hidden_size, num_attention_heads, intermediate_size, num_kv_heads, head_dim, max_length)
-    {}
-};
+QWen3Block::QWen3Block(InitContext *ctx, int hidden_size, int num_attention_heads, int intermediate_size, int num_kv_heads, int head_dim, int max_length)
+    : LMBlock1(ctx, hidden_size, num_attention_heads, intermediate_size, num_kv_heads, head_dim, max_length)
+{}
 
 template <int NUM_EXPERTS, int EXPERTS_PER_TOK> class QWen3MoEBlock : public LMBlock1<RMSNorm, QWen3SelfAttention, RMSNorm, v2_moe::QWenSparseMoE<NUM_EXPERTS, EXPERTS_PER_TOK>>
 {

models/qwen.h

Lines changed: 6 additions & 0 deletions

@@ -551,6 +551,12 @@ namespace chatllm::qwen
     QWen3SelfAttention(InitContext *ctx, int hidden_size, int num_attention_heads, int num_kv_heads, int head_dim, int max_length);
 };
 
+class QWen3Block : public LMBlock1<RMSNorm, QWen3SelfAttention, RMSNorm, SiLUMLP>
+{
+public:
+    QWen3Block(InitContext *ctx, int hidden_size, int num_attention_heads, int intermediate_size, int num_kv_heads, int head_dim, int max_length);
+};
+
 class ConditionalGeneration : public BaseModelForConditionalGeneration
 {
 public:

scripts/models.json

Lines changed: 32 additions & 0 deletions

@@ -2706,6 +2706,22 @@
             }
         }
     },
+    "ling2": {
+        "brief": "Ling 2.0 — a family of MoE-based large language models that combine SOTA performance with high efficiency.",
+        "default": "mini",
+        "license": "MIT",
+        "variants": {
+            "mini": {
+                "default": "q8",
+                "quantized": {
+                    "q8": {
+                        "size": 17277819184,
+                        "url": "chatllm_quantized_bailing/ling-mini-2.bin"
+                    }
+                }
+            }
+        }
+    },
     "ring": {
         "brief": "Ring-lite is a lightweight, fully open-sourced MoE (Mixture of Experts) LLM designed for complex reasoning tasks.",
         "default": "lite-2507",
@@ -2722,6 +2738,22 @@
             }
         }
     },
+    "ring2": {
+        "brief": "Ring-mini-2.0 — a high-performance inference-oriented MoE model deeply optimized based on the Ling 2.0 architecture.",
+        "default": "mini",
+        "license": "MIT",
+        "variants": {
+            "mini": {
+                "default": "q8",
+                "quantized": {
+                    "q8": {
+                        "size": 17277815376,
+                        "url": "chatllm_quantized_bailing/ring-mini-2.bin"
+                    }
+                }
+            }
+        }
+    },
     "openhands-lm": {
         "brief": "OpenHands LM is built on the foundation of Qwen Coder 2.5 Instruct 32B, leveraging its powerful base capabilities for coding tasks.",
         "default": "32b",
