 #include "deepseek.h"
+#include "qwen.h"
 
 namespace chatllm::bailing::moe
 {
@@ -90,6 +91,171 @@ namespace chatllm::bailing::moe
             : deepseek::v1_moe::ConditionalGeneration0<NUM_EXPERTS, EXPERTS_PER_TOK, EXPERTS_PER_TOK>(config, runtime_config, MODEL_TYPE_BAILINGMOE, config.head_dim)
         {}
     };
+}
 
-    REGISTER_MODEL_LOADER(BAILINGMOE, bailing::moe, 1);
+namespace chatllm::bailing2::moe
+{
+    struct Config : public bailing::moe::Config
+    {
+        int   rope_dim;              // rotary embedding dimensions
+        int   n_group;               // number of expert groups for routing
+        int   topk_group;            // per-group top-k (see BailingSparseMoE::select_experts)
+        float routed_scaling_factor; // scale applied to the routed experts' output
+    };
+
+    typedef bailing::moe::Tokenizer Tokenizer;
+
+    // fixed MoE shape (256 routed experts, 8 active per token); validated
+    // against the loaded model config in ConditionalGeneration's constructor
+    const int NUM_EXPERTS     = 256;
+    const int EXPERTS_PER_TOK = 8;
+
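+    // Sparse MoE block for Bailing MoE2: sigmoid gating with score correction
+    // (ScoreFunc::Sigmoid, always_scaling) and group-limited expert selection.
+    // n_group / topk_group start out as -1 and are filled in from the model
+    // config by ConditionalGeneration below.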
+    class BailingSparseMoE : public BaseSparseMLP
+    {
+    public:
+        BailingSparseMoE(InitContext *ctx, int hidden_size, int intermediate_size, int num_experts = NUM_EXPERTS, int experts_per_tok = EXPERTS_PER_TOK)
+            : BaseSparseMLP(ctx, hidden_size, intermediate_size, num_experts, experts_per_tok, ActFunc::SILU, true),
+              n_group(-1), topk_group(-1)
+        {
+            score_func = ScoreFunc::Sigmoid;
+            always_scaling = true;
+        }
+    protected:
+        ggml::tensor *select_experts(ComputeContext *ctx, ggml::tensor *corrected_score) override;
+
+    public:
+        int n_group;
+        int topk_group;
+    };
+
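+    // Group-limited routing in the spirit of DeepSeek-V3: view the corrected
+    // scores of all routed experts as `n_group` consecutive groups, keep only
+    // the `topk_group` best scores inside each group (zeroing the rest), then
+    // run a global top-k to pick the final `num_experts_per_tok` experts.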
+    ggml::tensor *BailingSparseMoE::select_experts(ComputeContext *ctx, ggml::tensor *corrected_score)
+    {
+        const int n_expert = num_local_experts;
+        const int experts_per_group = n_expert / n_group;
+        CHATLLM_CHECK(ggml::get_dim(corrected_score, 2) == 1);
+
+        ggml::tensor *selected_experts = nullptr;
+
+        // view scores as [experts_per_group, n_group, n_tokens, 1] and pick the
+        // top `topk_group` entries within each group
+        ggml::tensor *grouped_scores = ggml::reshape_4d(ctx, corrected_score, experts_per_group, n_group,
+                                            ggml::get_dim(corrected_score, 1), ggml::get_dim(corrected_score, 2));
+        selected_experts = ggml::top_k(ctx, grouped_scores, topk_group);
+
+        ggml::tensor *selected_experts_i64 = ggml::cast_int_to_i64(ctx, selected_experts);
+
+        // scatter the surviving scores into an all-zero copy, masking out every
+        // expert that did not make its group's top-k
+        CHATLLM_CHECK(ggml::get_dim(grouped_scores, 3) == 1);
+        grouped_scores = ggml::reshape_4d(ctx, grouped_scores, 1, ggml::get_dim(grouped_scores, 0), ggml::get_dim(grouped_scores, 1), ggml::get_dim(grouped_scores, 2));
+        ggml::tensor *selected_group_scores = ggml::scale(ctx, grouped_scores, 0.0f);
+        grouped_scores = ggml::get_rows(ctx, grouped_scores, selected_experts);
+        selected_group_scores = ggml::set_rows(ctx, selected_group_scores, selected_experts_i64, grouped_scores);
+
+        selected_group_scores = ggml::reshape_3d(ctx, selected_group_scores,
+            ggml::get_dim(corrected_score, 0), ggml::get_dim(corrected_score, 1), ggml::get_dim(corrected_score, 2));
+
+        // final routing decision: a global top-k over the group-masked scores
+        selected_experts = ggml::top_k(ctx, selected_group_scores, num_experts_per_tok);
+
+        return selected_experts;
+    }
+
+    class ConditionalGeneration : public BaseModelForConditionalGeneration
+    {
+    public:
+        typedef CombinedMLP<BailingSparseMoE, SiLUMLP> BailingMoEMLP;
+        typedef LMBlock1<RMSNorm, qwen::v3::QWen3SelfAttention, RMSNorm, BailingMoEMLP> BailingMoEBlock;
+        typedef BaseModelForConditionalGeneration Base;
+        typedef HeterogeneousModel ModelClass;
+    public:
+        ConditionalGeneration(const Config &config, const RuntimeConfig &runtime_config, ModelType type = MODEL_TYPE_BAILING_MOE2)
+            : BaseModelForConditionalGeneration(type, config, runtime_config, 4096 * 4),
+              config(config)
+        {
+            const size_t tensor_ovhd = ggml_tensor_overhead();
+            const int moe_layer_num = get_moe_layer_num();
+            const int dense_layer_num = config.num_hidden_layers - moe_layer_num;
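+            // size the weight context exactly: 3 top-level tensors (embedding,
+            // final norm, lm_head) plus a fixed number of weight tensors per
+            // dense and per MoE layer; check_used_mem_size(true) at the end of
+            // the constructor verifies this accounting.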
+            const size_t num_tensors = 3
+                + moe_layer_num * (12 + 7)
+                + dense_layer_num * 14;
+            const size_t ctx_size = num_tensors * tensor_ovhd;
+            w_ctx_.gctx = GGMLContext({.mem_size = ctx_size, .mem_buffer = nullptr, .no_alloc = true});
+            w_ctx_.dtype = config.dtype;
+
+            CHATLLM_CHECK((NUM_EXPERTS == config.n_routed_experts)
+                        && (EXPERTS_PER_TOK == config.num_experts_per_tok))
+                << "unsupported MoE param";
+
+            #define config_rope(attention) do { \
+                attention.freq_base = config.rope_theta; \
+                attention.rope_dim  = config.rope_dim;   \
+            } while (false)
+
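+            // heterogeneous layer factory: MoE layers pair the routed
+            // BailingSparseMoE with a shared-expert SiLUMLP via CombinedMLP,
+            // while all other layers are plain QWen3 dense blocks; both kinds
+            // share the same RoPE configuration.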
+            auto create_layer = [&](InitContext *ctx, int layer_index) -> Block * {
+                if (is_layer_moe(layer_index))
+                {
+                    auto layer = new BailingMoEBlock(ctx, config.hidden_size, config.num_attention_heads, config.intermediate_size,
+                                        config.moe_intermediate_size, config.moe_intermediate_size * config.n_shared_experts,
+                                        config.num_key_value_heads,
+                                        config.head_dim,
+                                        config.max_length);
+                    layer->mlp.mlp1.norm_topk_prob        = config.norm_topk_prob != 0;
+                    layer->mlp.mlp1.routed_scaling_factor = config.routed_scaling_factor;
+                    layer->mlp.mlp1.n_group               = config.n_group;
+                    layer->mlp.mlp1.topk_group            = config.topk_group;
+                    config_rope(layer->attention);
+                    return layer;
+                }
+                else
+                {
+                    auto layer = new qwen::v3::QWen3Block(ctx, config.hidden_size, config.num_attention_heads, config.intermediate_size,
+                                        config.num_key_value_heads, config.head_dim, config.max_length);
+                    config_rope(layer->attention);
+                    return layer;
+                }
+            };
+
+            auto transformer = new ModelClass(&w_ctx_, config.num_hidden_layers, config.hidden_size,
+                                    create_embedding<Embedding>(&w_ctx_, config),
+                                    create_final_norm<RMSNorm>(&w_ctx_, config),
+                                    create_lm_head(&w_ctx_, config, false), create_layer);
+
+            Base::transformer = transformer;
+
+            #undef config_rope
+
+            w_ctx_.check_used_mem_size(true);
+        }
+
+        void load(ModelLoader &loader) override
+        {
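+            // map chatllm's internal module names (CombinedMLP exposes the
+            // routed experts as .mlp1 and the shared experts as .mlp2) back to
+            // the tensor names used in the converted model file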
+            loader.add_tensor_name_translations({
+                {".mlp2.",                           ".shared_experts."},
+                {".mlp1.gate.",                      ".gate."},
+                {".mlp1.experts.",                   ".experts."},
+                {".mlp1.gate_score_correction_bias", ".gate.expert_bias"},
+            });
+
+            BaseModelForConditionalGeneration::load(loader);
+        }
+
+    public:
+        const Config config;
+
+        // the first `first_k_dense_replace` layers stay dense; after that,
+        // every `moe_layer_freq`-th layer is a MoE layer
+        bool is_layer_moe(int layer_index)
+        {
+            return (layer_index >= config.first_k_dense_replace) && (layer_index % config.moe_layer_freq == 0);
+        }
+
+        int get_moe_layer_num()
+        {
+            int r = 0;
+            for (int i = 0; i < config.num_hidden_layers; i++)
+            {
+                if (is_layer_moe(i))
+                    r++;
+            }
+            return r;
+        }
+    };
+}
+
+namespace chatllm
+{
+    REGISTER_MODEL_LOADER(BAILINGMOE,   bailing::moe,  1);
+    REGISTER_MODEL_LOADER(BAILING_MOE2, bailing2::moe, 1);
 }