Commit 4966a97

refactor: share a single LmHead class across NPU and other hardware.

1 parent 1a5e2f0 commit 4966a97

17 files changed: +109 −126 lines
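The heart of the change is in `xllm/core/layers/lm_head.h`: the NPU-specific holder, previously also named `LmHead`, becomes `NpuLmHead`, and the `ColumnParallelLinearImpl`-backed `LmHead` becomes the single definition shared by NPU, MLU, and GPU builds. In outline, the header after this commit reads (bodies elided):

```cpp
// Outline of xllm/core/layers/lm_head.h after the commit.
#if defined(USE_NPU)
// NPU-specific holder, formerly named LmHead.
class NpuLmHead : public torch::nn::ModuleHolder<NpuLmHeadImpl> { /* ... */ };
#endif

// The one shared definition, now compiled on every backend.
class LmHead : public torch::nn::ModuleHolder<ColumnParallelLinearImpl> { /* ... */ };
```

Call sites that still need the NPU-only head accessors are wrapped in `#if defined(USE_NPU)` throughout the rest of the diff.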

xllm/core/framework/model/causal_lm.h

Lines changed: 10 additions & 6 deletions
```diff
@@ -66,11 +66,13 @@ class CausalLM : public torch::nn::Module {
 
   virtual const torch::TensorOptions& options() const = 0;
 
-  virtual layer::LmHead get_lm_head() = 0;
-  virtual void set_lm_head(layer::LmHead& head) = 0;
+#if defined(USE_NPU)
+  virtual layer::NpuLmHead get_lm_head() = 0;
+  virtual void set_lm_head(layer::NpuLmHead& head) = 0;
 
   virtual std::vector<layer::WordEmbedding> get_word_embedding() = 0;
   virtual void set_word_embedding(
       std::vector<layer::WordEmbedding>& embedding) = 0;
+#endif
 };
 
 template <typename Model>
@@ -104,10 +106,12 @@ class CausalLMImpl : public CausalLM {
   virtual void update_expert_weight(int32_t layer_id) {
     return model_->update_expert_weight(layer_id);
   }
+#if defined(USE_NPU)
+  layer::NpuLmHead get_lm_head() override { return model_->get_lm_head(); };
 
-  layer::LmHead get_lm_head() override { return model_->get_lm_head(); };
-
-  void set_lm_head(layer::LmHead& head) override { model_->set_lm_head(head); };
+  void set_lm_head(layer::NpuLmHead& head) override {
+    model_->set_lm_head(head);
+  };
 
   std::vector<layer::WordEmbedding> get_word_embedding() override {
     return model_->get_word_embedding();
@@ -117,7 +121,7 @@ class CausalLMImpl : public CausalLM {
       std::vector<layer::WordEmbedding>& embedding) override {
     model_->set_word_embedding(embedding);
   };
-
+#endif
   torch::Device device() const override { return options_.device(); }
 
   const torch::TensorOptions& options() const override { return options_; }
```
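Since the accessors now exist only on NPU builds, hardware-agnostic callers must guard their use the same way the interface does. A minimal sketch (the `copy_lm_head` helper is hypothetical, not part of this commit):

```cpp
#include "xllm/core/framework/model/causal_lm.h"

// Hypothetical helper: transplant the LM head from one model to another,
// mirroring what the speculative worker does for Deepseek MTP below.
void copy_lm_head(xllm::CausalLM& from, xllm::CausalLM& to) {
#if defined(USE_NPU)
  // The virtual accessors are declared only under USE_NPU.
  xllm::layer::NpuLmHead head = from.get_lm_head();
  to.set_lm_head(head);
#else
  // On other backends each model owns its shared layer::LmHead directly,
  // so there is nothing to transplant.
  (void)from;
  (void)to;
#endif
}
```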

xllm/core/framework/model/causal_vlm.h

Lines changed: 6 additions & 4 deletions
```diff
@@ -63,10 +63,12 @@ class CausalVLMImpl : public CausalVLM {
   }
 
   virtual void update_expert_weight(int32_t layer_id) { return; }
+#if defined(USE_NPU)
+  layer::NpuLmHead get_lm_head() override { return model_->get_lm_head(); };
 
-  layer::LmHead get_lm_head() override { return model_->get_lm_head(); };
-
-  void set_lm_head(layer::LmHead& head) override { model_->set_lm_head(head); };
+  void set_lm_head(layer::NpuLmHead& head) override {
+    model_->set_lm_head(head);
+  };
 
   std::vector<layer::WordEmbedding> get_word_embedding() override {
     return model_->get_word_embedding();
@@ -76,7 +78,7 @@ class CausalVLMImpl : public CausalVLM {
      std::vector<layer::WordEmbedding>& embedding) override {
     model_->set_word_embedding(embedding);
   };
-
+#endif
   torch::Device device() const override { return options_.device(); }
 
   const torch::TensorOptions& options() const override { return options_; }
```

xllm/core/layers/lm_head.h

Lines changed: 3 additions & 30 deletions
```diff
@@ -24,42 +24,16 @@ namespace xllm {
 namespace layer {
 
 #if defined(USE_NPU)
-class LmHead : public torch::nn::ModuleHolder<NpuLmHeadImpl> {
+class NpuLmHead : public torch::nn::ModuleHolder<NpuLmHeadImpl> {
  public:
   using torch::nn::ModuleHolder<NpuLmHeadImpl>::ModuleHolder;
   using Impl __attribute__((__unused__)) = NpuLmHeadImpl;
 
-  LmHead(const ModelContext& context)
+  NpuLmHead(const ModelContext& context)
       : ModuleHolder(std::make_shared<NpuLmHeadImpl>(context)) {}
 };
 
-/**
- * TODO: Rename the original LmHead definition to NpuLmHead,
- * and define the current one as LmHead to unify NPU's LmHead
- * related code with MLU and GPU
- */
-class LmHeadNative : public torch::nn::ModuleHolder<ColumnParallelLinearImpl> {
- public:
-  using torch::nn::ModuleHolder<ColumnParallelLinearImpl>::ModuleHolder;
-  using Impl __attribute__((__unused__)) = ColumnParallelLinearImpl;
-
-  LmHeadNative(int64_t in_features,
-               int64_t out_features,
-               bool bias,
-               bool gather_output,
-               const QuantArgs& quant_args,
-               const ParallelArgs& parallel_args,
-               const torch::TensorOptions& options)
-      : ModuleHolder(std::make_shared<ColumnParallelLinearImpl>(in_features,
-                                                                out_features,
-                                                                bias,
-                                                                gather_output,
-                                                                quant_args,
-                                                                parallel_args,
-                                                                options)) {}
-};
-
-#else
+#endif
 class LmHead : public torch::nn::ModuleHolder<ColumnParallelLinearImpl> {
  public:
   using torch::nn::ModuleHolder<ColumnParallelLinearImpl>::ModuleHolder;
@@ -80,7 +54,6 @@ class LmHead : public torch::nn::ModuleHolder<ColumnParallelLinearImpl> {
                                                                 parallel_args,
                                                                 options)) {}
 };
-#endif
 
 }  // namespace layer
 }  // namespace xllm
```
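With the `#else` removed, `layer::LmHead` keeps the constructor that `LmHeadNative` used to provide. A minimal construction sketch; the dimension values are placeholders, not taken from any real model config:

```cpp
#include <torch/torch.h>
#include "xllm/core/layers/lm_head.h"

// Build the shared LmHead on any backend. The parameter list matches the
// ColumnParallelLinearImpl-based constructor retained in lm_head.h.
xllm::layer::LmHead make_lm_head(const xllm::QuantArgs& quant_args,
                                 const xllm::ParallelArgs& parallel_args,
                                 const torch::TensorOptions& options) {
  const int64_t hidden_size = 4096;   // placeholder hidden dimension
  const int64_t vocab_size = 152064;  // placeholder vocabulary size
  return xllm::layer::LmHead(hidden_size,
                             vocab_size,
                             /*bias=*/false,
                             /*gather_output=*/true,
                             quant_args,
                             parallel_args,
                             options);
}
```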

xllm/core/runtime/acl_graph_executor_test.cpp

Lines changed: 3 additions & 3 deletions
```diff
@@ -234,12 +234,12 @@ class SimpleCausalLM : public CausalLM {
     // Simple implementation for testing
   }
 
-  layer::LmHead get_lm_head() override {
+  layer::NpuLmHead get_lm_head() override {
     // Simple implementation for testing
-    return layer::LmHead(nullptr);
+    return layer::NpuLmHead(nullptr);
   }
 
-  void set_lm_head(layer::LmHead& head) override {
+  void set_lm_head(layer::NpuLmHead& head) override {
     // Simple implementation for testing
   }
 
```

xllm/core/runtime/llm_worker_impl.h

Lines changed: 4 additions & 4 deletions
```diff
@@ -44,10 +44,10 @@ class LLMWorkerImpl : public WorkerImpl {
 
   std::optional<ForwardOutput> step(
       const BatchedForwardInputs& inputs) override;
+#if defined(USE_NPU)
+  layer::NpuLmHead get_lm_head() { return model_->get_lm_head(); };
 
-  layer::LmHead get_lm_head() { return model_->get_lm_head(); };
-
-  void set_lm_head(layer::LmHead& head) { model_->set_lm_head(head); };
+  void set_lm_head(layer::NpuLmHead& head) { model_->set_lm_head(head); };
 
   std::vector<layer::WordEmbedding> get_word_embedding() {
     return model_->get_word_embedding();
@@ -56,7 +56,7 @@ class LLMWorkerImpl : public WorkerImpl {
   void set_word_embedding(std::vector<layer::WordEmbedding>& embedding) {
     model_->set_word_embedding(embedding);
   };
-
+#endif
  private:
   std::unique_ptr<BeamSearcher> beam_searcher_;
 };
```

xllm/core/runtime/speculative_worker_impl.cpp

Lines changed: 2 additions & 1 deletion
```diff
@@ -108,14 +108,15 @@ bool SpeculativeWorkerImpl::init_model(const std::string& model_weights_path) {
     CHECK_EQ(draft_impl_->get_status(), WorkerImpl::Status::UNINITIALIZED);
     result = draft_impl_->WorkerImpl::init_model(model_weights_path);
   }
-
+#if defined(USE_NPU)
   if (draft_impl_->get_status() == WorkerImpl::Status::LOADED) {
     // Deepseek MTP
     auto head = impl_->get_lm_head();
     draft_impl_->set_lm_head(head);
     auto word_embedding = impl_->get_word_embedding();
     draft_impl_->set_word_embedding(word_embedding);
   }
+#endif
   return result;
 }
 
```
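The guard makes the Deepseek-MTP weight transplant an NPU-only code path; on other backends the block compiles away entirely. A self-contained sketch of the pattern with stand-in types (`FakeWorker` and `FakeHead` are illustrative, not xllm types):

```cpp
// Stand-ins exposing the same accessor pair the NPU interface declares.
struct FakeHead { int id = 0; };

struct FakeWorker {
  FakeHead head;
  FakeHead get_lm_head() { return head; }
  void set_lm_head(FakeHead& h) { head = h; }
};

// Mirrors SpeculativeWorkerImpl::init_model: share the target model's head
// with the draft (MTP) model, but only when the accessors exist.
void maybe_share_head(FakeWorker& target, FakeWorker& draft) {
#if defined(USE_NPU)
  auto head = target.get_lm_head();
  draft.set_lm_head(head);
#else
  (void)target;  // non-NPU builds: each model loads its own LmHead
  (void)draft;
#endif
}
```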

xllm/models/llm/deepseek_v2.h

Lines changed: 7 additions & 5 deletions
```diff
@@ -297,7 +297,7 @@ class DeepseekV2ForCausalLMImpl : public torch::nn::Module {
  public:
   DeepseekV2ForCausalLMImpl(const ModelContext& context) {
     model_ = register_module("model", DeepseekV2Model(context));
-    lm_head_ = register_module("lm_head", layer::LmHead(context));
+    lm_head_ = register_module("lm_head", layer::NpuLmHead(context));
     first_k_dense_replace_ = context.get_model_args().first_k_dense_replace();
   }
 
@@ -342,10 +342,10 @@ class DeepseekV2ForCausalLMImpl : public torch::nn::Module {
   void update_expert_weight(int32_t layer_id) {
     model_->update_expert_weight(layer_id + first_k_dense_replace_);
   }
+#if defined(USE_NPU)
+  layer::NpuLmHead get_lm_head() { return lm_head_; }
 
-  layer::LmHead get_lm_head() { return lm_head_; }
-
-  void set_lm_head(layer::LmHead& head) { lm_head_ = head; }
+  void set_lm_head(layer::NpuLmHead& head) { lm_head_ = head; }
 
   std::vector<layer::WordEmbedding> get_word_embedding() {
     return model_->get_word_embedding();
@@ -355,9 +355,11 @@ class DeepseekV2ForCausalLMImpl : public torch::nn::Module {
     model_->set_word_embedding(word_embedding);
   }
 
+ private:
+  layer::NpuLmHead lm_head_{nullptr};
+#endif
  private:
   DeepseekV2Model model_{nullptr};
-  layer::LmHead lm_head_{nullptr};
   int32_t first_k_dense_replace_;
 };
 TORCH_MODULE(DeepseekV2ForCausalLM);
```
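Note the member layout this and the following model classes adopt: `lm_head_` moves into its own access section inside the guard, so non-NPU builds never declare it, leaving the class with two `private:` labels, which is well-formed C++. A minimal, self-contained illustration (all type names are placeholders):

```cpp
// Placeholder types so the layout compiles in isolation.
struct NpuHead { void* impl = nullptr; };
struct Backbone { void* impl = nullptr; };

class ExampleForCausalLM {
 public:
  // ... forward(), load_model(), etc. ...
#if defined(USE_NPU)
  NpuHead get_lm_head() { return lm_head_; }
  void set_lm_head(NpuHead& head) { lm_head_ = head; }

 private:
  NpuHead lm_head_{nullptr};  // declared only on NPU builds
#endif

 private:  // repeating an access specifier is legal C++
  Backbone model_{nullptr};
};
```

The same pattern repeats in `deepseek_v2_mtp.h`, `embedding_model_base.h` (with `protected:`), and `glm4_moe.h` below.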

xllm/models/llm/deepseek_v2_mtp.h

Lines changed: 6 additions & 3 deletions
```diff
@@ -295,9 +295,10 @@ class DeepseekV2MtpForCausalLMImpl : public torch::nn::Module {
     return;
   }
   void update_expert_weight(int32_t layer_id) { return; }
-  layer::LmHead get_lm_head() { return lm_head_; }
+#if defined(USE_NPU)
+  layer::NpuLmHead get_lm_head() { return lm_head_; }
 
-  void set_lm_head(layer::LmHead& head) { lm_head_ = head; }
+  void set_lm_head(layer::NpuLmHead& head) { lm_head_ = head; }
 
   std::vector<layer::WordEmbedding> get_word_embedding() {
     return model_->get_word_embedding();
@@ -307,9 +308,11 @@ class DeepseekV2MtpForCausalLMImpl : public torch::nn::Module {
     model_->set_word_embedding(word_embedding);
   }
 
+ private:
+  layer::NpuLmHead lm_head_{nullptr};
+#endif
  private:
   DeepseekV2MtpModel model_{nullptr};
-  layer::LmHead lm_head_{nullptr};
 };
 TORCH_MODULE(DeepseekV2MtpForCausalLM);
 
```

xllm/models/llm/embedding_model_base.h

Lines changed: 6 additions & 4 deletions
```diff
@@ -73,10 +73,10 @@ class LlmForEmbeddingImplBase : public torch::nn::Module {
     return;
   }
   virtual void update_expert_weight(int32_t layer_id) { return; }
+#if defined(USE_NPU)
+  virtual layer::NpuLmHead get_lm_head() { return lm_head_; }
 
-  virtual layer::LmHead get_lm_head() { return lm_head_; }
-
-  virtual void set_lm_head(layer::LmHead& head) { lm_head_ = head; }
+  virtual void set_lm_head(layer::NpuLmHead& head) { lm_head_ = head; }
 
   virtual std::vector<layer::WordEmbedding> get_word_embedding() {
     return model_->get_word_embedding();
@@ -87,13 +87,15 @@ class LlmForEmbeddingImplBase : public torch::nn::Module {
     model_->set_word_embedding(word_embedding);
   }
 
+ protected:
+  layer::NpuLmHead lm_head_{nullptr};
+#endif
  protected:
   // parameter members, must be registered
   LlmModelType model_{nullptr};
   int device_id = 0;
   bool tie_word_embeddings{false};
   // test
-  layer::LmHead lm_head_{nullptr};
 };
 
 }  // namespace xllm
```

xllm/models/llm/glm4_moe.h

Lines changed: 7 additions & 5 deletions
```diff
@@ -254,7 +254,7 @@ class Glm4MoeForCausalLMImpl : public torch::nn::Module {
  public:
   Glm4MoeForCausalLMImpl(const ModelContext& context) {
     model_ = register_module("model", Glm4MoeModel(context));
-    lm_head_ = register_module("lm_head", layer::LmHead(context));
+    lm_head_ = register_module("lm_head", layer::NpuLmHead(context));
   }
 
   // tokens: [num_tokens]
@@ -296,10 +296,10 @@ class Glm4MoeForCausalLMImpl : public torch::nn::Module {
     return;
   }
   virtual void update_expert_weight(int32_t layer_id) { return; }
+#if defined(USE_NPU)
+  layer::NpuLmHead get_lm_head() { return lm_head_; }
 
-  layer::LmHead get_lm_head() { return lm_head_; }
-
-  void set_lm_head(layer::LmHead& head) { lm_head_ = head; }
+  void set_lm_head(layer::NpuLmHead& head) { lm_head_ = head; }
 
   std::vector<layer::WordEmbedding> get_word_embedding() {
     return model_->get_word_embedding();
@@ -309,9 +309,11 @@ class Glm4MoeForCausalLMImpl : public torch::nn::Module {
     model_->set_word_embedding(word_embedding);
   }
 
+ private:
+  layer::NpuLmHead lm_head_{nullptr};
+#endif
  private:
   Glm4MoeModel model_{nullptr};
-  layer::LmHead lm_head_{nullptr};
 };
 TORCH_MODULE(Glm4MoeForCausalLM);
 
```
