@@ -6496,7 +6496,7 @@ struct llm_build_context {
                         LLM_NORM_RMS, cb, il);
                 cb(cur, "ffn_norm", il);
 
-                cur = build_moe(cur, n_tokens, il);
+                cur = build_moe_ffn(cur, n_tokens, LLM_FFN_SILU, il);
             }
 
             cur = ggml_add(ctx0, cur, ffn_inp);
@@ -6528,7 +6528,8 @@ struct llm_build_context {
         return gf;
     }
 
-    ggml_tensor * build_moe(ggml_tensor * cur, int32_t n_tokens, int il) {
+    // REVIEW: will be replaced by https://github.com/ggerganov/llama.cpp/pull/6505
+    ggml_tensor * build_moe_ffn(ggml_tensor * cur, int32_t n_tokens, llm_ffn_op_type type_op, int il) {
         ggml_tensor * logits = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp, cur); // [n_tokens, num_experts]
         cb(logits, "ffn_moe_logits", il);
 
@@ -6560,13 +6561,25 @@ struct llm_build_context {
             ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exps, selected_experts, i, cur);
             cb(cur_up, "ffn_moe_up", il);
 
-            ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exps, selected_experts, i, cur);
-            cb(cur_gate, "ffn_moe_gate", il);
+            ggml_tensor * gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exps, selected_experts, i, cur);
+            cb(gate, "ffn_moe_gate", il);
 
-            cur_gate = ggml_silu(ctx0, cur_gate);
-            cb(cur_gate, "ffn_moe_silu", il);
+            switch (type_op) {
+                case LLM_FFN_SILU:
+                    {
+                        gate = ggml_silu(ctx0, gate);
+                        cb(gate, "ffn_moe_silu", il);
+                    } break;
+                case LLM_FFN_GELU:
+                    {
+                        gate = ggml_gelu(ctx0, gate);
+                        cb(gate, "ffn_moe_gelu", il);
+                    } break;
+                default:
+                    GGML_ASSERT(false);
+            }
 
-            cur_expert = ggml_mul(ctx0, cur_up, cur_gate);
+            cur_expert = ggml_mul(ctx0, cur_up, gate);
             cb(cur_expert, "ffn_moe_gate_par", il);
 
             cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exps, selected_experts, i, cur_expert); // [n_tokens, n_embd]
@@ -7034,7 +7047,7 @@ struct llm_build_context {
                     LLM_NORM_RMS, cb, il);
             cb(cur, "ffn_norm", il);
 
-            cur = build_moe(cur, n_tokens, il);
+            cur = build_moe_ffn(cur, n_tokens, LLM_FFN_GELU, il);
 
             // Grok
             // if layer_out_norm is present then apply it before adding the input
@@ -7170,7 +7183,7 @@ struct llm_build_context {
                     LLM_NORM, cb, il);
             cb(cur, "attn_out_norm", il);
 
-            cur = build_moe(cur, n_tokens, il);
+            cur = build_moe_ffn(cur, n_tokens, LLM_FFN_SILU, il);
 
             cur = ggml_add(ctx0, cur, ffn_inp);
             cb(cur, "ffn_out", il);
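
For readers skimming the diff: the change renames build_moe to build_moe_ffn and threads an llm_ffn_op_type argument through it, so the Grok call site can gate its experts with GELU while the other MoE call sites keep SiLU. The snippet below is a minimal standalone sketch of that dispatch pattern using scalar floats instead of ggml tensors; the enum values mirror LLM_FFN_SILU / LLM_FFN_GELU from the diff, but the helper names (silu, gelu, apply_gate_act) are illustrative only and not part of the commit.

// Standalone illustration (not from the commit): selecting the expert gate
// activation by op type, mirroring the switch added in build_moe_ffn.
#include <cassert>
#include <cmath>
#include <cstdio>

enum llm_ffn_op_type { LLM_FFN_SILU, LLM_FFN_GELU };

static float silu(float x) { return x / (1.0f + std::exp(-x)); }
static float gelu(float x) { return 0.5f * x * (1.0f + std::erf(x / std::sqrt(2.0f))); }

// Hypothetical scalar stand-in for the gate activation applied per expert.
static float apply_gate_act(float gate, llm_ffn_op_type type_op) {
    switch (type_op) {
        case LLM_FFN_SILU: return silu(gate);
        case LLM_FFN_GELU: return gelu(gate);
        default: assert(false && "unsupported FFN op type"); return 0.0f;
    }
}

int main() {
    std::printf("silu(1) = %f, gelu(1) = %f\n",
                apply_gate_act(1.0f, LLM_FFN_SILU),
                apply_gate_act(1.0f, LLM_FFN_GELU));
    return 0;
}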