Conversation

@wine99
Contributor

@wine99 wine99 commented Mar 27, 2025

The KV cache handling logic differs between dynamic and static shapes.

With dynamic shapes, the KV cache buffer holds only valid data, so appending the new key/value states needs just a Concat op.
With static shapes, the valid data is stored at the end of a fixed-size buffer and the beginning of the buffer is zero-padded. A plain Concat would grow the result beyond the fixed buffer size, so the past cache must first be sliced down to the real data before the new states are concatenated.
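A minimal numpy sketch of the two update schemes described above (illustration only; the shapes and names are made up for the example and are not taken from the actual decomposition):

import numpy as np

batch, heads, head_size = 1, 32, 96

# Dynamic shapes (CPU/GPU): the past cache holds only valid positions,
# so appending the new key/value states is a single concat on the sequence axis.
past_dyn = np.random.rand(batch, heads, 5, head_size).astype(np.float32)   # 5 valid positions
new_kv = np.random.rand(batch, heads, 1, head_size).astype(np.float32)     # 1 new position
present_dyn = np.concatenate([past_dyn, new_kv], axis=2)                   # sequence length 6

# Static shapes (NPU): the past buffer has a fixed length, valid data sits at
# the end, and the front is zero padding. Drop as many leading (padding)
# positions as will be appended, then concat, so the result keeps the same
# fixed length as the input buffer.
past_len, cur_len = 511, 1
past_static = np.zeros((batch, heads, past_len, head_size), dtype=np.float32)
past_static[:, :, -5:, :] = 1.0                      # pretend 5 positions hold valid data
trimmed = past_static[:, :, cur_len:, :]             # the Slice step: drop cur_len padding rows
present_static = np.concatenate([trimmed, new_kv], axis=2)
assert present_static.shape == past_static.shape     # output shape equals input shape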

The following script works for both CPU/GPU (dynamic shapes) and NPU (static shapes):

import onnxruntime as rt
import os
import numpy as np
import time

import onnxruntime.tools.add_openvino_win_libs as utils
utils.add_openvino_libs_to_path()
from transformers import PreTrainedTokenizerFast

LOOP_TIME = 2
NUM_INFERENCE = 16 # how many 2nd-token (decode) inferences to run

def get_average_time(time_list):
    return (sum(time_list) - max(time_list) - min(time_list)) / (len(time_list) - 2)

GTA = False
test_phi3 = True
test_lama3 = False
is_npu = False

if test_phi3:
    gta_modelPath = os.path.join('C:\\', 'Users', 'gta', 'Downloads', 'Phi-3-mini-4k-instruct-onnx', 'model.onnx')
    if is_npu:
        gta_modelPath = os.path.join('C:\\', 'Users', 'gta', 'Downloads', 'Phi-3-mini-4k-instruct-onnx-rows-newalgo-int4', 'model.onnx')
    gta_tokenizerPath = os.path.join('C:\\', 'Users', 'gta', 'Downloads', 'Phi-3-mini-4k-instruct-onnx', 'tokenizer.json')
    server_modelPath = os.path.join('D:\\', 'models', 'llm', 'Phi-3-mini-4k-instruct-onnx', 'model.onnx')
    server_tokenizerPath = os.path.join('D:\\', 'models', 'llm', 'Phi-3-mini-4k-instruct-onnx', 'tokenizer.json')

if test_lama3:
    gta_modelPath = os.path.join('C:\\', 'Users', 'gta', 'Downloads', 'llama3.1-8B-instruct-onnx', 'model.onnx')
    gta_tokenizerPath = os.path.join('C:\\', 'Users', 'gta', 'Downloads', 'llama3.1-8B-instruct-onnx', 'tokenizer.json')
    server_modelPath = os.path.join('D:\\', 'models', 'llm', 'llama3.1-8B-instruct-onnx', 'model.onnx')
    server_tokenizerPath = os.path.join('D:\\', 'models', 'llm', 'llama3.1-8B-instruct-onnx', 'tokenizer.json')

if GTA:
    modelPath = gta_modelPath
    tokenizerPath = gta_tokenizerPath
else:
    modelPath = server_modelPath
    tokenizerPath = server_tokenizerPath

so = rt.SessionOptions()
# so.log_severity_level = 3
# so.enable_profiling = False

sess = rt.InferenceSession(modelPath, so, providers=['CPUExecutionProvider'])
#sess = rt.InferenceSession(modelPath, so, providers=['OpenVINOExecutionProvider'], provider_options=[{'device_type' : "CPU", 'cache_dir': "cpucache"}])
# sess = rt.InferenceSession(modelPath, so, providers=['OpenVINOExecutionProvider'], provider_options=[{'device_type' : "NPU", 'load_config':'{ "NPU": { "NPUW_CACHE_DIR": "ncache", "NPUW_DEVICES": "NPU", "NPU_USE_NPUW": "YES", "NPUW_DUMP_IO":"NO", "NPUW_FOLD": "YES", "NPUW_DUMP_SUBS": "NO", "NPUW_DQ": "YES", "NPUW_HOST_GATHER": "NO", "NPU_COMPILATION_MODE_PARAMS": "compute-layers-with-higher-precision=Sqrt,ReduceMean,Power,RMS" } }'}])
tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizerPath)

outputs = sess.get_outputs()
output_names = list(map(lambda output: output.name, outputs))


def get_phi3_param():
    num_layers = 32
    batch_size = 1
    num_heads = 32
    sequence_length = 512
    hidden_size = 96
    return num_layers, batch_size, num_heads, sequence_length, hidden_size

def get_llama3_param():
    num_layers = 32
    batch_size = 1
    num_heads = 8
    sequence_length = 512
    hidden_size = 128
    return num_layers, batch_size, num_heads, sequence_length, hidden_size

if test_phi3:
    num_layers, batch_size, num_heads, sequence_length, hidden_size = get_phi3_param()

if test_lama3:
    num_layers, batch_size, num_heads, sequence_length, hidden_size = get_llama3_param()

def create_numpy_inputs(inputToken):
    tokenLen = len(inputToken)
    npinput_ids = np.array([inputToken], dtype=np.int64)
    if is_npu:
        npattention_mask = np.array([[1] * tokenLen + [0] * (sequence_length - tokenLen)], dtype=np.int64)
    else:
        npattention_mask = np.array([[1] * tokenLen], dtype=np.int64)
    return npinput_ids, npattention_mask

def init_npinput(inputToken):
    flattened_past_key_values = {}
    for index in range(num_layers):
        if is_npu:
            key_state = np.zeros((batch_size, num_heads, sequence_length - len(inputToken), hidden_size), dtype=np.float32)
            value_state = np.zeros((batch_size, num_heads, sequence_length - len(inputToken), hidden_size), dtype=np.float32)
        else:
            key_state = np.zeros((batch_size, num_heads, 0, hidden_size), dtype=np.float32)
            value_state = np.zeros((batch_size, num_heads, 0, hidden_size), dtype=np.float32)
        flattened_past_key_values[f'past_key_values.{index}.key'] = key_state
        flattened_past_key_values[f'past_key_values.{index}.value'] = value_state
    flattened_past_key_values['input_ids'], flattened_past_key_values['attention_mask'] = create_numpy_inputs(inputToken)
    return flattened_past_key_values


for loop_idx in range(LOOP_TIME):
    first_token_time_list = []
    second_token_time_list = []
    print(f"start loop {loop_idx}")

    if test_phi3:
        input = """<|user|> 
The Sun is yellow because <|end|>
<|assistant|>
"""
    if test_lama3:
        input = """<|begin_of_text|><|user|>
The Sun is yellow because <|end|>
<|assistant|>
"""

    inputToken = tokenizer.encode(input)
    history_tokens = inputToken
    flattened_past_key_values = init_npinput(inputToken)
    lastTokenLen = len(inputToken)

    before = time.time()
    results = sess.run(output_names, flattened_past_key_values)
    after = time.time()
    first_token_time_list.append(int((after - before) * 1000))

    last_generated_token = np.argmax(results[0][-1, -1, :], axis=-1)
    print(last_generated_token, end=' ')
    history_tokens.append(last_generated_token)
    for i in range(NUM_INFERENCE):
        # update kv cache
        for index in range(len(output_names)):
            if not output_names[index].startswith('present'):
                continue
            outputname = output_names[index]
            inputname = outputname.replace('present', 'past_key_values')
            flattened_past_key_values[inputname] = results[index]

        # update input token
        flattened_past_key_values['input_ids'] = np.array([[last_generated_token]], dtype=np.int64)
        if is_npu:
            flattened_past_key_values['attention_mask'] = np.array([[1] * len(history_tokens) + [0] * (sequence_length - len(history_tokens))], dtype=np.int64)
        else:
            flattened_past_key_values['attention_mask'] = np.array([[1] * len(history_tokens)], dtype=np.int64)

        before = time.time()
        results = sess.run(output_names, flattened_past_key_values)
        after = time.time()
        second_token_time_list.append(int((after - before) * 1000))

        last_generated_token = np.argmax(results[0][-1, -1, :], axis=-1)
        print(last_generated_token, end=' ')
        history_tokens.append(last_generated_token)

    print(f"1st token times: {first_token_time_list}, avg {first_token_time_list[0]} ms")
    print(f"2nd token times: {secod_token_time_list}, avg {int(get_average_time(secod_token_time_list))} ms")

print(tokenizer.decode(history_tokens))
print(f"loop {LOOP_TIME} times finished")

@wine99 wine99 requested review from a team as code owners March 27, 2025 04:02
@wine99 wine99 requested review from itikhono and removed request for a team March 27, 2025 04:02
@wine99 wine99 marked this pull request as draft March 27, 2025 04:02
@github-actions github-actions bot added labels category: Core (OpenVINO Core, aka ngraph) and category: transformations (OpenVINO Runtime library - Transformations) Mar 27, 2025
@sys-openvino-ci sys-openvino-ci added the ExternalPR External contributor label Mar 27, 2025
@mlukasze mlukasze requested a review from mitruska March 27, 2025 05:48
@wine99 wine99 marked this pull request as ready for review April 8, 2025 06:13

ov::Output<ov::Node> present_k;
ov::Output<ov::Node> present_v;
if (is_static_input) {

Can we use "ShapeOf -> Gather" instead of K.get_partial_shape()[2].get_length() and cover the dynamic case as well?
For a static shape the "ShapeOf -> Gather" subgraph will be constant-folded; for a dynamic shape it remains in the graph.
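For illustration, a rough sketch of this suggestion using the OpenVINO Python opset (a hypothetical standalone snippet; the actual transformation is written in C++ and key here is just a placeholder parameter):

import numpy as np
import openvino.runtime.opset13 as ops
from openvino.runtime import PartialShape, Type

# Placeholder K input with a dynamic sequence dimension; for a fully static
# shape the ShapeOf -> Gather subgraph below would be constant-folded away,
# for a dynamic shape it stays in the graph and yields the runtime length.
key = ops.parameter(PartialShape([1, 32, -1, 96]), Type.f32, name="key")
kv_shape = ops.shape_of(key, output_type="i64")
kv_len = ops.gather(kv_shape,
                    ops.constant(np.array([2], dtype=np.int64)),  # sequence axis index
                    ops.constant(np.array(0, dtype=np.int64)))    # gather along axis 0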

Contributor

Such a case seems to be already handled by the get_dimensions helper and stored as concat_kv_len.
It looks like the idea of this change is to intentionally introduce different behavior for static and dynamic shapes: a static shape is treated as an indicator of the "maximum sequence length", while a dynamic shape reflects the actual size that changes between inference calls.

Contributor

The NPU doesn't support dynamic shapes, which causes a runtime issue during type inference. At the beginning of the implementation we used a Gather op, but Slice gives better performance.

Contributor

@mitruska mitruska left a comment

The main concern regarding the proposed changes is that the shape inference for static and dynamic shapes is not unified, and a "static" shape at the operator level is used as a flag to comply with plugin-specific requirements (CPU/GPU vs NPU). The shared GQA op should rather be plugin-independent by design. As I understand it, in the proposed changes a static shape is assumed to be the "NPU" case, where the KV sequence length dimension means the "maximum" sequence length, while a dynamic shape is assumed to carry the actual sequence length, as supported by CPU/GPU. This may lead to unexpected behaviour.

Have you considered any alternative solutions, such as a transformation-level flag/fallback to select the proper decomposition for the target plugin?


@sgbihu
Contributor

sgbihu commented Apr 14, 2025

Have you considered any alternative solutions, such as a transformation-level flag/fallback to select the proper decomposition for the target plugin?

@mitruska We have considered a logic that does not depend on the device type; it depends only on the shape. So if a CPU model uses static shapes, this logic works there as well.

I have also changed some of the logic to make the shape inference more reasonable: the returned shape is now the same as the input shape. Could you help review it and give me more input?

Contributor

@praasz praasz left a comment

OK for the core part.

Contributor

@mitruska mitruska left a comment

I can see the goal of the proposed changes, but I'm not fully convinced this is a long-term solution. As GQA is still part of the dev API it can be modified, but at some point we may need to make it public and compatible with frontend frameworks, for example ONNX Attention.
If these changes are needed right now, I don't want to block them, but I recommend making sure that the other plugins and the transformations team approve this approach as maintainable for the common GQA op.
cc: @a-sidorova @itikhono @jane-intel

@zhangYiIntel
Contributor

For correctness, I think the PR is good, and we should add a test like

src/common/transformations/tests/op_conversions/scaled_dot_product_decomposition_test.cpp

One question about this PR, though: the KV cache usage here is quite different from the current stateful-model approach. In a stateful model the KV cache is maintained internally by the CPU/GPU plugin, which avoids memory copies between devices, and multi-query is already supported there. With this PR, KV cache management is done by the application. In terms of reducing memory I/O, I think this is sub-optimal compared to the stateful-model approach.

@sgbihu
Contributor

sgbihu commented Apr 27, 2025

For correctness, I think the PR is good, and we should add a test like

src/common/transformations/tests/op_conversions/scaled_dot_product_decomposition_test.cpp

For the tests, we already added GQA tests in PR 28163, and this PR is a follow-up for NPU.

One question about this PR, though: the KV cache usage here is quite different from the current stateful-model approach. In a stateful model the KV cache is maintained internally by the CPU/GPU plugin, which avoids memory copies between devices, and multi-query is already supported there. With this PR, KV cache management is done by the application. In terms of reducing memory I/O, I think this is sub-optimal compared to the stateful-model approach.

The script only simulates the GenAI flow and is for test purposes only. GQA does not care about the Assign/ReadValue ops used by stateful models; it is just an operator. So this PR implements only the GQA-related part, and the remaining pieces will be covered by ORT GenAI.

Contributor

@zhangYiIntel zhangYiIntel left a comment

LGTM, but I still have a question about KV-cache management.

v0::Constant::create(ov::element::i64, ov::Shape{1}, {K.get_partial_shape()[2].get_length()}));
const auto past_kv_len_const = register_new_node(
v0::Constant::create(ov::element::i64, ov::Shape{1}, {past_key.get_partial_shape()[2].get_length()}));
past_key = register_new_node<v8::Slice>(past_key, current_kv_len_const, past_kv_len_const, one, two);
Contributor

Do we have a document that describes the cache layout for static shapes?

At first glance, one might think the cache grows towards the end:

index:
0 -> past_len -> cur_len
data layout:
[past cache] | [current cache]

However, the code here assumes that the past data is placed after the current data, so the growth direction is the opposite of what one would ordinarily expect. It would be better to have a document or an agreement about this:

index:
0 -> cur_len -> past_len
data layout:
[current cache] | [past cache]

Contributor

@sgbihu sgbihu Apr 27, 2025

We only describe the logic in the PR description. Your understanding is not correct: the latest cache entries are always at the end of the buffer. This part pops the zeros at the beginning of the buffer; the concat logic then follows (L120 of the diff).
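A tiny numpy illustration of that layout along the sequence axis only (values are arbitrary, just to show where the valid entries end up):

import numpy as np

max_past = 7                        # fixed past-buffer length (sequence axis only)
buf = np.zeros(max_past)            # step 0: all zero padding, no valid data yet
for step, token_kv in enumerate([1.0, 2.0, 3.0], start=1):
    # pop one leading zero, append the new entry at the end
    buf = np.concatenate([buf[1:], [token_kv]])
    print(step, buf)
# step 1: [0. 0. 0. 0. 0. 0. 1.]
# step 2: [0. 0. 0. 0. 0. 1. 2.]
# step 3: [0. 0. 0. 0. 1. 2. 3.]   -> the newest entry is always at the end,
#                                     and the zero padding shrinks from the front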

Contributor

Even if the concat part is a real concat of past_kv + cur_kv, the layout of the past_kv cache is still confusing: why are zeros padded before past_kv, and could zeros be padded after past_kv in some other implementation? The problem is that we apply a strong assumption about the layout of past_kv, but there is no document describing this assumption.

Contributor

Updated comments.

@sgbihu
Contributor

sgbihu commented May 21, 2025

build_jenkins

@sgbihu
Contributor

sgbihu commented May 21, 2025

build_jenkins

@mlukasze mlukasze removed the request for review from e-nugmanova May 21, 2025 09:01
@sgbihu
Contributor

sgbihu commented May 21, 2025

build_jenkins

@sgbihu
Contributor

sgbihu commented May 21, 2025

build_jenkins

@sgbihu sgbihu enabled auto-merge May 21, 2025 23:38
@sgbihu sgbihu added this pull request to the merge queue May 22, 2025
Merged via the queue into openvinotoolkit:master with commit 8e77c28 May 22, 2025
213 of 217 checks passed
@sgbihu sgbihu deleted the gqa_npu branch May 22, 2025 05:45
@mlukasze mlukasze added this to the 2025.2 milestone May 22, 2025
@mlukasze
Contributor

Thank you @wine99, we got it!
