Skip to content

Commit 5dc25af

Browse files
[microNPU][ETHOSU] Add Vela's logic to select configuration block (#15186)
For the case when the cascader is enabled, the logic for choosing the optimal configuration block from TVM will be used; in other cases, Vela's logic will be used, except when the dev_force_block_config option is specified.
1 parent a60b815 commit 5dc25af

File tree

4 files changed

+145
-9
lines changed

4 files changed

+145
-9
lines changed

python/tvm/relay/backend/contrib/ethosu/vela_api.py

Lines changed: 84 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,12 @@
2727

2828
import numpy as np # type: ignore
2929
from ethosu.vela import api as vapi # type: ignore
30+
from ethosu.vela.architecture_allocator import find_block_config
3031
from ethosu.vela.architecture_features import Accelerator, create_default_arch
32+
from ethosu.vela.operation import NpuBlockType
33+
from ethosu.vela.register_command_stream_generator import resampling_mode_map
34+
from ethosu.vela.register_command_stream_util import to_kernel
35+
from ethosu.vela.shape4d import Shape4D
3136

3237
import tvm
3338
from tvm.relay.backend.contrib.ethosu import tir_to_cs_translator as tirtocs
@@ -56,6 +61,9 @@ def get_optimal_block_config(
5661
Therefore, we need to pick an optimal block configuration considering bandwidth
5762
to bring IFM blocks and the number of OFM block computes need to happen
5863
to cover the OFM as indicated by the npu op.
64+
For the case when the cascader is enabled, the logic for choosing the optimal configuration block
65+
from TVM will be used; in other cases, Vela's logic will be used, except
66+
when the dev_force_block_config option is specified.
5967
6068
Parameters
6169
----------
@@ -73,8 +81,82 @@ def get_optimal_block_config(
7381
if options and options.dev_force_block_config:
7482
block_config = [int(v) for v in options.dev_force_block_config.split("x")]
7583
return vapi.NpuShape3D(height=block_config[0], width=block_config[1], depth=block_config[2])
76-
all_valid_block_configs = vapi.npu_find_block_configs(npu_op, accel_config)
77-
return _get_optimal_block_config(all_valid_block_configs)
84+
elif options and options.enable_cascader:
85+
all_valid_block_configs = vapi.npu_find_block_configs(npu_op, accel_config)
86+
return _get_optimal_block_config(all_valid_block_configs)
87+
else:
88+
return _find_block_config_with_vela(npu_op, accel_config)
89+
90+
91+
def _find_block_config_with_vela(
    npu_op: vapi.NpuOperation, accelerator: vapi.NpuAccelerator
) -> vapi.NpuShape3D:
    """An internal function to get block config using Vela's logic.

    Parameters
    ----------
    npu_op : ethosu.vela.api.NpuOperation
        The NPU operation
    accelerator : ethosu.vela.api.NpuAccelerator
        The NPU accelerator

    Returns
    -------
    ethosu.vela.api.NpuShape3D :
        The optimal block config for the operator

    Raises
    ------
    NotImplementedError
        If the NPU operation type is not supported.
    ValueError
        If Vela cannot find any block configuration for the accelerator.
    """
    # Map the NPU op class onto the Vela block type that find_block_config expects.
    if isinstance(npu_op, vapi.NpuConv2DOperation):
        block_type = NpuBlockType.ConvolutionMxN
    elif isinstance(npu_op, vapi.NpuConvDepthWiseOperation):
        block_type = NpuBlockType.ConvolutionDepthWise
    elif isinstance(npu_op, vapi.NpuPoolingOperation):
        block_type = (
            NpuBlockType.ReduceSum
            if npu_op.sub_op_type == vapi.NpuPoolingOp.REDUCE_SUM
            else NpuBlockType.Pooling
        )
    elif isinstance(npu_op, vapi.NpuElementWiseOperation):
        block_type = NpuBlockType.ElementWise
    else:
        # Raise instead of `assert 0`: asserts are stripped under `python -O`,
        # which would otherwise let an unsupported op fall through silently.
        raise NotImplementedError(f"Unsupported operation: {type(npu_op).__name__}")

    # Vela works with 4D shapes; the batch dimension is always 1 here.
    ifm_shape = Shape4D(1, npu_op.ifm.shape.height, npu_op.ifm.shape.width, npu_op.ifm.shape.depth)
    ifm2_shape = None
    if npu_op.ifm2:
        ifm2_shape = Shape4D(
            1, npu_op.ifm2.shape.height, npu_op.ifm2.shape.width, npu_op.ifm2.shape.depth
        )
    ofm_shape = Shape4D(1, npu_op.ofm.shape.height, npu_op.ofm.shape.width, npu_op.ofm.shape.depth)

    ifm_resampling_mode = resampling_mode_map[npu_op.ifm_upscale]
    ifm_bits = npu_op.ifm.data_type.size_in_bits()
    # A table-lookup activation reserves 2 banks for the LUT; otherwise none.
    lut_banks = 0
    if npu_op.activation:
        lut_banks = 2 if npu_op.activation.op_type == vapi.NpuActivationOp.TABLE_LOOKUP else 0

    # Scaling is only usable when every present tensor carries quantization info.
    has_scaling = True
    for tensor in [npu_op.ifm, npu_op.ifm2, npu_op.ofm]:
        if tensor and tensor.quantization is None:
            has_scaling = False
            break

    arch = create_default_arch(Accelerator.from_npu_accelerator(accelerator))

    cfg = find_block_config(
        arch,
        block_type,
        ofm_shape,
        ifm_shape,
        ifm2_shape,
        npu_op.ifm2_scalar is not None,
        ifm_bits,
        to_kernel(npu_op.kernel),
        lut_banks,
        has_scaling,
        ifm_resampling_mode,
    )
    # Explicit check (not `assert`) so the failure survives `python -O`.
    if cfg is None:
        raise ValueError(f"There is no configuration suitable for {accelerator}")
    return vapi.NpuShape3D(cfg.ofm_block.height, cfg.ofm_block.width, cfg.ofm_block.depth)
78160

79161

80162
def _get_optimal_block_config(all_valid_block_configs: List[vapi.NpuShape3D]) -> vapi.NpuShape3D:

tests/python/contrib/test_ethosu/test_networks.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,9 +44,9 @@
4444
@pytest.mark.parametrize(
4545
"accel_type, model_url, workspace_size",
4646
[
47-
("ethos-u65-256", MOBILENET_V1_URL, 2338848),
47+
("ethos-u65-256", MOBILENET_V1_URL, 2338864),
4848
("ethos-u65-256", MOBILENET_V2_URL, 2264320),
49-
("ethos-u55-256", MOBILENET_V1_URL, 1793376),
49+
("ethos-u55-256", MOBILENET_V1_URL, 1793392),
5050
("ethos-u55-256", MOBILENET_V2_URL, 2217152),
5151
("ethos-u55-128", MOBILENET_V2_URL, 2217152),
5252
("ethos-u55-64", MOBILENET_V2_URL, 2217152),

tests/python/contrib/test_ethosu/test_replace_conv2d.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -633,11 +633,15 @@ def _get_func(
633633

634634
reference_mod = trial[0]
635635
params = trial[1:]
636-
func = _get_func(*params[:-1])
637-
mod, _ = _lower_to_tir(func, cascader=total_cascader(params[-1]))
638-
script = mod.script()
639-
mod = tvm.script.from_source(script)
640-
tvm.ir.assert_structural_equal(mod["main"], reference_mod["main"], True)
636+
config = {
637+
"enable_cascader": True,
638+
}
639+
with tvm.transform.PassContext(opt_level=3, config={"relay.ext.ethos-u.options": config}):
640+
func = _get_func(*params[:-1])
641+
mod, _ = _lower_to_tir(func, cascader=total_cascader(params[-1]))
642+
script = mod.script()
643+
mod = tvm.script.from_source(script)
644+
tvm.ir.assert_structural_equal(mod["main"], reference_mod["main"], True)
641645

642646

643647
# fmt: off

tests/python/contrib/test_ethosu/test_vela_api.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -222,6 +222,28 @@ def main(
222222
__tvm_meta__ = None
223223

224224

225+
# fmt: off
# A single int8 conv2d (1x299x299x2 -> 1x299x299x3) lowered to TIR, used to
# drive block-config selection tests.
@tvm.script.ir_module
class Module3:
    @T.prim_func
    def main(ethos_u_0_i0: T.Buffer((1, 299, 299, 2), "int8"), ethosu_write: T.Buffer((1, 299, 299, 3), "int8")):
        T.func_attr({"from_legacy_te_schedule": T.bool(True), "global_symbol": "main", "tir.noalias": T.bool(True)})
        p2_global = T.allocate([128], "uint8", "global", annotations={"disable_lower_builtin": T.bool(True)})
        ax0_ax1_fused_ax2_fused_ax3_fused = T.int32()
        p2_global_1 = T.Buffer((128,), "uint8", data=p2_global)
        with T.attr(T.iter_var(ax0_ax1_fused_ax2_fused_ax3_fused, None, "DataPar", ""), "pragma_compute_cycles_hint", 1056):
            p1_encoded = T.Buffer((128,), "uint8")
            T.call_extern("handle", "ethosu_copy", p1_encoded[0], 128, p2_global_1[0])
        nn = T.int32()
        T.attr(T.iter_var(nn, None, "DataPar", ""), "pragma_compute_cycles_hint", T.int64(179570))
        ethos_u_0_i0_1 = T.Buffer((178802,), "int8", data=ethos_u_0_i0.data)
        ethosu_write_1 = T.Buffer((268203,), "int8", data=ethosu_write.data)
        T.call_extern("handle", "ethosu_conv2d", "int8", 299, 299, 2, 299, 0, 299, ethos_u_0_i0_1[0], 0, 0, 0, T.float32(0.0039215683937072754), -128, "NHWC", 598, 2, 1, "int8", 299, 299, 3, 299, 0, 299, ethosu_write_1[0], 0, 0, 0, T.float32(0.025585981085896492), -128, "NHWC", 897, 3, 1, 2, 3, 1, 1, 1, 2, p2_global_1[0], 96, T.int8(-1), T.int8(-1), 0, p2_global_1[96], 32, T.int8(-1), T.int8(-1), 2, 0, 2, 1, "NONE", 0, 0, "TFL", "NONE", 32, 12, 8)

    __tvm_meta__ = None
# fmt: on
245+
246+
225247
def test_get_optimal_block_config():
226248
block_configs_cases = [
227249
{
@@ -559,5 +581,33 @@ def verify(test_vec, mock_enc_w):
559581
verify(_test_vec, _mock_enc_w)
560582

561583

584+
def test_find_block_config_with_vela():
    """Check Vela's block-config search returns the expected shape per accelerator."""
    expected_block_configs = (
        (vapi.NpuAccelerator.Ethos_U55_256, vapi.NpuShape3D(30, 12, 8)),
        (vapi.NpuAccelerator.Ethos_U55_128, vapi.NpuShape3D(17, 10, 8)),
        (vapi.NpuAccelerator.Ethos_U55_64, vapi.NpuShape3D(25, 5, 8)),
        (vapi.NpuAccelerator.Ethos_U55_32, vapi.NpuShape3D(25, 5, 4)),
    )

    # Pull the ethosu_conv2d call out of the fixture module and translate it
    # into an NPU op to feed the block-config search.
    mod = Module3
    ethosu_conv2d_call = mod["main"].body.body.seq[1].body.value
    npu_op, _ = tirtocs.translate_ethosu_conv2d(ethosu_conv2d_call)

    for accel_type, ref in expected_block_configs:
        assert vela_api._find_block_config_with_vela(npu_op, accel_type) == ref
610+
611+
562612
if __name__ == "__main__":
563613
tvm.testing.main()

0 commit comments

Comments
 (0)