Skip to content

Commit 5dc25af

Browse files
[microNPU][ETHOSU] Add Vela's logic to select configuration block (#15186)
For the case when the cascader is enabled, the logic for choosing the optimal configuration block from TVM will be used; in other cases, Vela's logic will be used, except when the dev_force_block_config option is specified.
1 parent a60b815 commit 5dc25af

File tree

4 files changed

+145
-9
lines changed

4 files changed

+145
-9
lines changed

python/tvm/relay/backend/contrib/ethosu/vela_api.py

Lines changed: 84 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,12 @@
2727

2828
import numpy as np # type: ignore
2929
from ethosu.vela import api as vapi # type: ignore
30+
from ethosu.vela.architecture_allocator import find_block_config
3031
from ethosu.vela.architecture_features import Accelerator, create_default_arch
32+
from ethosu.vela.operation import NpuBlockType
33+
from ethosu.vela.register_command_stream_generator import resampling_mode_map
34+
from ethosu.vela.register_command_stream_util import to_kernel
35+
from ethosu.vela.shape4d import Shape4D
3136

3237
import tvm
3338
from tvm.relay.backend.contrib.ethosu import tir_to_cs_translator as tirtocs
@@ -56,6 +61,9 @@ def get_optimal_block_config(
5661
Therefore, we need to pick an optimal block configuration considering bandwidth
5762
to bring IFM blocks and the number of OFM block computes need to happen
5863
to cover the OFM as indicated by the npu op.
64+
For the case when the cascader is enabled, the logic for choosing the optimal configuration block
65+
from TVM will be used; in other cases, Vela's logic will be used, except
66+
when the dev_force_block_config option is specified.
5967
6068
Parameters
6169
----------
@@ -73,8 +81,82 @@ def get_optimal_block_config(
7381
if options and options.dev_force_block_config:
7482
block_config = [int(v) for v in options.dev_force_block_config.split("x")]
7583
return vapi.NpuShape3D(height=block_config[0], width=block_config[1], depth=block_config[2])
76-
all_valid_block_configs = vapi.npu_find_block_configs(npu_op, accel_config)
77-
return _get_optimal_block_config(all_valid_block_configs)
84+
elif options and options.enable_cascader:
85+
all_valid_block_configs = vapi.npu_find_block_configs(npu_op, accel_config)
86+
return _get_optimal_block_config(all_valid_block_configs)
87+
else:
88+
return _find_block_config_with_vela(npu_op, accel_config)
89+
90+
91+
def _find_block_config_with_vela(
    npu_op: vapi.NpuOperation, accelerator: vapi.NpuAccelerator
) -> vapi.NpuShape3D:
    """An internal function to get block config using Vela's logic.

    Parameters
    ----------
    npu_op : ethosu.vela.api.NpuOperation
        The NPU operation
    accelerator : ethosu.vela.api.NpuAccelerator
        The NPU accelerator

    Returns
    -------
    ethosu.vela.api.NpuShape3D :
        The optimal block config for the operator

    Raises
    ------
    NotImplementedError
        If the NPU operation type is not supported.
    ValueError
        If Vela cannot find any block configuration for the accelerator.
    """
    # Map the NPU op class onto the Vela block type that find_block_config expects.
    if isinstance(npu_op, vapi.NpuConv2DOperation):
        block_type = NpuBlockType.ConvolutionMxN
    elif isinstance(npu_op, vapi.NpuConvDepthWiseOperation):
        block_type = NpuBlockType.ConvolutionDepthWise
    elif isinstance(npu_op, vapi.NpuPoolingOperation):
        block_type = (
            NpuBlockType.ReduceSum
            if npu_op.sub_op_type == vapi.NpuPoolingOp.REDUCE_SUM
            else NpuBlockType.Pooling
        )
    elif isinstance(npu_op, vapi.NpuElementWiseOperation):
        block_type = NpuBlockType.ElementWise
    else:
        # Raise instead of `assert 0`: asserts are stripped under `python -O`,
        # which would otherwise let an unsupported op fall through silently.
        raise NotImplementedError(f"Unsupported operation: {type(npu_op).__name__}")

    # Vela works with 4D shapes; the batch dimension is always 1 here.
    ifm_shape = Shape4D(1, npu_op.ifm.shape.height, npu_op.ifm.shape.width, npu_op.ifm.shape.depth)
    ifm2_shape = None
    if npu_op.ifm2:
        ifm2_shape = Shape4D(
            1, npu_op.ifm2.shape.height, npu_op.ifm2.shape.width, npu_op.ifm2.shape.depth
        )
    ofm_shape = Shape4D(1, npu_op.ofm.shape.height, npu_op.ofm.shape.width, npu_op.ofm.shape.depth)

    ifm_resampling_mode = resampling_mode_map[npu_op.ifm_upscale]
    ifm_bits = npu_op.ifm.data_type.size_in_bits()
    # A table-lookup activation reserves 2 banks for the LUT; otherwise none.
    lut_banks = 0
    if npu_op.activation:
        lut_banks = 2 if npu_op.activation.op_type == vapi.NpuActivationOp.TABLE_LOOKUP else 0

    # Scaling is only usable when every present tensor carries quantization info.
    has_scaling = True
    for tensor in [npu_op.ifm, npu_op.ifm2, npu_op.ofm]:
        if tensor and tensor.quantization is None:
            has_scaling = False
            break

    arch = create_default_arch(Accelerator.from_npu_accelerator(accelerator))

    cfg = find_block_config(
        arch,
        block_type,
        ofm_shape,
        ifm_shape,
        ifm2_shape,
        npu_op.ifm2_scalar is not None,
        ifm_bits,
        to_kernel(npu_op.kernel),
        lut_banks,
        has_scaling,
        ifm_resampling_mode,
    )
    # Explicit check (not `assert`) so the failure survives `python -O`.
    if cfg is None:
        raise ValueError(f"There is no configuration suitable for {accelerator}")
    return vapi.NpuShape3D(cfg.ofm_block.height, cfg.ofm_block.width, cfg.ofm_block.depth)
78160

79161

80162
def _get_optimal_block_config(all_valid_block_configs: List[vapi.NpuShape3D]) -> vapi.NpuShape3D:

tests/python/contrib/test_ethosu/test_networks.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,9 +44,9 @@
4444
@pytest.mark.parametrize(
4545
"accel_type, model_url, workspace_size",
4646
[
47-
("ethos-u65-256", MOBILENET_V1_URL, 2338848),
47+
("ethos-u65-256", MOBILENET_V1_URL, 2338864),
4848
("ethos-u65-256", MOBILENET_V2_URL, 2264320),
49-
("ethos-u55-256", MOBILENET_V1_URL, 1793376),
49+
("ethos-u55-256", MOBILENET_V1_URL, 1793392),
5050
("ethos-u55-256", MOBILENET_V2_URL, 2217152),
5151
("ethos-u55-128", MOBILENET_V2_URL, 2217152),
5252
("ethos-u55-64", MOBILENET_V2_URL, 2217152),

tests/python/contrib/test_ethosu/test_replace_conv2d.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -633,11 +633,15 @@ def _get_func(
633633

634634
reference_mod = trial[0]
635635
params = trial[1:]
636-
func = _get_func(*params[:-1])
637-
mod, _ = _lower_to_tir(func, cascader=total_cascader(params[-1]))
638-
script = mod.script()
639-
mod = tvm.script.from_source(script)
640-
tvm.ir.assert_structural_equal(mod["main"], reference_mod["main"], True)
636+
config = {
637+
"enable_cascader": True,
638+
}
639+
with tvm.transform.PassContext(opt_level=3, config={"relay.ext.ethos-u.options": config}):
640+
func = _get_func(*params[:-1])
641+
mod, _ = _lower_to_tir(func, cascader=total_cascader(params[-1]))
642+
script = mod.script()
643+
mod = tvm.script.from_source(script)
644+
tvm.ir.assert_structural_equal(mod["main"], reference_mod["main"], True)
641645

642646

643647
# fmt: off

tests/python/contrib/test_ethosu/test_vela_api.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -222,6 +222,28 @@ def main(
222222
__tvm_meta__ = None
223223

224224

225+
# fmt: off
# A single int8 conv2d (1x299x299x2 -> 1x299x299x3) lowered to TIR, used to
# drive block-config selection tests.
@tvm.script.ir_module
class Module3:
    @T.prim_func
    def main(ethos_u_0_i0: T.Buffer((1, 299, 299, 2), "int8"), ethosu_write: T.Buffer((1, 299, 299, 3), "int8")):
        T.func_attr({"from_legacy_te_schedule": T.bool(True), "global_symbol": "main", "tir.noalias": T.bool(True)})
        p2_global = T.allocate([128], "uint8", "global", annotations={"disable_lower_builtin": T.bool(True)})
        ax0_ax1_fused_ax2_fused_ax3_fused = T.int32()
        p2_global_1 = T.Buffer((128,), "uint8", data=p2_global)
        with T.attr(T.iter_var(ax0_ax1_fused_ax2_fused_ax3_fused, None, "DataPar", ""), "pragma_compute_cycles_hint", 1056):
            p1_encoded = T.Buffer((128,), "uint8")
            T.call_extern("handle", "ethosu_copy", p1_encoded[0], 128, p2_global_1[0])
        nn = T.int32()
        T.attr(T.iter_var(nn, None, "DataPar", ""), "pragma_compute_cycles_hint", T.int64(179570))
        ethos_u_0_i0_1 = T.Buffer((178802,), "int8", data=ethos_u_0_i0.data)
        ethosu_write_1 = T.Buffer((268203,), "int8", data=ethosu_write.data)
        T.call_extern("handle", "ethosu_conv2d", "int8", 299, 299, 2, 299, 0, 299, ethos_u_0_i0_1[0], 0, 0, 0, T.float32(0.0039215683937072754), -128, "NHWC", 598, 2, 1, "int8", 299, 299, 3, 299, 0, 299, ethosu_write_1[0], 0, 0, 0, T.float32(0.025585981085896492), -128, "NHWC", 897, 3, 1, 2, 3, 1, 1, 1, 2, p2_global_1[0], 96, T.int8(-1), T.int8(-1), 0, p2_global_1[96], 32, T.int8(-1), T.int8(-1), 2, 0, 2, 1, "NONE", 0, 0, "TFL", "NONE", 32, 12, 8)

    __tvm_meta__ = None
# fmt: on
245+
246+
225247
def test_get_optimal_block_config():
226248
block_configs_cases = [
227249
{
@@ -559,5 +581,33 @@ def verify(test_vec, mock_enc_w):
559581
verify(_test_vec, _mock_enc_w)
560582

561583

584+
def test_find_block_config_with_vela():
    """Check Vela's block-config search returns the expected shape per accelerator."""
    expected_block_configs = (
        (vapi.NpuAccelerator.Ethos_U55_256, vapi.NpuShape3D(30, 12, 8)),
        (vapi.NpuAccelerator.Ethos_U55_128, vapi.NpuShape3D(17, 10, 8)),
        (vapi.NpuAccelerator.Ethos_U55_64, vapi.NpuShape3D(25, 5, 8)),
        (vapi.NpuAccelerator.Ethos_U55_32, vapi.NpuShape3D(25, 5, 4)),
    )

    # Pull the ethosu_conv2d call out of the fixture module and translate it
    # into an NPU op to feed the block-config search.
    mod = Module3
    ethosu_conv2d_call = mod["main"].body.body.seq[1].body.value
    npu_op, _ = tirtocs.translate_ethosu_conv2d(ethosu_conv2d_call)

    for accel_type, ref in expected_block_configs:
        assert vela_api._find_block_config_with_vela(npu_op, accel_type) == ref
610+
611+
562612
if __name__ == "__main__":
563613
tvm.testing.main()

0 commit comments

Comments
 (0)