Updated arm kernel type signature to match caller

Lunderberg · Lunderberg · commit 02e9ddf8d917 · 2023-07-05T07:59:32.000-05:00
diff --git a/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/max_pool.py b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/max_pool.py
@@ -46,7 +46,7 @@ def _body():
             ib = tvm.tir.ir_builder.create()
             ib.emit(
                 tvm.tir.call_extern(
-                    cc.dtype,
+                    "int32",
                     f"{func_prefix}_{uniq_id}",
                     aa.access_ptr("r"),
                     cc.access_ptr("w"),
@@ -59,7 +59,7 @@ def _reduce_reset():
             ib = tvm.tir.ir_builder.create()
             ib.emit(
                 tvm.tir.call_extern(
-                    cc.dtype, f"{func_prefix}_reset_{uniq_id}", cc.access_ptr("w"), cc.strides[0]
+                    "int32", f"{func_prefix}_reset_{uniq_id}", cc.access_ptr("w"), cc.strides[0]
                 )
             )
             return ib.get()
@@ -96,7 +96,7 @@ def max_impl(uniq_id):
 #endif
 __attribute__((always_inline)) static inline int32_t max8_reset_{uniq_id}(
     int8_t *res,
-    int N) {{
+    int32_t N) {{
   memset(res, (int8_t)-128, N * sizeof(*res));
   return 0;
 }}
@@ -107,7 +107,7 @@ def max_impl(uniq_id):
 __attribute__((always_inline)) static inline int32_t max8_loop_{uniq_id}(
     int8_t *arg,
     int8_t *res,
-    int N) {{
+    int32_t N) {{
   for ( int i = 0; i < N; ++ i )
     if ( arg[i] > res[i] )
       res[i] = arg[i];
@@ -120,7 +120,7 @@ def max_impl(uniq_id):
 __attribute__((always_inline)) static inline int32_t max8_{uniq_id}(
     int8_t *arg,
     int8_t *res,
-    int N) {{
+    int32_t N) {{
   int32_t *parg32, *pres32;
   int una_arg = (int32_t)arg & 0x3, una_res = (int32_t)res & 0x3;
   int32_t retcode = 0;
diff --git a/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/tensordot.py b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/tensordot.py
@@ -155,14 +155,14 @@ def _load_tensor_vars(halfwords, tensor_w) -> Iterator[str]:
         var_name = f"{_get_int16_alias(halfwords[i])}__{_get_int16_alias(halfwords[i+1])}"
         y, x = halfwords[i + 1] or halfwords[i]
         tensor_index = (y * tensor_w + x + offset) // 2
-        yield f"int32_t tensor__{var_name} = tensor[{tensor_index}];"
+        yield f"int32_t tensor__{var_name} = ((int32_t*)tensor)[{tensor_index}];"
 
 
 def _load_kernel_vars(halfwords) -> Iterator[str]:
     assert len(halfwords) % 2 == 0
     for i in range(0, len(halfwords), 2):
         var_name = f"{_get_int16_alias(halfwords[i])}__{_get_int16_alias(halfwords[i+1])}"
-        yield f"int32_t kernel__{var_name} = kernel[{i // 2}];"
+        yield f"int32_t kernel__{var_name} = ((int32_t*)kernel)[{i // 2}];"
 
 
 def _get_draft_macs(
@@ -280,7 +280,7 @@ def _write_sums_to_memory(num_outputs, offset, stride) -> Iterator[str]:
 
     if stride > 1:
         for i in range(num_outputs):
-            yield f"((int16_t*) output)[{i * stride + offset}] = (int16_t) requant_{i};"
+            yield f"output[{i * stride + offset}] = (int16_t) requant_{i};"
 
     else:
         num_packed = (num_outputs - offset) // 2
@@ -296,13 +296,13 @@ def _write_sums_to_memory(num_outputs, offset, stride) -> Iterator[str]:
             )
 
         if offset == 1:
-            yield "((int16_t*) output)[1] = (int16_t) requant_0;"
+            yield "output[1] = (int16_t) requant_0;"
 
         for i in range(num_packed):
             yield f"output[{offset + i}] = packed_res_{i};"
 
         if (offset + num_outputs) % 2 == 1:
-            yield f"((int16_t*) output)[{num_packed * 2}] = (int16_t) requant_{num_packed * 2};"
+            yield f"output[{num_packed * 2}] = (int16_t) requant_{num_packed * 2};"
 
 
 def tensordot_int16_impl(
@@ -390,7 +390,7 @@ def insert_lines(lines):
         #define {function_name.upper()}_EXISTS
         #include <arm_acle.h>
         __attribute__((always_inline)) static inline int32_t {function_name}(
-            int32_t *output, int32_t *tensor, int32_t *kernel, int32_t *bias, int32_t *scale
+            int16_t *output, int16_t *tensor, int16_t *kernel, int32_t *bias, int32_t *scale
         ) {{
           {_init_biased_accumulators(num_outputs)}