@@ -155,14 +155,14 @@ def _load_tensor_vars(halfwords, tensor_w) -> Iterator[str]:
155155 var_name = f"{ _get_int16_alias (halfwords [i ])} __{ _get_int16_alias (halfwords [i + 1 ])} "
156156 y , x = halfwords [i + 1 ] or halfwords [i ]
157157 tensor_index = (y * tensor_w + x + offset ) // 2
158- yield f"int32_t tensor__{ var_name } = tensor[{ tensor_index } ];"
158+ yield f"int32_t tensor__{ var_name } = ((int32_t*) tensor) [{ tensor_index } ];"
159159
160160
161161def _load_kernel_vars (halfwords ) -> Iterator [str ]:
162162 assert len (halfwords ) % 2 == 0
163163 for i in range (0 , len (halfwords ), 2 ):
164164 var_name = f"{ _get_int16_alias (halfwords [i ])} __{ _get_int16_alias (halfwords [i + 1 ])} "
165- yield f"int32_t kernel__{ var_name } = kernel[{ i // 2 } ];"
165+ yield f"int32_t kernel__{ var_name } = ((int32_t*) kernel) [{ i // 2 } ];"
166166
167167
168168def _get_draft_macs (
@@ -280,7 +280,7 @@ def _write_sums_to_memory(num_outputs, offset, stride) -> Iterator[str]:
280280
281281 if stride > 1 :
282282 for i in range (num_outputs ):
283- yield f"((int16_t*) output) [{ i * stride + offset } ] = (int16_t) requant_{ i } ;"
283+ yield f"output[{ i * stride + offset } ] = (int16_t) requant_{ i } ;"
284284
285285 else :
286286 num_packed = (num_outputs - offset ) // 2
@@ -296,13 +296,13 @@ def _write_sums_to_memory(num_outputs, offset, stride) -> Iterator[str]:
296296 )
297297
298298 if offset == 1 :
299- yield "((int16_t*) output) [1] = (int16_t) requant_0;"
299+ yield "output[1] = (int16_t) requant_0;"
300300
301301 for i in range (num_packed ):
302302 yield f"output[{ offset + i } ] = packed_res_{ i } ;"
303303
304304 if (offset + num_outputs ) % 2 == 1 :
305- yield f"((int16_t*) output) [{ num_packed * 2 } ] = (int16_t) requant_{ num_packed * 2 } ;"
305+ yield f"output[{ num_packed * 2 } ] = (int16_t) requant_{ num_packed * 2 } ;"
306306
307307
308308def tensordot_int16_impl (
@@ -390,7 +390,7 @@ def insert_lines(lines):
390390 #define { function_name .upper ()} _EXISTS
391391 #include <arm_acle.h>
392392 __attribute__((always_inline)) static inline int32_t { function_name } (
393- int32_t *output, int32_t *tensor, int32_t *kernel, int32_t *bias, int32_t *scale
393+ int16_t *output, int16_t *tensor, int16_t *kernel, int32_t *bias, int32_t *scale
394394 ) {{
395395 { _init_biased_accumulators (num_outputs )}
396396
0 commit comments