@@ -104,6 +104,14 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG
     }
 
 #if defined(GGML_SIMD)
+#if defined(__riscv_v_intrinsic)
+    // todo: RVV impl
+    for (int i = 0; i < n; ++i) {
+        for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
+            sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
+        }
+    }
+#else
     const int np = (n & ~(GGML_F16_STEP - 1));
 
     GGML_F16_VEC sum[GGML_VEC_DOT_UNROLL][GGML_F16_ARR] = { { GGML_F16_VEC_ZERO } };
@@ -134,6 +142,7 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG
             sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
         }
     }
+#endif
 #else
     for (int i = 0; i < n; ++i) {
         for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
@@ -228,6 +237,14 @@ inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const
 
             svst1_f32(pg, y + np2, ay1);
         }
+    #elif defined(__riscv_v_intrinsic)
+        for (int i = 0, avl; i < n; i += avl) {
+            avl = __riscv_vsetvl_e32m8(n - i);
+            vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[i], avl);
+            vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
+            vfloat32m8_t ny = __riscv_vfmadd_vf_f32m8(ax, v, ay, avl);
+            __riscv_vse32_v_f32m8(&y[i], ny, avl);
+        }
     #else
         const int np = (n & ~(GGML_F32_STEP - 1));
 
@@ -261,6 +278,13 @@ inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const
 
 inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y, const ggml_fp16_t * GGML_RESTRICT x, const float v) {
 #if defined(GGML_SIMD)
+#if defined(__riscv_v_intrinsic)
+    // todo: RVV impl
+    // scalar
+    for (int i = 0; i < n; ++i) {
+        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
+    }
+#else
     const int np = (n & ~(GGML_F16_STEP - 1));
 
     GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
@@ -282,6 +306,7 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y,
     for (int i = np; i < n; ++i) {
         y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
     }
+#endif
 #else
     // scalar
     for (int i = 0; i < n; ++i) {
@@ -309,6 +334,16 @@ inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int
                 y[i] += x[k][i]*v[k][0];
             }
         }
+    #elif defined(__riscv_v_intrinsic)
+        for (int i = 0, avl; i < n; i += avl) {
+            avl = __riscv_vsetvl_e32m8(n - i);
+            vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
+            for (int k = 0; k < GGML_VEC_MAD_UNROLL; k++) {
+                vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[k][i], avl);
+                ay = __riscv_vfmadd_vf_f32m8(ax, v[k][0], ay, avl);
+            }
+            __riscv_vse32_v_f32m8(&y[i], ay, avl);
+        }
     #else
         const int np = (n & ~(GGML_F32_STEP - 1));
 
@@ -360,6 +395,14 @@ inline static void ggml_vec_mad1_f32(const int n, float * y, const float * x, co
         for (int i = 0; i < n; ++i) {
             y[i] = x[i]*s + b;
         }
+    #elif defined(__riscv_v_intrinsic)
+        for (int i = 0, avl; i < n; i += avl) {
+            avl = __riscv_vsetvl_e32m8(n - i);
+            vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[i], avl);
+            vfloat32m8_t vb = __riscv_vfmv_v_f_f32m8(b, avl);
+            vfloat32m8_t ny = __riscv_vfmadd_vf_f32m8(ax, s, vb, avl);
+            __riscv_vse32_v_f32m8(&y[i], ny, avl);
+        }
     #else
         const int np = (n & ~(GGML_F32_STEP - 1));
 
@@ -421,6 +464,13 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
             ay1 = svmul_f32_m(pg, ay1, vx);
             svst1_f32(pg, y + np, ay1);
         }
+    #elif defined(__riscv_v_intrinsic)
+        for (int i = 0, avl; i < n; i += avl) {
+            avl = __riscv_vsetvl_e32m8(n - i);
+            vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
+            vfloat32m8_t ny = __riscv_vfmul_vf_f32m8(ay, v, avl);
+            __riscv_vse32_v_f32m8(&y[i], ny, avl);
+        }
     #else
         const int np = (n & ~(GGML_F32_STEP - 1));
 
@@ -452,6 +502,13 @@
 
 inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float v) {
 #if defined(GGML_SIMD)
+#if defined(__riscv_v_intrinsic)
+    // todo: RVV impl
+    // scalar
+    for (int i = 0; i < n; ++i) {
+        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
+    }
+#else
     const int np = (n & ~(GGML_F16_STEP - 1));
 
     GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
@@ -471,6 +528,7 @@ inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float
     for (int i = np; i < n; ++i) {
         y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
     }
+#endif
 #else
     // scalar
    for (int i = 0; i < n; ++i) {
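
Note on the remaining "todo: RVV impl" fallbacks: the fp16 helpers (ggml_vec_dot_f16_unroll, ggml_vec_mad_f16, ggml_vec_scale_f16) are left as scalar loops under GGML_SIMD for now. Below is a minimal sketch, not part of this commit, of how ggml_vec_scale_f16 and ggml_vec_mad_f16 could be vectorized. It assumes the Zvfh half-precision extension is available and that ggml_fp16_t storage is bit-compatible with _Float16 (neither assumption is established by this patch), and it rounds the multiply in fp16, unlike the scalar fallback which computes in fp32 and converts back.

// Sketch only (not part of this commit): a possible Zvfh path for the fp16 todos.
// Assumptions: __riscv_zvfh is available and ggml_fp16_t has the same bit layout
// as _Float16, so the pointer casts below are purely illustrative.
#if defined(__riscv_v_intrinsic) && defined(__riscv_zvfh)
inline static void rvv_vec_scale_f16_sketch(const int n, ggml_fp16_t * y, const float v) {
    _Float16 * yh = (_Float16 *) y;     // assumed bit-compatible reinterpretation
    const _Float16 vh = (_Float16) v;   // scale factor narrowed to fp16
    for (int i = 0, avl; i < n; i += avl) {
        avl = __riscv_vsetvl_e16m8(n - i);
        vfloat16m8_t ay = __riscv_vle16_v_f16m8(&yh[i], avl);
        vfloat16m8_t ny = __riscv_vfmul_vf_f16m8(ay, vh, avl);     // y[i] *= v, rounded in fp16
        __riscv_vse16_v_f16m8(&yh[i], ny, avl);
    }
}

inline static void rvv_vec_mad_f16_sketch(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const float v) {
    _Float16       * yh = (_Float16 *) y;
    const _Float16 * xh = (const _Float16 *) x;
    const _Float16 vh = (_Float16) v;
    for (int i = 0, avl; i < n; i += avl) {
        avl = __riscv_vsetvl_e16m8(n - i);
        vfloat16m8_t ax = __riscv_vle16_v_f16m8(&xh[i], avl);
        vfloat16m8_t ay = __riscv_vle16_v_f16m8(&yh[i], avl);
        vfloat16m8_t ny = __riscv_vfmadd_vf_f16m8(ax, vh, ay, avl); // y[i] += x[i]*v, rounded in fp16
        __riscv_vse16_v_f16m8(&yh[i], ny, avl);
    }
}
#endif

ggml_vec_dot_f16_unroll could instead keep fp32 accuracy by accumulating with the widening __riscv_vfwmacc_vv_f32m8 (fp16 inputs, fp32 accumulator) and reducing once per unrolled column with __riscv_vfredusum_vs_f32m8_f32m1, at the cost of loading the fp16 inputs at LMUL=4.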