@@ -333,7 +333,7 @@ static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const float * y, f
333333
334334static void convert_fp16_to_fp32_cuda (const void * vx, float * y, const int k, cudaStream_t stream) {
335335 const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1 ) / CUDA_DEQUANTIZE_BLOCK_SIZE;
336- dequantize_block<32 , 1 , convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0 , stream>>> (vx, y, k);
336+ dequantize_block<1 , 1 , convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0 , stream>>> (vx, y, k);
337337}
338338
339339static void convert_mul_mat_vec_f16_cuda (const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
@@ -885,16 +885,19 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
885885 const int64_t i12 = i02 % ne12;
886886
887887 const int64_t i0 = i03*ne02 + i02;
888+ const int64_t i0_offset_low = row_low/ne01;
889+ const int64_t i0_offset_high = row_high/ne01;
890+
888891 int64_t i01_low = 0 ;
889892 int64_t i01_high = ne01;
890893 if (split) {
891- if (( i0 + 1 )*ne01 < row_low || i0*ne01 >= row_high ) {
894+ if (i0 < i0_offset_low || i0 > i0_offset_high ) {
892895 continue ;
893896 }
894- if (i0 == row_low/ne01 ) {
897+ if (i0 == i0_offset_low ) {
895898 i01_low = row_low % ne01;
896899 }
897- if (i0 == row_high/ne01 ) {
900+ if (i0 == i0_offset_high ) {
898901 i01_high = row_high % ne01;
899902 }
900903 }
@@ -908,10 +911,18 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
908911 cudaStream_t cudaStream_memcpy_src1 = g_cudaStreams_memcpy_src1[id][i0 % GGML_CUDA_MAX_STREAMS];
909912 cudaEvent_t cudaEvent_memcpy_src1 = g_cudaEvents_memcpy_src1[id][i0 % GGML_CUDA_MAX_EVENTS];
910913
911- char * src0_ddq_i = src0_ddq[id] + (i0 - row_low/ne01 )*src0_stride;
912- float * src0_ddf_i = src0_ddf[id] + (i0 - row_low/ne01 )*src0_stride;
914+ char * src0_ddq_i = src0_ddq[id] + (i0 - i0_offset_low )*src0_stride*src0_ts/src0_bs ;
915+ float * src0_ddf_i = src0_ddf[id] + (i0 - i0_offset_low )*src0_stride;
913916 float * src1_ddf_i = src1_ddf[id] + i1*src1_stride;
914- float * dst_ddf_i = dst_ddf[id] + (i0 - row_low/ne0)*dst_stride;
917+ float * dst_ddf_i = dst_ddf[id] + (i0 - i0_offset_low)*dst_stride;
918+
919+ if (i0 - i0_offset_low > 0 ) {
920+ src0_ddq_i -= (row_low % ne01)*ne00*src0_ts / src0_bs;
921+ src0_ddf_i -= (row_low % ne01)*ne00;
922+ }
923+ if (i0 - i0_offset_low > 0 ) {
924+ dst_ddf_i -= (row_low % ne0)*ne1;
925+ }
915926
916927 // copy src0, src1 to device if necessary
917928 if (!src1_on_device) {
@@ -979,6 +990,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
979990}
980991
981992bool ggml_cuda_can_mul (const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
993+ GGML_ASSERT (src1->backend != GGML_BACKEND_GPU_SPLIT);
982994 (void ) src0;
983995 (void ) dst;
984996 return src1->backend == GGML_BACKEND_GPU;
@@ -992,6 +1004,7 @@ void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
9921004}
9931005
9941006bool ggml_cuda_can_mul_mat (const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
1007+ GGML_ASSERT (src0->backend != GGML_BACKEND_GPU);
9951008 const int64_t ne10 = src1->ne [0 ];
9961009
9971010 const int64_t ne0 = dst->ne [0 ];
@@ -1029,21 +1042,8 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_
10291042 if (src0->type == GGML_TYPE_F32) {
10301043 ggml_cuda_op (src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true );
10311044 }
1032- else if (src0->type == GGML_TYPE_F16) {
1033- if (ggml_cuda_mul_mat_use_f16 (src0, src1, dst)) {
1034- // ggml_cuda_op<GGML_CUDA_OP_TYPE_QQF, ggml_cuda_op_mul_mat_cublas>(src0, src1, dst);
1035- ggml_cuda_mul_mat_f16 (src0, src1, dst, wdata, wsize);
1036- }
1037- else {
1038- if (src1->ne [1 ] == 1 ) {
1039- ggml_cuda_op (src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false );
1040- } else {
1041- ggml_cuda_op (src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true );
1042- }
1043- }
1044- }
1045- else if (ggml_is_quantized (src0->type )) {
1046- if (src1->ne [1 ] == 1 ) {
1045+ else if (ggml_is_quantized (src0->type ) || src0->type == GGML_TYPE_F16) {
1046+ if (src1->ne [1 ] == 1 && src0->type != GGML_TYPE_F16) { // FIXME fp16 mul mat vec
10471047 ggml_cuda_op (src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false );
10481048 } else {
10491049 ggml_cuda_op (src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true );
0 commit comments