Skip to content

Commit 0d5b895

Browse files
Fixed issue with prompt processing
1 parent ad48bda commit 0d5b895

File tree

2 files changed: 27 additions (+27) and 25 deletions (−25)

ggml-cuda.cu

Lines changed: 22 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -333,7 +333,7 @@ static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const float * y, f
333333

334334
static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
335335
const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
336-
dequantize_block<32, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
336+
dequantize_block<1, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
337337
}
338338

339339
static void convert_mul_mat_vec_f16_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
@@ -885,16 +885,19 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
885885
const int64_t i12 = i02 % ne12;
886886

887887
const int64_t i0 = i03*ne02 + i02;
888+
const int64_t i0_offset_low = row_low/ne01;
889+
const int64_t i0_offset_high = row_high/ne01;
890+
888891
int64_t i01_low = 0;
889892
int64_t i01_high = ne01;
890893
if (split) {
891-
if ((i0 + 1)*ne01 < row_low || i0*ne01 >= row_high) {
894+
if (i0 < i0_offset_low || i0 > i0_offset_high) {
892895
continue;
893896
}
894-
if (i0 == row_low/ne01) {
897+
if (i0 == i0_offset_low) {
895898
i01_low = row_low % ne01;
896899
}
897-
if (i0 == row_high/ne01) {
900+
if (i0 == i0_offset_high) {
898901
i01_high = row_high % ne01;
899902
}
900903
}
@@ -908,10 +911,18 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
908911
cudaStream_t cudaStream_memcpy_src1 = g_cudaStreams_memcpy_src1[id][i0 % GGML_CUDA_MAX_STREAMS];
909912
cudaEvent_t cudaEvent_memcpy_src1 = g_cudaEvents_memcpy_src1[id][i0 % GGML_CUDA_MAX_EVENTS];
910913

911-
char * src0_ddq_i = src0_ddq[id] + (i0 - row_low/ne01)*src0_stride;
912-
float * src0_ddf_i = src0_ddf[id] + (i0 - row_low/ne01)*src0_stride;
914+
char * src0_ddq_i = src0_ddq[id] + (i0 - i0_offset_low)*src0_stride*src0_ts/src0_bs;
915+
float * src0_ddf_i = src0_ddf[id] + (i0 - i0_offset_low)*src0_stride;
913916
float * src1_ddf_i = src1_ddf[id] + i1*src1_stride;
914-
float * dst_ddf_i = dst_ddf[id] + (i0 - row_low/ne0)*dst_stride;
917+
float * dst_ddf_i = dst_ddf[id] + (i0 - i0_offset_low)*dst_stride;
918+
919+
if (i0 - i0_offset_low > 0) {
920+
src0_ddq_i -= (row_low % ne01)*ne00*src0_ts / src0_bs;
921+
src0_ddf_i -= (row_low % ne01)*ne00;
922+
}
923+
if (i0 - i0_offset_low > 0) {
924+
dst_ddf_i -= (row_low % ne0)*ne1;
925+
}
915926

916927
// copy src0, src1 to device if necessary
917928
if (!src1_on_device) {
@@ -979,6 +990,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
979990
}
980991

981992
bool ggml_cuda_can_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
993+
GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT);
982994
(void) src0;
983995
(void) dst;
984996
return src1->backend == GGML_BACKEND_GPU;
@@ -992,6 +1004,7 @@ void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
9921004
}
9931005

9941006
bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
1007+
GGML_ASSERT(src0->backend != GGML_BACKEND_GPU);
9951008
const int64_t ne10 = src1->ne[0];
9961009

9971010
const int64_t ne0 = dst->ne[0];
@@ -1029,21 +1042,8 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_
10291042
if (src0->type == GGML_TYPE_F32) {
10301043
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true);
10311044
}
1032-
else if (src0->type == GGML_TYPE_F16) {
1033-
if (ggml_cuda_mul_mat_use_f16(src0, src1, dst)) {
1034-
// ggml_cuda_op<GGML_CUDA_OP_TYPE_QQF, ggml_cuda_op_mul_mat_cublas>(src0, src1, dst);
1035-
ggml_cuda_mul_mat_f16(src0, src1, dst, wdata, wsize);
1036-
}
1037-
else {
1038-
if (src1->ne[1] == 1) {
1039-
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
1040-
} else {
1041-
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true);
1042-
}
1043-
}
1044-
}
1045-
else if (ggml_is_quantized(src0->type)) {
1046-
if (src1->ne[1] == 1) {
1045+
else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
1046+
if (src1->ne[1] == 1 && src0->type != GGML_TYPE_F16) { // FIXME fp16 mul mat vec
10471047
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
10481048
} else {
10491049
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true);

llama.cpp

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -684,9 +684,11 @@ struct llama_model_loader {
684684
LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
685685

686686
#ifdef GGML_USE_CUBLAS
687-
struct ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
688-
extra->layer = layer;
689-
tensor->extra = extra;
687+
if (backend == GGML_BACKEND_GPU || backend == GGML_BACKEND_GPU_SPLIT) {
688+
struct ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
689+
extra->layer = layer;
690+
tensor->extra = extra;
691+
}
690692
#else
691693
(void) layer;
692694
#endif // GGML_USE_CUBLAS

Comments (0) — no commit comments