@@ -694,6 +694,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
     const int nb2 = dst->nb[2];
     const int nb3 = dst->nb[3];

+    // strides for iteration over dims 3 and 2
     const int64_t src0_stride = ne00 * ne01;
     const int64_t src1_stride = ne10 * ne11;
     const int64_t dst_stride = ne0 * ne1;
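The strides added above count the elements in one dim-0 x dim-1 plane of a contiguous tensor, so advancing by one stride moves to the next (i02, i03) slice. A minimal illustration (hypothetical helper, not part of the patch) of how such a stride addresses a flattened 4D tensor:

// sketch only: locate the (i02, i03) slice of a contiguous 4D float tensor
static float * slice_ptr(float * data, int64_t ne0, int64_t ne1, int64_t ne2, int64_t i02, int64_t i03) {
    const int64_t stride = ne0 * ne1;          // elements per slice, like src0_stride/dst_stride above
    return data + (i03 * ne2 + i02) * stride;  // dims 3 and 2 collapsed into a single offset
}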
@@ -706,6 +707,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
     struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
     struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;

+    // indices of the devices on which the input data is stored
     int src0_id = src0_extra == nullptr ? -1 : src0_extra->i_device;
     int src1_id = src1_extra == nullptr ? -1 : src1_extra->i_device;
@@ -731,12 +733,14 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
     size_t dst_asf[GGML_CUDA_MAX_DEVICES] = {0};

     for (int id = 0; id < g_device_count; ++id) {
+        // if data is on one device (!= -1) but not this one, continue
         if (src0_id != -1 && src0_id != id) {
             continue;
         }
         if (src1_id != -1 && src1_id != id) {
             continue;
         }
+
         bool split = src0_id == -1 && src1_id == -1;
         int64_t row_low, row_high;
         if (split) {
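The two continue checks above encode the device convention described in the new comments: an id of -1 means the tensor is not pinned to one GPU, so the work is treated as split (see the split flag right after) and every device handles its own rows, while a non-negative id restricts the work to the device that actually holds the data. A minimal sketch of that condition (hypothetical helper, not part of the patch):

// sketch only: should this device take part in processing the tensor?
static bool device_participates(int tensor_device_id, int this_device) {
    // -1 means split/unassigned, so every device participates;
    // otherwise only the device storing the data does the work
    return tensor_device_id == -1 || tensor_device_id == this_device;
}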
@@ -818,11 +822,14 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
             cudaStream_t cudaStream_memcpy_src1 = g_cudaStreams_memcpy_src1[id][i0 % GGML_CUDA_MAX_STREAMS];
             cudaEvent_t cudaEvent_memcpy_src1 = g_cudaEvents_memcpy_src1[id][i0 % GGML_CUDA_MAX_EVENTS];

+            // for split tensors the data begins at i0 == i0_offset_low
             char * src0_ddq_i = src0_ddq[id] + (i0 - i0_offset_low)*src0_stride*src0_ts/src0_bs;
             float * src0_ddf_i = src0_ddf[id] + (i0 - i0_offset_low)*src0_stride;
             float * src1_ddf_i = src1_ddf[id] + i1*src1_stride;
             float * dst_ddf_i = dst_ddf[id] + (i0 - i0_offset_low)*dst_stride;

+            // for split tensors the data pointer needs to be rounded down
+            // to the bin edge for i03, i02 bins beyond the first
             if (i0 - i0_offset_low > 0) {
                 src0_ddq_i -= (row_low % ne01)*ne00 * src0_ts/src0_bs;
                 src0_ddf_i -= (row_low % ne01)*ne00;
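The correction in the hunk above follows the new comment: the buffer of a split tensor on this device starts at row_low, which can lie inside the first ne01-row bin, so for every i03/i02 bin beyond the first the pointers are moved back by the partial-row offset row_low % ne01 (times the row length ne00) to land on the bin edge. A worked example with hypothetical sizes:

// illustration only, hypothetical numbers: ne00 = 4096 elements per row, ne01 = 32 rows per bin
const int64_t ne00 = 4096, ne01 = 32;
const int64_t row_low = 40;                        // this device's first row, 8 rows into its bin
const int64_t rows_past_edge = row_low % ne01;     // 40 % 32 = 8 partial rows
const int64_t correction = rows_past_edge * ne00;  // 32768 elements subtracted from src0_ddf_i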
@@ -844,6 +851,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
                 }
             }

+            // convert src0 to f32 if it's necessary for the ggml_cuda_op
             if (src0_needs_f32 && !src0_is_f32) {
                 to_fp32_cuda(src0_ddq_i, src0_ddf_i, i01_diff*ne00, cudaStream_main);
                 CUDA_CHECK(cudaGetLastError());
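The final comment documents the dequantization step: when the selected op needs f32 input but src0 is not already f32, the raw data in src0_ddq_i is converted into the f32 buffer src0_ddf_i before the op runs. A sketch of how those two flags might end up set (hypothetical values and definitions, not taken from the patch):

// hypothetical example: an op implemented only for f32 inputs receives a quantized src0
const bool src0_is_f32    = src0->type == GGML_TYPE_F32;  // e.g. false for a Q4_0 weight matrix
const bool src0_needs_f32 = true;                         // the chosen kernel only reads f32
// => the branch above dequantizes src0_ddq_i into src0_ddf_i before launching the kernel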