@@ -694,6 +694,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
     const int nb2 = dst->nb[2];
     const int nb3 = dst->nb[3];
 
+    // strides for iteration over dims 3 and 2
     const int64_t src0_stride = ne00 * ne01;
     const int64_t src1_stride = ne10 * ne11;
     const int64_t dst_stride = ne0 * ne1;
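The stride comment above is worth unpacking: src0_stride, src1_stride and dst_stride count the elements in one (i03, i02) slice, so the per-slice device pointers later in the function are obtained by plain pointer arithmetic. A minimal host-side sketch of that addressing, illustrative only and not part of the patch (slice_ptr is a hypothetical helper, and the flattening i0 = i03*ne02 + i02 is assumed from the pointer math further down):

#include <cstdint>

// hypothetical helper: address one (i03, i02) slice of a contiguous f32 tensor
static float * slice_ptr(float * data, int64_t i03, int64_t i02,
                         int64_t ne00, int64_t ne01, int64_t ne02) {
    const int64_t stride = ne00*ne01;      // elements per slice, like src0_stride above
    const int64_t i0     = i03*ne02 + i02; // flattened index over dims 3 and 2
    return data + i0*stride;
}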
@@ -706,6 +707,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
     struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
     struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
 
+    // indices of the devices on which the input data is stored
     int src0_id = src0_extra == nullptr ? -1 : src0_extra->i_device;
     int src1_id = src1_extra == nullptr ? -1 : src1_extra->i_device;
 
@@ -731,12 +733,14 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
     size_t dst_asf[GGML_CUDA_MAX_DEVICES] = {0};
 
     for (int id = 0; id < g_device_count; ++id) {
+        // if data is on one device (!= -1) but not this one, continue
         if (src0_id != -1 && src0_id != id) {
             continue;
         }
         if (src1_id != -1 && src1_id != id) {
             continue;
         }
+
         bool split = src0_id == -1 && src1_id == -1;
         int64_t row_low, row_high;
         if (split) {
@@ -818,11 +822,14 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
                 cudaStream_t cudaStream_memcpy_src1 = g_cudaStreams_memcpy_src1[id][i0 % GGML_CUDA_MAX_STREAMS];
                 cudaEvent_t cudaEvent_memcpy_src1 = g_cudaEvents_memcpy_src1[id][i0 % GGML_CUDA_MAX_EVENTS];
 
+                // for split tensors the data begins at i0 == i0_offset_low
                 char * src0_ddq_i = src0_ddq[id] + (i0 - i0_offset_low)*src0_stride*src0_ts/src0_bs;
                 float * src0_ddf_i = src0_ddf[id] + (i0 - i0_offset_low)*src0_stride;
                 float * src1_ddf_i = src1_ddf[id] + i1*src1_stride;
                 float * dst_ddf_i = dst_ddf[id] + (i0 - i0_offset_low)*dst_stride;
 
+                // for split tensors the data pointer needs to be rounded down
+                // to the bin edge for i03, i02 bins beyond the first
                 if (i0 - i0_offset_low > 0) {
                     src0_ddq_i -= (row_low % ne01)*ne00 * src0_ts/src0_bs;
                     src0_ddf_i -= (row_low % ne01)*ne00;
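The "rounded down to the bin edge" correction is easiest to see with concrete numbers. The standalone sketch below is illustrative only, with made-up values for ne00, ne01 and row_low: when a device stores the global row range [row_low, row_high), its first (i03, i02) bin starts mid-slice at row_low % ne01, so the naive offset of every later bin has to be pulled back by the rows the first bin never stored.

#include <cstdint>
#include <cstdio>

int main() {
    // made-up example: rows of length ne00 = 4, ne01 = 10 rows per (i03, i02) bin,
    // and this device holds global rows [7, 25) of the split tensor
    const int64_t ne00 = 4, ne01 = 10;
    const int64_t row_low = 7;
    const int64_t i0_offset_low = row_low/ne01; // first bin stored on this device

    for (int64_t i0 = i0_offset_low; i0 < 3; ++i0) {
        int64_t offset = (i0 - i0_offset_low)*ne00*ne01; // naive per-bin offset
        if (i0 - i0_offset_low > 0) {
            offset -= (row_low % ne01)*ne00; // bin-edge correction from the patch
        }
        // prints 0, 12, 52: bin 0 holds only rows 7..9 (12 elements), so later
        // bins start 28 elements earlier than the naive offset suggests
        printf("bin i0=%lld starts at element offset %lld\n", (long long) i0, (long long) offset);
    }
    return 0;
}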
@@ -844,6 +851,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
                     }
                 }
 
+                // convert src0 to f32 if it's necessary for the ggml_cuda_op
                 if (src0_needs_f32 && !src0_is_f32) {
                     to_fp32_cuda(src0_ddq_i, src0_ddf_i, i01_diff*ne00, cudaStream_main);
                     CUDA_CHECK(cudaGetLastError());
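For context on the last added comment: to_fp32_cuda expands src0 into the f32 buffer that the op consumes when it cannot work on the quantized or f16 data directly. Below is a rough sketch of the simplest case, f16 input; the kernel and launcher names are made up, and the real to_fp32_cuda in ggml-cuda.cu is selected by dispatching on src0->type and also covers the quantized formats. Only the call shape (src, dst, element count, stream) mirrors the call above.

#include <cuda_fp16.h>
#include <cstdint>

// hypothetical kernel: one thread converts one f16 element to f32
__global__ void k_f16_to_f32(const half * x, float * y, const int64_t k) {
    const int64_t i = (int64_t) blockIdx.x*blockDim.x + threadIdx.x;
    if (i >= k) {
        return;
    }
    y[i] = __half2float(x[i]);
}

// hypothetical launcher with the same (src, dst, count, stream) shape as above
static void to_fp32_f16_cuda(const void * x, float * y, const int64_t k, cudaStream_t stream) {
    const int64_t block_size = 256;
    const int64_t num_blocks = (k + block_size - 1)/block_size;
    k_f16_to_f32<<<(unsigned int) num_blocks, (unsigned int) block_size, 0, stream>>>((const half *) x, y, k);
}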