Skip to content

Commit b0570b1

Browse files
Added comments
1 parent bda86fa commit b0570b1

File tree

1 file changed

+8
-0
lines changed

1 file changed

+8
-0
lines changed

ggml-cuda.cu

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -694,6 +694,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
694694
const int nb2 = dst->nb[2];
695695
const int nb3 = dst->nb[3];
696696

697+
// strides for iteration over dims 3 and 2
697698
const int64_t src0_stride = ne00 * ne01;
698699
const int64_t src1_stride = ne10 * ne11;
699700
const int64_t dst_stride = ne0 * ne1;
@@ -706,6 +707,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
706707
struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
707708
struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
708709

710+
// indices of the devices on which the input data is stored
709711
int src0_id = src0_extra == nullptr ? -1 : src0_extra->i_device;
710712
int src1_id = src1_extra == nullptr ? -1 : src1_extra->i_device;
711713

@@ -731,12 +733,14 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
731733
size_t dst_asf[GGML_CUDA_MAX_DEVICES] = {0};
732734

733735
for (int id = 0; id < g_device_count; ++id) {
736+
// if data is on one device (!= -1) but not this one, continue
734737
if (src0_id != -1 && src0_id != id) {
735738
continue;
736739
}
737740
if (src1_id != -1 && src1_id != id) {
738741
continue;
739742
}
743+
740744
bool split = src0_id == -1 && src1_id == -1;
741745
int64_t row_low, row_high;
742746
if (split) {
@@ -818,11 +822,14 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
818822
cudaStream_t cudaStream_memcpy_src1 = g_cudaStreams_memcpy_src1[id][i0 % GGML_CUDA_MAX_STREAMS];
819823
cudaEvent_t cudaEvent_memcpy_src1 = g_cudaEvents_memcpy_src1[id][i0 % GGML_CUDA_MAX_EVENTS];
820824

825+
// for split tensors the data begins at i0 == i0_offset_low
821826
char * src0_ddq_i = src0_ddq[id] + (i0 - i0_offset_low)*src0_stride*src0_ts/src0_bs;
822827
float * src0_ddf_i = src0_ddf[id] + (i0 - i0_offset_low)*src0_stride;
823828
float * src1_ddf_i = src1_ddf[id] + i1*src1_stride;
824829
float * dst_ddf_i = dst_ddf[id] + (i0 - i0_offset_low)*dst_stride;
825830

831+
// for split tensors the data pointer needs to be rounded down
832+
// to the bin edge for i03, i02 bins beyond the first
826833
if (i0 - i0_offset_low > 0) {
827834
src0_ddq_i -= (row_low % ne01)*ne00 * src0_ts/src0_bs;
828835
src0_ddf_i -= (row_low % ne01)*ne00;
@@ -844,6 +851,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
844851
}
845852
}
846853

854+
// convert src0 to f32 if it's necessary for the ggml_cuda_op
847855
if (src0_needs_f32 && !src0_is_f32) {
848856
to_fp32_cuda(src0_ddq_i, src0_ddf_i, i01_diff*ne00, cudaStream_main);
849857
CUDA_CHECK(cudaGetLastError());

0 commit comments

Comments
 (0)