@@ -5891,6 +5891,325 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
     }
 }
 
+static void ggml_cuda_op_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+                                 ggml_cuda_op_t op, bool src0_needs_f32) {
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[3];
+    const int64_t nrows0 = ggml_nrows(src0);
+
+    const bool use_src1 = src1 != nullptr;
+    const int64_t ne10 = use_src1 ? src1->ne[0] : 1;
+    const int64_t ne11 = use_src1 ? src1->ne[1] : 1;
+    const int64_t ne12 = use_src1 ? src1->ne[2] : 1;
+    const int64_t ne13 = use_src1 ? src1->ne[3] : 1;
+    const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1;
+
+    GGML_ASSERT(ne03 == ne13);
+
+    const int64_t ne0 = dst->ne[0];
+    const int64_t ne1 = dst->ne[1];
+
+    const int nb2 = dst->nb[2];
+    const int nb3 = dst->nb[3];
+
+    GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT);
+    GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);
+
+    // strides for iteration over dims 3 and 2
+    const int64_t num_iters_0 = ne02 >= ne12 ? ne02*ne03 : ne12*ne13;
+    const int64_t num_iters = num_iters_0;
+    const int64_t stride_mod = 1;
+    const int64_t src0_stride = ne00 * ne01 * stride_mod;
+    const int64_t src1_stride = ne10 * ne11 * stride_mod;
+    const int64_t dst_stride = ne0 * ne1 * stride_mod;
+
+    const int64_t rows_per_iter = ne01;
+    const int64_t i03_max = ne03;
+    const int64_t i02_max = (ne02 >= ne12 ? ne02 : ne12);
+    const int64_t i02_divisor = ne02 >= ne12 ? 1 : ne12 / ne02;
+
+    const size_t src0_ts = ggml_type_size(src0->type);
+    const size_t src0_bs = ggml_blck_size(src0->type);
+
+    struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+    struct ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
+    struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
+
+    const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
+    const bool src0_is_contiguous = ggml_is_contiguous(src0);
+    const bool src0_is_f32 = src0->type == GGML_TYPE_F32;
+
+    const bool src1_is_contiguous = use_src1 && ggml_is_contiguous(src1);
+    const bool src1_stays_on_host = use_src1 && (
+        dst->op == GGML_OP_SCALE || dst->op == GGML_OP_DIAG_MASK_INF || dst->op == GGML_OP_ROPE);
+
+    const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
+    GGML_ASSERT(!(split && ne02 < ne12));
+
+    const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
+
+    // dd = data device
+    char * src0_ddq[GGML_CUDA_MAX_DEVICES] = {nullptr}; // quantized
+    float * src0_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr}; // float
+    float * src1_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr};
+    float * dst_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr};
+
+    // asq = actual size quantized, asf = actual size float
+    size_t src0_asq[GGML_CUDA_MAX_DEVICES] = {0};
+    size_t src0_asf[GGML_CUDA_MAX_DEVICES] = {0};
+    size_t src1_asf[GGML_CUDA_MAX_DEVICES] = {0};
+    size_t dst_asf[GGML_CUDA_MAX_DEVICES] = {0};
+
+    // if multiple devices are used they need to wait for the main device
+    // here an event is recorded that signifies that the main device has finished calculating the input data
+    if (split && g_device_count > 1) {
+        CUDA_CHECK(cudaSetDevice(g_main_device));
+        CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device], g_cudaStreams_main[g_main_device]));
+    }
+
+    for (int id = 0; id < g_device_count; ++id) {
+        if (!split && id != g_main_device) {
+            continue;
+        }
+
+        const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU && id == g_main_device;
+        const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
+
+        int64_t row_low, row_high;
+        if (split) {
+            const int64_t rounding = get_row_rounding(src0->type);
+
+            row_low = id == 0 ? 0 : nrows0*g_tensor_split[id];
+            row_low -= row_low % rounding;
+
+            if (id == g_device_count - 1) {
+                row_high = nrows0;
+            } else {
+                row_high = nrows0*g_tensor_split[id + 1];
+                row_high -= row_high % rounding;
+            }
+        } else {
+            row_low = 0;
+            row_high = nrows0*i02_divisor;
+        }
+        if (row_low == row_high) {
+            continue;
+        }
+
+        int64_t row_diff = row_high - row_low;
+
+        cudaSetDevice(id);
+        cudaStream_t cudaStream_main = g_cudaStreams_main[id];
+
+        // wait for main GPU data if necessary
+        if (split && id != g_main_device) {
+            CUDA_CHECK(cudaStreamWaitEvent(cudaStream_main, src0_extra->events[g_main_device]));
+        }
+
+        if (src0_on_device && src0_is_contiguous) {
+            if (src0_is_f32) {
+                src0_ddf[id] = (float *) src0_extra->data_device[id];
+            } else {
+                src0_ddq[id] = (char *) src0_extra->data_device[id];
+            }
+        } else {
+            if (src0_is_f32) {
+                src0_ddf[id] = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_asf[id]);
+            } else {
+                src0_ddq[id] = (char *) ggml_cuda_pool_malloc(row_diff*ne00 * src0_ts/src0_bs, &src0_asq[id]);
+            }
+        }
+
+        if (src0_needs_f32 && !src0_is_f32) {
+            src0_ddf[id] = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_asf[id]);
+        }
+
+        if (use_src1 && !src1_stays_on_host) {
+            if (src1_on_device && src1_is_contiguous) {
+                src1_ddf[id] = (float *) src1_extra->data_device[id];
+            } else {
+                src1_ddf[id] = (float *) ggml_cuda_pool_malloc(num_iters*src1_stride * sizeof(float), &src1_asf[id]);
+            }
+        }
+        if (dst_on_device) {
+            dst_ddf[id] = (float *) dst_extra->data_device[id];
+        } else {
+            size_t size_dst_ddf = split ? row_diff*ne1 * sizeof(float) : num_iters*dst_stride * sizeof(float);
+            dst_ddf[id] = (float *) ggml_cuda_pool_malloc(size_dst_ddf, &dst_asf[id]);
+        }
+
+        for (int64_t i03 = 0; i03 < i03_max; i03++) {
+            const int64_t i13 = i03 % ne13;
+            for (int64_t i02 = 0; i02 < i02_max; i02++) {
+                const int64_t i12 = i02 % ne12;
+
+                const int64_t i0 = i03*i02_max + i02;
+
+                // i0 values that contain the lower/upper rows for a split tensor when using multiple GPUs
+                const int64_t i0_offset_low = row_low/rows_per_iter;
+                const int64_t i0_offset_high = row_high/rows_per_iter;
+
+                int64_t i01_low = 0;
+                int64_t i01_high = rows_per_iter;
+                if (split) {
+                    if (i0 < i0_offset_low || i0 > i0_offset_high) {
+                        continue;
+                    }
+                    if (i0 == i0_offset_low) {
+                        i01_low = row_low % rows_per_iter;
+                    }
+                    if (i0 == i0_offset_high) {
+                        i01_high = row_high % rows_per_iter;
+                    }
+                }
+
+                // There is possibly a bug in the Windows nvcc compiler regarding instruction reordering or optimizing out local variables.
+                // Removing the first assert or changing the order of the arguments causes the second assert to fail.
+                // Removing both asserts results in i01_high becoming 0 which in turn results in garbage output.
+                // The root cause seems to be a problem with i0_offset_high becoming 0 when it should always be >0 (for single GPU).
+                GGML_ASSERT(i01_low == 0 || g_device_count > 1);
+                GGML_ASSERT(i01_high == rows_per_iter || g_device_count > 1);
+
+                const int64_t i01_diff = i01_high - i01_low;
+                if (i01_diff == 0) {
+                    continue;
+                }
+                const int64_t i11 = i13*ne12 + i12;
+
+                // for split tensors the data begins at i0 == i0_offset_low
+                char * src0_ddq_i = src0_ddq[id] + (i0/i02_divisor - i0_offset_low)*src0_stride*src0_ts/src0_bs;
+                float * src0_ddf_i = src0_ddf[id] + (i0/i02_divisor - i0_offset_low)*src0_stride;
+                float * src1_ddf_i = src1_ddf[id] + i11*src1_stride;
+                float * dst_ddf_i = dst_ddf[id] + (i0 - i0_offset_low)*dst_stride;
+
+                // for split tensors the data pointer needs to be rounded down
+                // to the bin edge for i03, i02 bins beyond the first
+                if (i0 - i0_offset_low > 0) {
+                    src0_ddq_i -= (row_low % ne01)*ne00 * src0_ts/src0_bs;
+                    src0_ddf_i -= (row_low % ne01)*ne00;
+                    dst_ddf_i -= (row_low % ne0)*ne1;
+                }
+
+                // the main device memory buffer can be on VRAM scratch, with space for all partial results
+                // in that case an offset on dst_ddf_i is needed
+                if (dst->backend == GGML_BACKEND_GPU && id == g_main_device) {
+                    dst_ddf_i += i01_low; // offset is 0 if no tensor split
+                }
+
+                // copy src0, src1 to device if necessary
+                if (use_src1 && !src1_stays_on_host) {
+                    if (src1->backend == GGML_BACKEND_CPU) {
+                        int64_t nrows1 = ne11;
+                        CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, 0, nrows1, cudaStream_main));
+                    } else if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) {
+                        if (id != g_main_device) {
+                            float * src1_ddf_i_source = (float *) src1_extra->data_device[g_main_device];
+                            src1_ddf_i_source += i11*src1_stride;
+                            CUDA_CHECK(cudaMemcpyAsync(src1_ddf_i, src1_ddf_i_source, src1_stride*sizeof(float),
+                                                       cudaMemcpyDeviceToDevice, cudaStream_main));
+                        }
+                    } else if (src1_on_device && !src1_is_contiguous) {
+                        GGML_ASSERT(!split);
+                        CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, 0, ne11, cudaStream_main));
+                    } else {
+                        GGML_ASSERT(false);
+                    }
+                }
+
+                if ((!src0_on_device || !src0_is_contiguous) && i02 % i02_divisor == 0) {
+                    if (src0_is_f32) {
+                        CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf_i, src0, i03, i02/i02_divisor, i01_low, i01_high, cudaStream_main));
+                    } else {
+                        CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddq_i, src0, i03, i02/i02_divisor, i01_low, i01_high, cudaStream_main));
+                    }
+                }
+
+                // convert src0 to f32 if it is necessary for the ggml_cuda_op
+                if (src0_needs_f32 && !src0_is_f32) {
+                    to_fp32_cuda(src0_ddq_i, src0_ddf_i, i01_diff*ne00, cudaStream_main);
+                    CUDA_CHECK(cudaGetLastError());
+                }
+
+                // do the computation
+                op(src0, src1, dst, src0_ddq_i, src0_ddf_i, src1_ddf_i, dst_ddf_i, i02, i01_low, i01_high, i11, cudaStream_main);
+                CUDA_CHECK(cudaGetLastError());
+
+                // copy dst to host or other device if necessary
+                if (!dst_on_device) {
+                    void * dst_off_device;
+                    cudaMemcpyKind kind;
+                    if (dst->backend == GGML_BACKEND_CPU) {
+                        dst_off_device = dst->data;
+                        kind = cudaMemcpyDeviceToHost;
+                    } else if (dst->backend == GGML_BACKEND_GPU) {
+                        dst_off_device = dst_extra->data_device[g_main_device];
+                        kind = cudaMemcpyDeviceToDevice;
+                    } else {
+                        GGML_ASSERT(false);
+                    }
+                    if (split) {
+                        // src0 = weight matrix is saved as a transposed matrix for better memory layout.
+                        // dst is NOT transposed.
+                        // The outputs of matrix matrix multiplications can therefore NOT simply be concatenated for >1 GPU.
+                        // Instead they need to be copied to the correct slice in ne0 = dst row index.
+                        // If dst is a vector with ne0 == 1 then you don't have to do this but it still produces correct results.
+                        float * dhf_dst_i = (float *) ((char *) dst_off_device + i01_low*sizeof(float) + i02*nb2 + i03*nb3);
+                        CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float), dst_ddf_i, i01_diff*sizeof(float),
+                                                     i01_diff*sizeof(float), ne1, kind, cudaStream_main));
+                    } else {
+                        float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
+                        CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_ddf_i, dst_stride*sizeof(float), kind, cudaStream_main));
+                    }
+                }
+
+                // signify to main device that other device is done
+                if (split && g_device_count > 1 && id != g_main_device) {
+                    CUDA_CHECK(cudaEventRecord(src0_extra->events[id], cudaStream_main));
+                }
+            }
+        }
+    }
+
+    // wait until each device is finished, then free their buffers
+    for (int id = 0; id < g_device_count; ++id) {
+        if (src0_asq[id] == 0 && src0_asf[id] == 0 && src1_asf[id] == 0 && dst_asf[id] == 0) {
+            continue;
+        }
+
+        CUDA_CHECK(cudaSetDevice(id));
+
+        if (src0_asq[id] > 0) {
+            ggml_cuda_pool_free(src0_ddq[id], src0_asq[id]);
+        }
+        if (src0_asf[id] > 0) {
+            ggml_cuda_pool_free(src0_ddf[id], src0_asf[id]);
+        }
+        if (src1_asf[id] > 0) {
+            ggml_cuda_pool_free(src1_ddf[id], src1_asf[id]);
+        }
+        if (dst_asf[id] > 0) {
+            ggml_cuda_pool_free(dst_ddf[id], dst_asf[id]);
+        }
+    }
+
+    // main device waits for all other devices to be finished
+    if (split && g_device_count > 1) {
+        CUDA_CHECK(cudaSetDevice(g_main_device));
+        for (int id = 0; id < g_device_count; ++id) {
+            if (id != g_main_device && src0_extra->events[id]) {
+                CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams_main[g_main_device], src0_extra->events[id]));
+            }
+        }
+    }
+
+    if (dst->backend == GGML_BACKEND_CPU) {
+        CUDA_CHECK(cudaSetDevice(g_main_device));
+        CUDA_CHECK(cudaDeviceSynchronize());
+    }
+}
+
 static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
                          ggml_cuda_op_t op, bool src0_needs_f32, bool flatten_rows) {
     const int64_t ne00 = src0->ne[0];
@@ -6327,10 +6646,10 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_
     } else if (all_on_device && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && src1->ne[1] == 1) {
         ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
     }else if (src0->type == GGML_TYPE_F32) {
-        ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
+        ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true);
     } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
         if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
-            ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_vec, false, false);
+            ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec, false);
         } else {
             int min_compute_capability = INT_MAX;
             for (int id = 0; id < g_device_count; ++id) {
@@ -6341,9 +6660,9 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_
             }
 
             if (g_mul_mat_q && ggml_is_quantized(src0->type) && min_compute_capability >= MIN_CC_DP4A) {
-                ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_q, false, false);
+                ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, false);
             } else {
-                ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
+                ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true);
             }
         }
     } else {