Commit dbddcd1

cuda_op_mul_mat
1 parent 624f06f commit dbddcd1

File tree: 1 file changed (+323 −4)

ggml-cuda.cu

Lines changed: 323 additions & 4 deletions
@@ -5891,6 +5891,325 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
     }
 }
 
+static void ggml_cuda_op_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+                                 ggml_cuda_op_t op, bool src0_needs_f32) {
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[3];
+    const int64_t nrows0 = ggml_nrows(src0);
+
+    const bool use_src1 = src1 != nullptr;
+    const int64_t ne10 = use_src1 ? src1->ne[0] : 1;
+    const int64_t ne11 = use_src1 ? src1->ne[1] : 1;
+    const int64_t ne12 = use_src1 ? src1->ne[2] : 1;
+    const int64_t ne13 = use_src1 ? src1->ne[3] : 1;
+    const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1;
+
+    GGML_ASSERT(ne03 == ne13);
+
+    const int64_t ne0 = dst->ne[0];
+    const int64_t ne1 = dst->ne[1];
+
+    const int nb2 = dst->nb[2];
+    const int nb3 = dst->nb[3];
+
+    GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT);
+    GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);
+
+    // strides for iteration over dims 3 and 2
+    const int64_t num_iters_0 = ne02 >= ne12 ? ne02*ne03 : ne12*ne13;
+    const int64_t num_iters = num_iters_0;
+    const int64_t stride_mod = 1;
+    const int64_t src0_stride = ne00 * ne01 * stride_mod;
+    const int64_t src1_stride = ne10 * ne11 * stride_mod;
+    const int64_t dst_stride = ne0 * ne1 * stride_mod;
+
+    const int64_t rows_per_iter = ne01;
+    const int64_t i03_max = ne03;
+    const int64_t i02_max = (ne02 >= ne12 ? ne02 : ne12);
+    const int64_t i02_divisor = ne02 >= ne12 ? 1 : ne12 / ne02;
+
+    const size_t src0_ts = ggml_type_size(src0->type);
+    const size_t src0_bs = ggml_blck_size(src0->type);
+
+    struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+    struct ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
+    struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
+
+    const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
+    const bool src0_is_contiguous = ggml_is_contiguous(src0);
+    const bool src0_is_f32 = src0->type == GGML_TYPE_F32;
+
+    const bool src1_is_contiguous = use_src1 && ggml_is_contiguous(src1);
+    const bool src1_stays_on_host = use_src1 && (
+        dst->op == GGML_OP_SCALE || dst->op == GGML_OP_DIAG_MASK_INF || dst->op == GGML_OP_ROPE);
+
+    const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
+    GGML_ASSERT(!(split && ne02 < ne12));
+
+    const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
+
+    // dd = data device
+    char * src0_ddq[GGML_CUDA_MAX_DEVICES] = {nullptr}; // quantized
+    float * src0_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr}; // float
+    float * src1_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr};
+    float * dst_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr};
+
+    // asq = actual size quantized, asf = actual size float
+    size_t src0_asq[GGML_CUDA_MAX_DEVICES] = {0};
+    size_t src0_asf[GGML_CUDA_MAX_DEVICES] = {0};
+    size_t src1_asf[GGML_CUDA_MAX_DEVICES] = {0};
+    size_t dst_asf[GGML_CUDA_MAX_DEVICES] = {0};
+
+    // if multiple devices are used they need to wait for the main device
+    // here an event is recorded that signifies that the main device has finished calculating the input data
+    if (split && g_device_count > 1) {
+        CUDA_CHECK(cudaSetDevice(g_main_device));
+        CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device], g_cudaStreams_main[g_main_device]));
+    }
+
+    for (int id = 0; id < g_device_count; ++id) {
+        if (!split && id != g_main_device) {
+            continue;
+        }
+
+        const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU && id == g_main_device;
+        const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
+
+        int64_t row_low, row_high;
+        if (split) {
+            const int64_t rounding = get_row_rounding(src0->type);
+
+            row_low = id == 0 ? 0 : nrows0*g_tensor_split[id];
+            row_low -= row_low % rounding;
+
+            if (id == g_device_count - 1) {
+                row_high = nrows0;
+            } else {
+                row_high = nrows0*g_tensor_split[id + 1];
+                row_high -= row_high % rounding;
+            }
+        } else {
+            row_low = 0;
+            row_high = nrows0*i02_divisor;
+        }
+        if (row_low == row_high) {
+            continue;
+        }
+
+        int64_t row_diff = row_high - row_low;
+
+        cudaSetDevice(id);
+        cudaStream_t cudaStream_main = g_cudaStreams_main[id];
+
+        // wait for main GPU data if necessary
+        if (split && id != g_main_device) {
+            CUDA_CHECK(cudaStreamWaitEvent(cudaStream_main, src0_extra->events[g_main_device]));
+        }
+
+        if (src0_on_device && src0_is_contiguous) {
+            if (src0_is_f32) {
+                src0_ddf[id] = (float *) src0_extra->data_device[id];
+            } else {
+                src0_ddq[id] = (char *) src0_extra->data_device[id];
+            }
+        } else {
+            if (src0_is_f32) {
+                src0_ddf[id] = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_asf[id]);
+            } else {
+                src0_ddq[id] = (char *) ggml_cuda_pool_malloc(row_diff*ne00 * src0_ts/src0_bs, &src0_asq[id]);
+            }
+        }
+
+        if (src0_needs_f32 && !src0_is_f32) {
+            src0_ddf[id] = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_asf[id]);
+        }
+
+        if (use_src1 && !src1_stays_on_host) {
+            if (src1_on_device && src1_is_contiguous) {
+                src1_ddf[id] = (float *) src1_extra->data_device[id];
+            } else {
+                src1_ddf[id] = (float *) ggml_cuda_pool_malloc(num_iters*src1_stride * sizeof(float), &src1_asf[id]);
+            }
+        }
+        if (dst_on_device) {
+            dst_ddf[id] = (float *) dst_extra->data_device[id];
+        } else {
+            size_t size_dst_ddf = split ? row_diff*ne1 * sizeof(float) : num_iters*dst_stride * sizeof(float);
+            dst_ddf[id] = (float *) ggml_cuda_pool_malloc(size_dst_ddf, &dst_asf[id]);
+        }
+
+        for (int64_t i03 = 0; i03 < i03_max; i03++) {
+            const int64_t i13 = i03 % ne13;
+            for (int64_t i02 = 0; i02 < i02_max; i02++) {
+                const int64_t i12 = i02 % ne12;
+
+                const int64_t i0 = i03*i02_max + i02;
+
+                // i0 values that contain the lower/upper rows for a split tensor when using multiple GPUs
+                const int64_t i0_offset_low = row_low/rows_per_iter;
+                const int64_t i0_offset_high = row_high/rows_per_iter;
+
+                int64_t i01_low = 0;
+                int64_t i01_high = rows_per_iter;
+                if (split) {
+                    if (i0 < i0_offset_low || i0 > i0_offset_high) {
+                        continue;
+                    }
+                    if (i0 == i0_offset_low) {
+                        i01_low = row_low % rows_per_iter;
+                    }
+                    if (i0 == i0_offset_high) {
+                        i01_high = row_high % rows_per_iter;
+                    }
+                }
+
+                // There is possibly a bug in the Windows nvcc compiler regarding instruction reordering or optimizing out local variables.
+                // Removing the first assert or changing the order of the arguments causes the second assert to fail.
+                // Removing both asserts results in i01_high becoming 0 which in turn results in garbage output.
+                // The root cause seems to be a problem with i0_offset_high becoming 0 when it should always be >0 (for single GPU).
+                GGML_ASSERT(i01_low == 0 || g_device_count > 1);
+                GGML_ASSERT(i01_high == rows_per_iter || g_device_count > 1);
+
+                const int64_t i01_diff = i01_high - i01_low;
+                if (i01_diff == 0) {
+                    continue;
+                }
+                const int64_t i11 = i13*ne12 + i12;
+
+                // for split tensors the data begins at i0 == i0_offset_low
+                char * src0_ddq_i = src0_ddq[id] + (i0/i02_divisor - i0_offset_low)*src0_stride*src0_ts/src0_bs;
+                float * src0_ddf_i = src0_ddf[id] + (i0/i02_divisor - i0_offset_low)*src0_stride;
+                float * src1_ddf_i = src1_ddf[id] + i11*src1_stride;
+                float * dst_ddf_i = dst_ddf[id] + (i0 - i0_offset_low)*dst_stride;
+
+                // for split tensors the data pointer needs to be rounded down
+                // to the bin edge for i03, i02 bins beyond the first
+                if (i0 - i0_offset_low > 0) {
+                    src0_ddq_i -= (row_low % ne01)*ne00 * src0_ts/src0_bs;
+                    src0_ddf_i -= (row_low % ne01)*ne00;
+                    dst_ddf_i -= (row_low % ne0)*ne1;
+                }
+
+                // the main device memory buffer can be on VRAM scratch, with space for all partial results
+                // in that case an offset on dst_ddf_i is needed
+                if (dst->backend == GGML_BACKEND_GPU && id == g_main_device) {
+                    dst_ddf_i += i01_low; // offset is 0 if no tensor split
+                }
+
+                // copy src0, src1 to device if necessary
+                if (use_src1 && !src1_stays_on_host) {
+                    if (src1->backend == GGML_BACKEND_CPU) {
+                        int64_t nrows1 = ne11;
+                        CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, 0, nrows1, cudaStream_main));
+                    } else if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) {
+                        if (id != g_main_device) {
+                            float * src1_ddf_i_source = (float *) src1_extra->data_device[g_main_device];
+                            src1_ddf_i_source += i11*src1_stride;
+                            CUDA_CHECK(cudaMemcpyAsync(src1_ddf_i, src1_ddf_i_source, src1_stride*sizeof(float),
+                                                       cudaMemcpyDeviceToDevice, cudaStream_main));
+                        }
+                    } else if (src1_on_device && !src1_is_contiguous) {
+                        GGML_ASSERT(!split);
+                        CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, 0, ne11, cudaStream_main));
+                    } else {
+                        GGML_ASSERT(false);
+                    }
+                }
+
+                if ((!src0_on_device || !src0_is_contiguous) && i02 % i02_divisor == 0) {
+                    if (src0_is_f32) {
+                        CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf_i, src0, i03, i02/i02_divisor, i01_low, i01_high, cudaStream_main));
+                    } else {
+                        CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddq_i, src0, i03, i02/i02_divisor, i01_low, i01_high, cudaStream_main));
+                    }
+                }
+
+                // convert src0 to f32 if it is necessary for the ggml_cuda_op
+                if (src0_needs_f32 && !src0_is_f32) {
+                    to_fp32_cuda(src0_ddq_i, src0_ddf_i, i01_diff*ne00, cudaStream_main);
+                    CUDA_CHECK(cudaGetLastError());
+                }
+
+                // do the computation
+                op(src0, src1, dst, src0_ddq_i, src0_ddf_i, src1_ddf_i, dst_ddf_i, i02, i01_low, i01_high, i11, cudaStream_main);
+                CUDA_CHECK(cudaGetLastError());
+
+                // copy dst to host or other device if necessary
+                if (!dst_on_device) {
+                    void * dst_off_device;
+                    cudaMemcpyKind kind;
+                    if (dst->backend == GGML_BACKEND_CPU) {
+                        dst_off_device = dst->data;
+                        kind = cudaMemcpyDeviceToHost;
+                    } else if (dst->backend == GGML_BACKEND_GPU) {
+                        dst_off_device = dst_extra->data_device[g_main_device];
+                        kind = cudaMemcpyDeviceToDevice;
+                    } else {
+                        GGML_ASSERT(false);
+                    }
+                    if (split) {
+                        // src0 = weight matrix is saved as a transposed matrix for better memory layout.
+                        // dst is NOT transposed.
+                        // The outputs of matrix matrix multiplications can therefore NOT simply be concatenated for >1 GPU.
+                        // Instead they need to be copied to the correct slice in ne0 = dst row index.
+                        // If dst is a vector with ne0 == 1 then you don't have to do this but it still produces correct results.
+                        float * dhf_dst_i = (float *) ((char *) dst_off_device + i01_low*sizeof(float) + i02*nb2 + i03*nb3);
+                        CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float), dst_ddf_i, i01_diff*sizeof(float),
+                                                     i01_diff*sizeof(float), ne1, kind, cudaStream_main));
+                    } else {
+                        float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
+                        CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_ddf_i, dst_stride*sizeof(float), kind, cudaStream_main));
+                    }
+                }
+
+                // signify to main device that other device is done
+                if (split && g_device_count > 1 && id != g_main_device) {
+                    CUDA_CHECK(cudaEventRecord(src0_extra->events[id], cudaStream_main));
+                }
+            }
+        }
+    }
+
+    // wait until each device is finished, then free their buffers
+    for (int id = 0; id < g_device_count; ++id) {
+        if (src0_asq[id] == 0 && src0_asf[id] == 0 && src1_asf[id] == 0 && dst_asf[id] == 0) {
+            continue;
+        }
+
+        CUDA_CHECK(cudaSetDevice(id));
+
+        if (src0_asq[id] > 0) {
+            ggml_cuda_pool_free(src0_ddq[id], src0_asq[id]);
+        }
+        if (src0_asf[id] > 0) {
+            ggml_cuda_pool_free(src0_ddf[id], src0_asf[id]);
+        }
+        if (src1_asf[id] > 0) {
+            ggml_cuda_pool_free(src1_ddf[id], src1_asf[id]);
+        }
+        if (dst_asf[id] > 0) {
+            ggml_cuda_pool_free(dst_ddf[id], dst_asf[id]);
+        }
+    }
+
+    // main device waits for all other devices to be finished
+    if (split && g_device_count > 1) {
+        CUDA_CHECK(cudaSetDevice(g_main_device));
+        for (int id = 0; id < g_device_count; ++id) {
+            if (id != g_main_device && src0_extra->events[id]) {
+                CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams_main[g_main_device], src0_extra->events[id]));
+            }
+        }
+    }
+
+    if (dst->backend == GGML_BACKEND_CPU) {
+        CUDA_CHECK(cudaSetDevice(g_main_device));
+        CUDA_CHECK(cudaDeviceSynchronize());
+    }
+}
+
 static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
                          ggml_cuda_op_t op, bool src0_needs_f32, bool flatten_rows) {
     const int64_t ne00 = src0->ne[0];
@@ -6327,10 +6646,10 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_
     } else if (all_on_device && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && src1->ne[1] == 1) {
         ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
     }else if (src0->type == GGML_TYPE_F32) {
-        ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
+        ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true);
     } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
         if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
-            ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_vec, false, false);
+            ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec, false);
         } else {
             int min_compute_capability = INT_MAX;
             for (int id = 0; id < g_device_count; ++id) {
@@ -6341,9 +6660,9 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_
             }
 
             if (g_mul_mat_q && ggml_is_quantized(src0->type) && min_compute_capability >= MIN_CC_DP4A) {
-                ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_q, false, false);
+                ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, false);
             } else {
-                ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
+                ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true);
             }
         }
     } else {
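For reference only, and not part of the commit: a minimal standalone sketch of the row-range computation that ggml_cuda_op_mul_mat performs for split tensors. The row count, split fractions, and rounding value below are made-up placeholders; in ggml-cuda.cu the fractions come from g_tensor_split and the rounding from get_row_rounding().

#include <cstdint>
#include <cstdio>

int main() {
    const int64_t nrows0   = 4096;              // total rows of src0 (hypothetical)
    const int     devices  = 2;
    const float   split[]  = {0.0f, 0.5f};      // cumulative split fractions per device (hypothetical)
    const int64_t rounding = 32;                // assumed row rounding for the quantization type

    for (int id = 0; id < devices; ++id) {
        // lower bound: fraction of total rows, rounded down to a block boundary
        int64_t row_low = id == 0 ? 0 : (int64_t)(nrows0*split[id]);
        row_low -= row_low % rounding;

        // upper bound: next fraction, also rounded down; the last device takes the remainder
        int64_t row_high;
        if (id == devices - 1) {
            row_high = nrows0;
        } else {
            row_high = (int64_t)(nrows0*split[id + 1]);
            row_high -= row_high % rounding;
        }

        printf("device %d: rows [%lld, %lld)\n", id, (long long) row_low, (long long) row_high);
    }
    return 0;
}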
