@@ -5891,6 +5891,325 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
     }
 }
 
+static void ggml_cuda_op_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+                                 ggml_cuda_op_t op, bool src0_needs_f32) {
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[3];
+    const int64_t nrows0 = ggml_nrows(src0);
+
+    const bool use_src1 = src1 != nullptr;
+    const int64_t ne10 = use_src1 ? src1->ne[0] : 1;
+    const int64_t ne11 = use_src1 ? src1->ne[1] : 1;
+    const int64_t ne12 = use_src1 ? src1->ne[2] : 1;
+    const int64_t ne13 = use_src1 ? src1->ne[3] : 1;
+    const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1;
+
+    GGML_ASSERT(ne03 == ne13);
+
+    const int64_t ne0 = dst->ne[0];
+    const int64_t ne1 = dst->ne[1];
+
+    const int nb2 = dst->nb[2];
+    const int nb3 = dst->nb[3];
+
+    GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT);
+    GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);
+
+    // strides for iteration over dims 3 and 2
+    const int64_t num_iters_0 = ne02 >= ne12 ? ne02*ne03 : ne12*ne13;
+    const int64_t num_iters = num_iters_0;
+    const int64_t stride_mod = 1;
+    const int64_t src0_stride = ne00 * ne01 * stride_mod;
+    const int64_t src1_stride = ne10 * ne11 * stride_mod;
+    const int64_t dst_stride = ne0 * ne1 * stride_mod;
+
+    const int64_t rows_per_iter = ne01;
+    const int64_t i03_max = ne03;
+    const int64_t i02_max = (ne02 >= ne12 ? ne02 : ne12);
+    const int64_t i02_divisor = ne02 >= ne12 ? 1 : ne12 / ne02;
+
+    const size_t src0_ts = ggml_type_size(src0->type);
+    const size_t src0_bs = ggml_blck_size(src0->type);
+
+    struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+    struct ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
+    struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
+
+    const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
+    const bool src0_is_contiguous = ggml_is_contiguous(src0);
+    const bool src0_is_f32 = src0->type == GGML_TYPE_F32;
+
+    const bool src1_is_contiguous = use_src1 && ggml_is_contiguous(src1);
+    const bool src1_stays_on_host = use_src1 && (
+        dst->op == GGML_OP_SCALE || dst->op == GGML_OP_DIAG_MASK_INF || dst->op == GGML_OP_ROPE);
+
+    const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
+    GGML_ASSERT(!(split && ne02 < ne12));
+
+    const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
+
+    // dd = data device
+    char * src0_ddq[GGML_CUDA_MAX_DEVICES] = {nullptr}; // quantized
+    float * src0_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr}; // float
+    float * src1_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr};
+    float * dst_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr};
+
+    // asq = actual size quantized, asf = actual size float
+    size_t src0_asq[GGML_CUDA_MAX_DEVICES] = {0};
+    size_t src0_asf[GGML_CUDA_MAX_DEVICES] = {0};
+    size_t src1_asf[GGML_CUDA_MAX_DEVICES] = {0};
+    size_t dst_asf[GGML_CUDA_MAX_DEVICES] = {0};
+
+    // if multiple devices are used they need to wait for the main device
+    // here an event is recorded that signifies that the main device has finished calculating the input data
+    if (split && g_device_count > 1) {
+        CUDA_CHECK(cudaSetDevice(g_main_device));
+        CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device], g_cudaStreams_main[g_main_device]));
+    }
+
+    for (int id = 0; id < g_device_count; ++id) {
+        if (!split && id != g_main_device) {
+            continue;
+        }
+
+        const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU && id == g_main_device;
+        const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
+
+        int64_t row_low, row_high;
+        if (split) {
+            const int64_t rounding = get_row_rounding(src0->type);
+
+            row_low = id == 0 ? 0 : nrows0*g_tensor_split[id];
+            row_low -= row_low % rounding;
+
+            if (id == g_device_count - 1) {
+                row_high = nrows0;
+            } else {
+                row_high = nrows0*g_tensor_split[id + 1];
+                row_high -= row_high % rounding;
+            }
+        } else {
+            row_low = 0;
+            row_high = nrows0*i02_divisor;
+        }
+        if (row_low == row_high) {
+            continue;
+        }
+
+        int64_t row_diff = row_high - row_low;
+
+        cudaSetDevice(id);
+        cudaStream_t cudaStream_main = g_cudaStreams_main[id];
+
+        // wait for main GPU data if necessary
+        if (split && id != g_main_device) {
+            CUDA_CHECK(cudaStreamWaitEvent(cudaStream_main, src0_extra->events[g_main_device]));
+        }
+
+        if (src0_on_device && src0_is_contiguous) {
+            if (src0_is_f32) {
+                src0_ddf[id] = (float *) src0_extra->data_device[id];
+            } else {
+                src0_ddq[id] = (char *) src0_extra->data_device[id];
+            }
+        } else {
+            if (src0_is_f32) {
+                src0_ddf[id] = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_asf[id]);
+            } else {
+                src0_ddq[id] = (char *) ggml_cuda_pool_malloc(row_diff*ne00 * src0_ts/src0_bs, &src0_asq[id]);
+            }
+        }
+
+        if (src0_needs_f32 && !src0_is_f32) {
+            src0_ddf[id] = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_asf[id]);
+        }
+
+        if (use_src1 && !src1_stays_on_host) {
+            if (src1_on_device && src1_is_contiguous) {
+                src1_ddf[id] = (float *) src1_extra->data_device[id];
+            } else {
+                src1_ddf[id] = (float *) ggml_cuda_pool_malloc(num_iters*src1_stride * sizeof(float), &src1_asf[id]);
+            }
+        }
+        if (dst_on_device) {
+            dst_ddf[id] = (float *) dst_extra->data_device[id];
+        } else {
+            size_t size_dst_ddf = split ? row_diff*ne1 * sizeof(float) : num_iters*dst_stride * sizeof(float);
+            dst_ddf[id] = (float *) ggml_cuda_pool_malloc(size_dst_ddf, &dst_asf[id]);
+        }
+
+        for (int64_t i03 = 0; i03 < i03_max; i03++) {
+            const int64_t i13 = i03 % ne13;
+            for (int64_t i02 = 0; i02 < i02_max; i02++) {
+                const int64_t i12 = i02 % ne12;
+
+                const int64_t i0 = i03*i02_max + i02;
+
+                // i0 values that contain the lower/upper rows for a split tensor when using multiple GPUs
+                const int64_t i0_offset_low = row_low/rows_per_iter;
+                const int64_t i0_offset_high = row_high/rows_per_iter;
+
+                int64_t i01_low = 0;
+                int64_t i01_high = rows_per_iter;
+                if (split) {
+                    if (i0 < i0_offset_low || i0 > i0_offset_high) {
+                        continue;
+                    }
+                    if (i0 == i0_offset_low) {
+                        i01_low = row_low % rows_per_iter;
+                    }
+                    if (i0 == i0_offset_high) {
+                        i01_high = row_high % rows_per_iter;
+                    }
+                }
+
+                // There is possibly a bug in the Windows nvcc compiler regarding instruction reordering or optimizing out local variables.
+                // Removing the first assert or changing the order of the arguments causes the second assert to fail.
+                // Removing both asserts results in i01_high becoming 0 which in turn results in garbage output.
+                // The root cause seems to be a problem with i0_offset_high becoming 0 when it should always be >0 (for single GPU).
+                GGML_ASSERT(i01_low == 0 || g_device_count > 1);
+                GGML_ASSERT(i01_high == rows_per_iter || g_device_count > 1);
+
+                const int64_t i01_diff = i01_high - i01_low;
+                if (i01_diff == 0) {
+                    continue;
+                }
+                const int64_t i11 = i13*ne12 + i12;
+
+                // for split tensors the data begins at i0 == i0_offset_low
+                char * src0_ddq_i = src0_ddq[id] + (i0/i02_divisor - i0_offset_low)*src0_stride*src0_ts/src0_bs;
+                float * src0_ddf_i = src0_ddf[id] + (i0/i02_divisor - i0_offset_low)*src0_stride;
+                float * src1_ddf_i = src1_ddf[id] + i11*src1_stride;
+                float * dst_ddf_i = dst_ddf[id] + (i0 - i0_offset_low)*dst_stride;
+
+                // for split tensors the data pointer needs to be rounded down
+                // to the bin edge for i03, i02 bins beyond the first
+                if (i0 - i0_offset_low > 0) {
+                    src0_ddq_i -= (row_low % ne01)*ne00 * src0_ts/src0_bs;
+                    src0_ddf_i -= (row_low % ne01)*ne00;
+                    dst_ddf_i -= (row_low % ne0)*ne1;
+                }
+
+                // the main device memory buffer can be on VRAM scratch, with space for all partial results
+                // in that case an offset on dst_ddf_i is needed
+                if (dst->backend == GGML_BACKEND_GPU && id == g_main_device) {
+                    dst_ddf_i += i01_low; // offset is 0 if no tensor split
+                }
+
+                // copy src0, src1 to device if necessary
+                if (use_src1 && !src1_stays_on_host) {
+                    if (src1->backend == GGML_BACKEND_CPU) {
+                        int64_t nrows1 = ne11;
+                        CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, 0, nrows1, cudaStream_main));
+                    } else if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) {
+                        if (id != g_main_device) {
+                            float * src1_ddf_i_source = (float *) src1_extra->data_device[g_main_device];
+                            src1_ddf_i_source += i11*src1_stride;
+                            CUDA_CHECK(cudaMemcpyAsync(src1_ddf_i, src1_ddf_i_source, src1_stride*sizeof(float),
+                                                       cudaMemcpyDeviceToDevice, cudaStream_main));
+                        }
+                    } else if (src1_on_device && !src1_is_contiguous) {
+                        GGML_ASSERT(!split);
+                        CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, 0, ne11, cudaStream_main));
+                    } else {
+                        GGML_ASSERT(false);
+                    }
+                }
+
+                if ((!src0_on_device || !src0_is_contiguous) && i02 % i02_divisor == 0) {
+                    if (src0_is_f32) {
+                        CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf_i, src0, i03, i02/i02_divisor, i01_low, i01_high, cudaStream_main));
+                    } else {
+                        CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddq_i, src0, i03, i02/i02_divisor, i01_low, i01_high, cudaStream_main));
+                    }
+                }
+
+                // convert src0 to f32 if it is necessary for the ggml_cuda_op
+                if (src0_needs_f32 && !src0_is_f32) {
+                    to_fp32_cuda(src0_ddq_i, src0_ddf_i, i01_diff*ne00, cudaStream_main);
+                    CUDA_CHECK(cudaGetLastError());
+                }
+
+                // do the computation
+                op(src0, src1, dst, src0_ddq_i, src0_ddf_i, src1_ddf_i, dst_ddf_i, i02, i01_low, i01_high, i11, cudaStream_main);
+                CUDA_CHECK(cudaGetLastError());
+
+                // copy dst to host or other device if necessary
+                if (!dst_on_device) {
+                    void * dst_off_device;
+                    cudaMemcpyKind kind;
+                    if (dst->backend == GGML_BACKEND_CPU) {
+                        dst_off_device = dst->data;
+                        kind = cudaMemcpyDeviceToHost;
+                    } else if (dst->backend == GGML_BACKEND_GPU) {
+                        dst_off_device = dst_extra->data_device[g_main_device];
+                        kind = cudaMemcpyDeviceToDevice;
+                    } else {
+                        GGML_ASSERT(false);
+                    }
+                    if (split) {
+                        // src0 = weight matrix is saved as a transposed matrix for better memory layout.
+                        // dst is NOT transposed.
+                        // The outputs of matrix matrix multiplications can therefore NOT simply be concatenated for >1 GPU.
+                        // Instead they need to be copied to the correct slice in ne0 = dst row index.
+                        // If dst is a vector with ne0 == 1 then you don't have to do this but it still produces correct results.
+                        float * dhf_dst_i = (float *) ((char *) dst_off_device + i01_low*sizeof(float) + i02*nb2 + i03*nb3);
+                        CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float), dst_ddf_i, i01_diff*sizeof(float),
+                                                     i01_diff*sizeof(float), ne1, kind, cudaStream_main));
+                    } else {
+                        float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
+                        CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_ddf_i, dst_stride*sizeof(float), kind, cudaStream_main));
+                    }
+                }
+
+                // signify to main device that other device is done
+                if (split && g_device_count > 1 && id != g_main_device) {
+                    CUDA_CHECK(cudaEventRecord(src0_extra->events[id], cudaStream_main));
+                }
+            }
+        }
+    }
+
+    // wait until each device is finished, then free their buffers
+    for (int id = 0; id < g_device_count; ++id) {
+        if (src0_asq[id] == 0 && src0_asf[id] == 0 && src1_asf[id] == 0 && dst_asf[id] == 0) {
+            continue;
+        }
+
+        CUDA_CHECK(cudaSetDevice(id));
+
+        if (src0_asq[id] > 0) {
+            ggml_cuda_pool_free(src0_ddq[id], src0_asq[id]);
+        }
+        if (src0_asf[id] > 0) {
+            ggml_cuda_pool_free(src0_ddf[id], src0_asf[id]);
+        }
+        if (src1_asf[id] > 0) {
+            ggml_cuda_pool_free(src1_ddf[id], src1_asf[id]);
+        }
+        if (dst_asf[id] > 0) {
+            ggml_cuda_pool_free(dst_ddf[id], dst_asf[id]);
+        }
+    }
+
+    // main device waits for all other devices to be finished
+    if (split && g_device_count > 1) {
+        CUDA_CHECK(cudaSetDevice(g_main_device));
+        for (int id = 0; id < g_device_count; ++id) {
+            if (id != g_main_device && src0_extra->events[id]) {
+                CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams_main[g_main_device], src0_extra->events[id]));
+            }
+        }
+    }
+
+    if (dst->backend == GGML_BACKEND_CPU) {
+        CUDA_CHECK(cudaSetDevice(g_main_device));
+        CUDA_CHECK(cudaDeviceSynchronize());
+    }
+}
+
 static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
                          ggml_cuda_op_t op, bool src0_needs_f32, bool flatten_rows) {
     const int64_t ne00 = src0->ne[0];
@@ -6327,10 +6646,10 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_
     } else if (all_on_device && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && src1->ne[1] == 1) {
         ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
     }else if (src0->type == GGML_TYPE_F32) {
-        ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
+        ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true);
     } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
         if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
-            ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_vec, false, false);
+            ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec, false);
         } else {
             int min_compute_capability = INT_MAX;
             for (int id = 0; id < g_device_count; ++id) {
@@ -6341,9 +6660,9 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_
             }
 
             if (g_mul_mat_q && ggml_is_quantized(src0->type) && min_compute_capability >= MIN_CC_DP4A) {
-                ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_q, false, false);
+                ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, false);
             } else {
-                ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
+                ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true);
             }
         }
     } else {