Skip to content

Commit d5d6a80

Browse files
committed
cuBLAS: improve ggml_compute_forward_mul_mat_f16_f32 with pinned memory
1 parent 2dd6dee commit d5d6a80

File tree

1 file changed

+6
-5
lines changed

1 file changed

+6
-5
lines changed

ggml.c

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8242,15 +8242,18 @@ static void ggml_compute_forward_mul_mat_f16_f32(
82428242
const int d_ne = ne11 * ne01;
82438243

82448244
size_t x_size, y_size, d_size;
8245-
float *d_X = ggml_cuda_pool_malloc(sizeof(float) * x_ne, &x_size);
8246-
float *d_Y = ggml_cuda_pool_malloc(sizeof(float) * y_ne, &y_size);
8247-
float *d_D = ggml_cuda_pool_malloc(sizeof(float) * d_ne, &d_size);
8245+
ggml_fp16_t * d_X = ggml_cuda_pool_malloc(sizeof(float) * x_ne, &x_size);
8246+
ggml_fp16_t * d_Y = ggml_cuda_pool_malloc(sizeof(float) * y_ne, &y_size);
8247+
float * d_D = ggml_cuda_pool_malloc(sizeof(float) * d_ne, &d_size);
82488248
#else
82498249
float * const wdata = params->wdata;
82508250
#endif
82518251
for (int64_t i03 = 0; i03 < ne03; i03++) {
82528252
for (int64_t i02 = 0; i02 < ne02; i02++) {
82538253
#if defined(GGML_USE_CUBLAS)
8254+
// copy src0 while converting src1
8255+
CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_X, src0, i02, i03, g_cudaStream));
8256+
82548257
// with cuBlAS, instead of converting src0 to fp32, we convert src1 to fp16
82558258
ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + (ne11 * ne10) * (i03 * ne02 + i02);
82568259
{
@@ -8274,11 +8277,9 @@ static void ggml_compute_forward_mul_mat_f16_f32(
82748277

82758278
#if defined(GGML_USE_CUBLAS)
82768279
const ggml_fp16_t * y = (ggml_fp16_t *) wdata;
8277-
82788280
float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
82798281

82808282
// copy data to device
8281-
CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_X, src0, i03, i02, g_cudaStream));
82828283
CUDA_CHECK(cudaMemcpyAsync(d_Y, y, sizeof(ggml_fp16_t) * y_ne, cudaMemcpyHostToDevice, g_cudaStream));
82838284

82848285
// compute

0 commit comments

Comments (0)