@@ -181,11 +181,11 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
     do { \
         cudaError_t err_ = (err); \
         if (err_ != cudaSuccess) { \
-            int dev_id; \
-            cudaGetDevice(&dev_id); \
+            int id; \
+            cudaGetDevice(&id); \
             fprintf(stderr, "\nCUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \
                 cudaGetErrorString(err_)); \
-            fprintf(stderr, "current device: %d\n", dev_id); \
+            fprintf(stderr, "current device: %d\n", id); \
             exit(1); \
         } \
     } while (0)
@@ -195,11 +195,11 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
     do { \
         cublasStatus_t err_ = (err); \
         if (err_ != CUBLAS_STATUS_SUCCESS) { \
-            int dev_id; \
-            cudaGetDevice(&dev_id); \
+            int id; \
+            cudaGetDevice(&id); \
             fprintf(stderr, "\ncuBLAS error %d at %s:%d: %s\n", \
                 err_, __FILE__, __LINE__, cublasGetStatusString(err_)); \
-            fprintf(stderr, "current device: %d\n", dev_id); \
+            fprintf(stderr, "current device: %d\n", id); \
             exit(1); \
         } \
     } while (0)
@@ -465,7 +465,6 @@ static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUA

 #define MAX_STREAMS 8
 static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = { nullptr };
-static cudaMemPool_t g_cudaMemPools[GGML_CUDA_MAX_DEVICES] = { nullptr };

 struct ggml_tensor_extra_gpu {
     void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
@@ -5774,16 +5773,6 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
     return ptr;
 }

-static void * ggml_cuda_pool_malloc_async(size_t size, size_t * actual_size, int id, cudaStream_t stream) {
-    if (g_cudaMemPools[id] == nullptr) {
-        return ggml_cuda_pool_malloc(size, actual_size);
-    }
-    void *ptr;
-    CUDA_CHECK(cudaMallocFromPoolAsync(&ptr, size, g_cudaMemPools[id], stream));
-    *actual_size = size;
-    return ptr;
-}
-
 static void ggml_cuda_pool_free(void * ptr, size_t size) {
     scoped_spin_lock lock(g_cuda_pool_lock);
     int id;
@@ -5802,13 +5791,6 @@ static void ggml_cuda_pool_free(void * ptr, size_t size) {
 }


-static void ggml_cuda_pool_free_async(void * ptr, size_t actual_size, int id, cudaStream_t stream) {
-    if (g_cudaMemPools[id] == nullptr) {
-        return ggml_cuda_pool_free(ptr, actual_size);
-    }
-    CUDA_CHECK(cudaFreeAsync(ptr, stream));
-}
-
 void ggml_init_cublas() {
     static bool initialized = false;

@@ -5863,13 +5845,6 @@ void ggml_init_cublas() {
         // create cublas handle
         CUBLAS_CHECK(cublasCreate(&g_cublas_handles[id]));
         CUBLAS_CHECK(cublasSetMathMode(g_cublas_handles[id], CUBLAS_TF32_TENSOR_OP_MATH));
-
-        // configure memory pool
-        cudaError_t err = cudaDeviceGetMemPool(&g_cudaMemPools[id], id);
-        if (err == cudaSuccess) {
-            size_t treshold = UINT64_MAX;
-            CUDA_CHECK(cudaMemPoolSetAttribute(g_cudaMemPools[id], cudaMemPoolAttrReleaseThreshold, &treshold));
-        }
     }

     // configure logging to stdout
@@ -6463,7 +6438,7 @@ inline void ggml_cuda_op_mul_mat_cublas(
             const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src0->type);
             GGML_ASSERT(to_fp16_cuda != nullptr);
             size_t ne = row_diff*ne00;
-            src0_as_f16 = (half *) ggml_cuda_pool_malloc_async(ne * sizeof(half), &src0_as, id, stream);
+            src0_as_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &src0_as);
             to_fp16_cuda(src0_dd_i, src0_as_f16, ne, stream);
         }
         const half * src0_ptr = src0->type == GGML_TYPE_F16 ? (const half *) src0_dd_i : src0_as_f16;
@@ -6474,12 +6449,13 @@ inline void ggml_cuda_op_mul_mat_cublas(
             const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
             GGML_ASSERT(to_fp16_cuda != nullptr);
             size_t ne = src1_ncols*ne10;
-            src1_as_f16 = (half *) ggml_cuda_pool_malloc_async(ne * sizeof(half), &src1_as, id, stream);
+            src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &src1_as);
             to_fp16_cuda(src1_ddf_i, src1_as_f16, ne, stream);
         }
         const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddq_i : src1_as_f16;
-        size_t dst_f16_as = 0;
-        half * dst_f16 = (half *) ggml_cuda_pool_malloc_async(row_diff*src1_ncols * sizeof(half), &dst_f16_as, id, stream);
+
+        size_t dst_as = 0;
+        half * dst_f16 = (half *) ggml_cuda_pool_malloc(row_diff*src1_ncols * sizeof(half), &dst_as);

         const half alpha_f16 = 1.0f;
         const half beta_f16 = 0.0f;
@@ -6497,15 +6473,14 @@ inline void ggml_cuda_op_mul_mat_cublas(
         const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
         to_fp32_cuda(dst_f16, dst_dd_i, row_diff*src1_ncols, stream);

-        if (dst_f16_as != 0) {
-            ggml_cuda_pool_free_async(dst_f16, dst_f16_as, id, stream);
-        }
+        ggml_cuda_pool_free(dst_f16, dst_as);

         if (src0_as != 0) {
-            ggml_cuda_pool_free_async(src0_as_f16, src0_as, id, stream);
+            ggml_cuda_pool_free(src0_as_f16, src0_as);
         }
+
         if (src1_as != 0) {
-            ggml_cuda_pool_free_async(src1_as_f16, src1_as, id, stream);
+            ggml_cuda_pool_free(src1_as_f16, src1_as);
         }
     }
     else {
@@ -6515,7 +6490,7 @@ inline void ggml_cuda_op_mul_mat_cublas(
         if (src0->type != GGML_TYPE_F32) {
             const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
             GGML_ASSERT(to_fp32_cuda != nullptr);
-            src0_ddq_as_f32 = (float *) ggml_cuda_pool_malloc_async(row_diff*ne00 * sizeof(float), &src0_as, id, stream); // NOLINT
+            src0_ddq_as_f32 = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_as); // NOLINT
             to_fp32_cuda(src0_dd_i, src0_ddq_as_f32, row_diff*ne00, stream);
         }
         const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32;
@@ -6532,7 +6507,7 @@ inline void ggml_cuda_op_mul_mat_cublas(
                     &beta, dst_dd_i, ldc));

         if (src0_as != 0) {
-            ggml_cuda_pool_free_async(src0_ddq_as_f32, src0_as, id, stream);
+            ggml_cuda_pool_free(src0_ddq_as_f32, src0_as);
         }
     }

@@ -6955,30 +6930,29 @@ static void ggml_cuda_op_mul_mat(
             src0_dd[id] = (char *) src0_extra->data_device[id];
         } else {
             const size_t size_src0_ddq = split ? (row_high[id]-row_low[id])*ne00 * src0_ts/src0_bs : ggml_nbytes(src0);
-            src0_dd[id] = (char *) ggml_cuda_pool_malloc_async(ggml_nbytes(src0), &src0_as[id], id, stream);
+            src0_dd[id] = (char *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_as[id]);
         }

         if (src1_on_device && src1_is_contiguous) {
             src1_ddf[id] = (float *) src1_extra->data_device[id];
         } else {
-            src1_ddf[id] = (float *) ggml_cuda_pool_malloc_async(ggml_nbytes(src1), &src1_asf[id], id, stream);
+            src1_ddf[id] = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src1), &src1_asf[id]);
         }

         if (convert_src1_to_q8_1) {
-            const size_t size_dst_ddq = nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs;
-            src1_ddq[id] = (char *) ggml_cuda_pool_malloc_async(size_dst_ddq, &src1_asq[id], id, stream);
+            src1_ddq[id] = (char *) ggml_cuda_pool_malloc(nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs, &src1_asq[id]);

             if (src1_on_device && src1_is_contiguous) {
                 quantize_row_q8_1_cuda(src1_ddf[id], src1_ddq[id], ne10, nrows1, src1_padded_col_size, stream);
-                // CUDA_CHECK(cudaGetLastError());
+                CUDA_CHECK(cudaGetLastError());
             }
         }

         if (dst_on_device) {
             dst_dd[id] = (float *) dst_extra->data_device[id];
         } else {
             const size_t size_dst_ddf = split ? (row_high[id]-row_low[id])*ne1*sizeof(float) : ggml_nbytes(dst);
-            dst_dd[id] = (float *) ggml_cuda_pool_malloc_async(size_dst_ddf, &dst_as[id], id, stream);
+            dst_dd[id] = (float *) ggml_cuda_pool_malloc(size_dst_ddf, &dst_as[id]);
         }
     }

@@ -7104,6 +7078,24 @@ static void ggml_cuda_op_mul_mat(
         }
     }

+    for (int64_t id = 0; id < g_device_count; ++id) {
+        CUDA_CHECK(ggml_cuda_set_device(id));
+
+        // free buffers again when done
+        if (src0_as[id] > 0) {
+            ggml_cuda_pool_free(src0_dd[id], src0_as[id]);
+        }
+        if (src1_asf[id] > 0) {
+            ggml_cuda_pool_free(src1_ddf[id], src1_asf[id]);
+        }
+        if (src1_asq[id] > 0) {
+            ggml_cuda_pool_free(src1_ddq[id], src1_asq[id]);
+        }
+        if (dst_as[id] > 0) {
+            ggml_cuda_pool_free(dst_dd[id], dst_as[id]);
+        }
+    }
+
     // main device waits for all other devices to be finished
     if (split && g_device_count > 1) {
         int64_t is_max = (ne11 + MUL_MAT_SRC1_COL_STRIDE - 1) / MUL_MAT_SRC1_COL_STRIDE;
@@ -7121,21 +7113,6 @@ static void ggml_cuda_op_mul_mat(
         CUDA_CHECK(ggml_cuda_set_device(g_main_device));
         CUDA_CHECK(cudaDeviceSynchronize());
     }
-
-    for (int64_t id = 0; id < g_device_count; ++id) {
-        if (src0_as[id] > 0) {
-            ggml_cuda_pool_free_async(src0_dd[id], src0_as[id], id, g_cudaStreams[id][0]);
-        }
-        if (src1_asf[id] > 0) {
-            ggml_cuda_pool_free_async(src1_ddf[id], src1_asf[id], id, g_cudaStreams[id][0]);
-        }
-        if (src1_asq[id] > 0) {
-            ggml_cuda_pool_free_async(src1_ddq[id], src1_asq[id], id, g_cudaStreams[id][0]);
-        }
-        if (dst_as[id] > 0) {
-            ggml_cuda_pool_free_async(dst_dd[id], dst_as[id], id, g_cudaStreams[id][0]);
-        }
-    }
 }

 static void ggml_cuda_repeat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -7322,11 +7299,11 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
     GGML_ASSERT(to_fp16_cuda != nullptr);

     size_t src1_as = 0;
-    half * src1_as_f16 = (half *) ggml_cuda_pool_malloc_async(ne1 * sizeof(half), &src1_as, id, main_stream);
+    half * src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne1 * sizeof(half), &src1_as);
     to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream);

     size_t dst_as = 0;
-    half * dst_f16 = (half *) ggml_cuda_pool_malloc_async(ne * sizeof(half), &dst_as, id, main_stream);
+    half * dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as);

     GGML_ASSERT(ne12 % ne02 == 0);
     GGML_ASSERT(ne13 % ne03 == 0);
@@ -7380,8 +7357,8 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
         size_t ptrs_src_s = 0;
         size_t ptrs_dst_s = 0;

-        ptrs_src = (const void **) ggml_cuda_pool_malloc_async(2*ne23*sizeof(void *), &ptrs_src_s, id, main_stream);
-        ptrs_dst = (      void **) ggml_cuda_pool_malloc_async(1*ne23*sizeof(void *), &ptrs_dst_s, id, main_stream);
+        ptrs_src = (const void **) ggml_cuda_pool_malloc(2*ne23*sizeof(void *), &ptrs_src_s);
+        ptrs_dst = (      void **) ggml_cuda_pool_malloc(1*ne23*sizeof(void *), &ptrs_dst_s);

         dim3 block_dims(ne13, ne12);
         k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
@@ -7394,6 +7371,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
                 dst->nb[2], dst->nb[3],
                 r2, r3);
         CUDA_CHECK(cudaGetLastError());
+
         CUBLAS_CHECK(
         cublasGemmBatchedEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
                 ne01, ne11, ne10,
@@ -7405,22 +7383,19 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
                 CUBLAS_GEMM_DEFAULT_TENSOR_OP));

         if (ptrs_src_s != 0) {
-            ggml_cuda_pool_free_async(ptrs_src, ptrs_src_s, id, main_stream);
+            ggml_cuda_pool_free(ptrs_src, ptrs_src_s);
         }
         if (ptrs_dst_s != 0) {
-            ggml_cuda_pool_free_async(ptrs_dst, ptrs_dst_s, id, main_stream);
+            ggml_cuda_pool_free(ptrs_dst, ptrs_dst_s);
         }
     }
 #endif

     const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
     to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream);
-    if (src1_as != 0) {
-        ggml_cuda_pool_free_async(src1_as_f16, src1_as, id, main_stream);
-    }
-    if (dst_as != 0) {
-        ggml_cuda_pool_free_async(dst_f16, dst_as, id, main_stream);
-    }
+
+    ggml_cuda_pool_free(src1_as_f16, src1_as);
+    ggml_cuda_pool_free(dst_f16, dst_as);
 }

 static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
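For context: the removed ggml_cuda_pool_malloc_async / ggml_cuda_pool_free_async helpers wrapped CUDA's stream-ordered allocator (cudaDeviceGetMemPool, cudaMallocFromPoolAsync, cudaFreeAsync), falling back to the synchronous pool when no device memory pool was available; the hunks above return every call site to the synchronous ggml_cuda_pool_malloc / ggml_cuda_pool_free pair. The standalone sketch below only illustrates that underlying CUDA API and is not part of this commit; it assumes CUDA 11.2+ on a device that supports memory pools and uses an illustrative CHECK macro rather than the project's CUDA_CHECK.

// mem_pool_sketch.cu -- minimal stream-ordered allocation demo (illustrative only)
#include <cuda_runtime.h>
#include <cstdint>
#include <cstdio>
#include <cstdlib>

#define CHECK(call) \
    do { \
        cudaError_t err_ = (call); \
        if (err_ != cudaSuccess) { \
            fprintf(stderr, "CUDA error %s at %s:%d\n", cudaGetErrorString(err_), __FILE__, __LINE__); \
            exit(1); \
        } \
    } while (0)

int main() {
    int device = 0;
    CHECK(cudaSetDevice(device));

    // Grab the device's default memory pool and keep freed blocks cached in the
    // pool instead of returning them to the driver -- the same release-threshold
    // attribute the removed init code configured.
    cudaMemPool_t pool;
    CHECK(cudaDeviceGetMemPool(&pool, device));
    uint64_t threshold = UINT64_MAX;
    CHECK(cudaMemPoolSetAttribute(pool, cudaMemPoolAttrReleaseThreshold, &threshold));

    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

    // Stream-ordered allocation and deallocation, the pattern the removed
    // ggml_cuda_pool_malloc_async / ggml_cuda_pool_free_async helpers relied on.
    void * ptr = nullptr;
    CHECK(cudaMallocFromPoolAsync(&ptr, 1 << 20, pool, stream));
    // ... kernels using `ptr` would be launched on `stream` here ...
    CHECK(cudaFreeAsync(ptr, stream));

    CHECK(cudaStreamSynchronize(stream));
    CHECK(cudaStreamDestroy(stream));
    return 0;
}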