@@ -503,6 +503,7 @@ static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
 static void * g_scratch_buffer = nullptr;
 static size_t g_scratch_size = 0; // disabled by default
 static size_t g_scratch_offset = 0;
+static bool g_cudaMutliGpuMemPoolSupported = true;
 
 static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
 
@@ -5813,7 +5814,7 @@ static void ggml_cuda_pool_free(void * ptr, size_t size) {
 
 
 static void ggml_cuda_pool_free_async(void * ptr, size_t actual_size, int id, cudaStream_t stream) {
-    if (g_cudaMemPools[id] == nullptr) {
+    if (g_cudaMemPools[id] == nullptr || !g_cudaMutliGpuMemPoolSupported) {
         return ggml_cuda_pool_free(ptr, actual_size);
     }
     CUDA_CHECK(cudaFreeAsync(ptr, stream));
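
Note: with this change, `ggml_cuda_pool_free_async` falls back to the blocking pool free whenever the device exposes no CUDA memory pool or multi-GPU pool support was disabled at init. For readers unfamiliar with the stream-ordered allocator, here is a minimal standalone sketch of the same pattern using only the raw CUDA runtime API (these are not the ggml helpers, and error checking is omitted):

```cpp
#include <cuda_runtime.h>

// Sketch only (not the ggml helpers): allocate/free a scratch buffer, using the
// stream-ordered allocator (CUDA 11.2+) when available and blocking calls otherwise.
static void * scratch_alloc(size_t size, bool pool_supported, cudaStream_t stream) {
    void * ptr = nullptr;
    if (pool_supported) {
        cudaMallocAsync(&ptr, size, stream); // served from the device's default memory pool
    } else {
        cudaMalloc(&ptr, size);              // blocking fallback
    }
    return ptr;
}

static void scratch_free(void * ptr, bool pool_supported, cudaStream_t stream) {
    if (pool_supported) {
        cudaFreeAsync(ptr, stream);          // returned to the pool once prior work on `stream` completes
    } else {
        cudaFree(ptr);
    }
}
```

cudaFreeAsync only defers the release until earlier work on the stream has finished; it does not free memory that queued kernels are still using.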
@@ -5896,6 +5897,49 @@ void ggml_init_cublas() {
             g_compute_capabilities[id] = 100*prop.major + 10*prop.minor;
 #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
         }
+
+#if defined(CUDA_USE_MEMORY_POOL)
+        if (g_device_count > 1) {
+            // give access to devices memory pools
+            if (g_cudaMemPools[g_main_device] != nullptr) {
+                cudaMemPool_t main_device_pool;
+                cudaMemAccessDesc desc_main_device = {};
+                desc_main_device.location.type = cudaMemLocationTypeDevice;
+                desc_main_device.location.id = g_main_device;
+                desc_main_device.flags = cudaMemAccessFlagsProtReadWrite;
+                CUDA_CHECK(cudaDeviceGetDefaultMemPool(&main_device_pool, g_main_device));
+                for (int id = 0; id < g_device_count; ++id) {
+                    if (id == g_main_device) continue;
+
+                    if (g_cudaMemPools[id] == nullptr) {
+                        fprintf(stderr,
+                                "Warning: Device %d doesn't support CUDA memory pool, skipping pool access config\n",
+                                id);
+                    }
+
+                    cudaMemAccessDesc desc_device = {};
+                    desc_device.location.type = cudaMemLocationTypeDevice;
+                    desc_device.location.id = id;
+                    desc_device.flags = cudaMemAccessFlagsProtReadWrite;
+                    cudaError_t err = cudaMemPoolSetAccess(main_device_pool, &desc_device, 1 /* numDescs */);
+                    if (err != cudaSuccess) {
+                        fprintf(stderr, "Can't give access for main device memory pool to device %d\n", id);
+                    }
+                    cudaMemPool_t mempool;
+                    CUDA_CHECK(cudaDeviceGetDefaultMemPool(&mempool, id));
+                    err = cudaMemPoolSetAccess(mempool, &desc_main_device, 1 /* numDescs */);
+                    if (err != cudaSuccess) {
+                        fprintf(stderr, "Can't give access for device %d memory pool to main device\n", id);
+                    }
+                }
+            } else {
+                fprintf(stderr,
+                        "WARNING: Your main GPU device doesn't support CUDA memory pools. Using custom memory pool implementation.\n");
+                g_cudaMutliGpuMemPoolSupported = false;
+            }
+        }
+#endif
+
         for (int id = 0; id < g_device_count; ++id) {
             g_tensor_split[id] /= total_vram;
         }
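
The block added above grants bidirectional access between the main device's default memory pool and every other device's pool, so buffers from the stream-ordered allocator can be touched across GPUs when work is split between devices. A condensed sketch of the same CUDA calls for a single main/peer pair (the function name and two-device restriction are illustrative, not the actual helper):

```cpp
#include <cstdio>
#include <cuda_runtime.h>

// Sketch: allow `peer_dev` to read/write allocations from `main_dev`'s default
// memory pool and vice versa. Assumes both devices report
// cudaDevAttrMemoryPoolsSupported; device ids are placeholders.
static void enable_pool_peer_access(int main_dev, int peer_dev) {
    cudaMemPool_t main_pool, peer_pool;
    cudaDeviceGetDefaultMemPool(&main_pool, main_dev);
    cudaDeviceGetDefaultMemPool(&peer_pool, peer_dev);

    cudaMemAccessDesc main_desc = {};
    main_desc.location.type = cudaMemLocationTypeDevice;
    main_desc.location.id   = main_dev;
    main_desc.flags         = cudaMemAccessFlagsProtReadWrite;

    cudaMemAccessDesc peer_desc = {};
    peer_desc.location.type = cudaMemLocationTypeDevice;
    peer_desc.location.id   = peer_dev;
    peer_desc.flags         = cudaMemAccessFlagsProtReadWrite;

    // Open each pool to the other device; failures are reported, not fatal.
    if (cudaMemPoolSetAccess(main_pool, &peer_desc, 1) != cudaSuccess) {
        fprintf(stderr, "failed to open main pool to device %d\n", peer_dev);
    }
    if (cudaMemPoolSetAccess(peer_pool, &main_desc, 1) != cudaSuccess) {
        fprintf(stderr, "failed to open device %d pool to main device\n", peer_dev);
    }
}
```

Regular cudaDeviceEnablePeerAccess does not cover stream-ordered allocations, which is why the pools need their own access descriptors here.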
@@ -6410,7 +6454,7 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
         src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
 
     if (src1_convert_f16) {
-        src1_dfloat = (half *) ggml_cuda_pool_malloc(ne00*sizeof(half), &ash);
+        src1_dfloat = (half *) ggml_cuda_pool_malloc_async(ne00*sizeof(half), &ash, g_main_device, stream);
         ggml_cpy_f32_f16_cuda((const char *) src1_ddf_i, (char *) src1_dfloat, ne00,
                               ne00, 1, sizeof(float), 0, 0,
                               ne00, 1, sizeof(half), 0, 0, stream);
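
`ggml_cuda_pool_malloc_async` (introduced elsewhere in this change) takes the owning device and the stream in addition to the size. Its actual implementation is not shown in this diff; a rough sketch of an allocator with that shape, under the assumption that it draws from the device's default pool, might look like:

```cpp
#include <cuda_runtime.h>

// Rough sketch of an allocator with the same shape as
// ggml_cuda_pool_malloc_async(size, &actual_size, device, stream); the real
// helper is defined elsewhere in ggml-cuda.cu and may differ.
static void * pool_malloc_async_sketch(size_t size, size_t * actual_size, int device, cudaStream_t stream) {
    cudaMemPool_t pool;
    if (cudaDeviceGetDefaultMemPool(&pool, device) != cudaSuccess) {
        return nullptr; // no memory pool support on this device
    }
    void * ptr = nullptr;
    if (cudaMallocFromPoolAsync(&ptr, size, pool, stream) != cudaSuccess) {
        return nullptr;
    }
    *actual_size = size; // the caller hands this back to the async free later
    return ptr;
}
```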
@@ -6811,22 +6855,22 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
     if (src0_on_device) {
         src0_ddf = (float *) src0_extra->data_device[g_main_device];
     } else {
-        src0_ddf = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_asf);
+        src0_ddf = (float *) ggml_cuda_pool_malloc_async(ggml_nbytes(src0), &src0_asf, g_main_device, main_stream);
         CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf, src0, 0, 0, 0, nrows0, main_stream));
     }
 
     if (use_src1 && !src1_stays_on_host) {
         if (src1_on_device) {
             src1_ddf = (float *) src1_extra->data_device[g_main_device];
         } else {
-            src1_ddf = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src1), &src1_asf);
+            src1_ddf = (float *) ggml_cuda_pool_malloc_async(ggml_nbytes(src1), &src1_asf, g_main_device, main_stream);
             CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf, src1, 0, 0, 0, nrows1, main_stream));
         }
     }
     if (dst_on_device) {
         dst_ddf = (float *) dst_extra->data_device[g_main_device];
     } else {
-        dst_ddf = (float *) ggml_cuda_pool_malloc(ggml_nbytes(dst), &dst_asf);
+        dst_ddf = (float *) ggml_cuda_pool_malloc_async(ggml_nbytes(dst), &dst_asf, g_main_device, main_stream);
     }
 
     // do the computation
@@ -6838,18 +6882,18 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
         CUDA_CHECK(cudaMemcpyAsync(dst->data, dst_ddf, ggml_nbytes(dst), cudaMemcpyDeviceToHost, main_stream));
     }
 
+    if (dst->backend == GGML_BACKEND_CPU) {
+        CUDA_CHECK(cudaDeviceSynchronize());
+    }
+
     if (src0_asf > 0) {
-        ggml_cuda_pool_free(src0_ddf, src0_asf);
+        ggml_cuda_pool_free_async(src0_ddf, src0_asf, g_main_device, main_stream);
     }
     if (src1_asf > 0) {
-        ggml_cuda_pool_free(src1_ddf, src1_asf);
+        ggml_cuda_pool_free_async(src1_ddf, src1_asf, g_main_device, main_stream);
     }
     if (dst_asf > 0) {
-        ggml_cuda_pool_free(dst_ddf, dst_asf);
-    }
-
-    if (dst->backend == GGML_BACKEND_CPU) {
-        CUDA_CHECK(cudaDeviceSynchronize());
+        ggml_cuda_pool_free_async(dst_ddf, dst_asf, g_main_device, main_stream);
     }
 }
 
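
Compared to the removed lines, the cudaDeviceSynchronize for CPU-backed outputs now runs before the scratch buffers are released, and the releases themselves are stream-ordered rather than blocking. A toy sketch of that ordering (all names are placeholders):

```cpp
#include <cuda_runtime.h>

// Toy sketch of the cleanup ordering above: when the destination tensor lives on
// the host, wait for the device-to-host copy before returning, then release the
// device scratch buffer in stream order. All names are placeholders.
static void flush_and_release(void * host_dst, void * dev_buf, size_t nbytes, cudaStream_t stream) {
    cudaMemcpyAsync(host_dst, dev_buf, nbytes, cudaMemcpyDeviceToHost, stream);
    cudaDeviceSynchronize();        // result must be visible on the CPU before the caller reads it
    cudaFreeAsync(dev_buf, stream); // enqueued on the stream; does not block the host
}
```

The synchronize guarantees the device-to-host copy has landed before the caller touches dst->data; the async frees are merely enqueued behind it.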