@@ -6647,7 +6647,6 @@ static void ggml_cuda_pool_free_leg(void * ptr, size_t size) {
6647
6647
6648
6648
#if !defined(GGML_USE_HIPBLAS)
6649
6649
// pool with virtual memory
6650
- static std::vector<CUmemGenericAllocationHandle> g_cuda_pool_handles[GGML_CUDA_MAX_DEVICES];
6651
6650
static CUdeviceptr g_cuda_pool_addr[GGML_CUDA_MAX_DEVICES] = {0 };
6652
6651
static size_t g_cuda_pool_used[GGML_CUDA_MAX_DEVICES] = {0 };
6653
6652
static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 36 ; // 64 GB
@@ -6687,6 +6686,9 @@ static void * ggml_cuda_pool_malloc_vmm(size_t size, size_t * actual_size) {
6687
6686
// map at the end of the pool
6688
6687
CU_CHECK (cuMemMap (g_cuda_pool_addr[id] + g_cuda_pool_size[id], reserve_size, 0 , handle, 0 ));
6689
6688
6689
+ // the memory allocation handle is no longer needed after mapping
6690
+ CU_CHECK (cuMemRelease (handle));
6691
+
6690
6692
// set access
6691
6693
CUmemAccessDesc access = {};
6692
6694
access.location .type = CU_MEM_LOCATION_TYPE_DEVICE;
@@ -6695,7 +6697,6 @@ static void * ggml_cuda_pool_malloc_vmm(size_t size, size_t * actual_size) {
6695
6697
CU_CHECK (cuMemSetAccess (g_cuda_pool_addr[id] + g_cuda_pool_size[id], reserve_size, &access, 1 ));
6696
6698
6697
6699
// add to the pool
6698
- g_cuda_pool_handles[id].push_back (handle);
6699
6700
g_cuda_pool_size[id] += reserve_size;
6700
6701
6701
6702
// printf("cuda pool[%d]: size increased to %llu MB (reserved %llu MB)\n",
@@ -8193,13 +8194,13 @@ static void ggml_cuda_op_mul_mat(
8193
8194
if (id != g_main_device) {
8194
8195
if (convert_src1_to_q8_1) {
8195
8196
char * src1_ddq_i_source = src1_ddq[g_main_device] + src1_ddq_i_offset;
8196
- CUDA_CHECK (cudaMemcpyAsync (src1_ddq_i, src1_ddq_i_source, src1_ncols*src1_padded_col_size*q8_1_ts/q8_1_bs ,
8197
- cudaMemcpyDeviceToDevice , stream));
8197
+ CUDA_CHECK (cudaMemcpyPeerAsync (src1_ddq_i, id, src1_ddq_i_source, g_main_device ,
8198
+ src1_ncols*src1_padded_col_size*q8_1_ts/q8_1_bs , stream));
8198
8199
} else {
8199
8200
float * src1_ddf_i_source = (float *) src1_extra->data_device [g_main_device];
8200
8201
src1_ddf_i_source += (i0*ne11 + src1_col_0) * ne10;
8201
- CUDA_CHECK (cudaMemcpyAsync (src1_ddf_i, src1_ddf_i_source, src1_ncols*ne10* sizeof ( float ) ,
8202
- cudaMemcpyDeviceToDevice , stream));
8202
+ CUDA_CHECK (cudaMemcpyPeerAsync (src1_ddf_i, id, src1_ddf_i_source, g_main_device ,
8203
+ src1_ncols*ne10* sizeof ( float ) , stream));
8203
8204
}
8204
8205
}
8205
8206
} else if (src1->backend == GGML_BACKEND_CPU || (src1_on_device && !src1_is_contiguous)) {
0 commit comments