Skip to content

Commit 32dc09a

Browse files
committed
cuda : fix vmm pool with multi GPU
1 parent 753be37 commit 32dc09a

File tree

1 file changed

+7
-6
lines changed

1 file changed

+7
-6
lines changed

ggml-cuda.cu

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6647,7 +6647,6 @@ static void ggml_cuda_pool_free_leg(void * ptr, size_t size) {
66476647

66486648
#if !defined(GGML_USE_HIPBLAS)
66496649
// pool with virtual memory
6650-
static std::vector<CUmemGenericAllocationHandle> g_cuda_pool_handles[GGML_CUDA_MAX_DEVICES];
66516650
static CUdeviceptr g_cuda_pool_addr[GGML_CUDA_MAX_DEVICES] = {0};
66526651
static size_t g_cuda_pool_used[GGML_CUDA_MAX_DEVICES] = {0};
66536652
static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 36; // 64 GB
@@ -6687,6 +6686,9 @@ static void * ggml_cuda_pool_malloc_vmm(size_t size, size_t * actual_size) {
66876686
// map at the end of the pool
66886687
CU_CHECK(cuMemMap(g_cuda_pool_addr[id] + g_cuda_pool_size[id], reserve_size, 0, handle, 0));
66896688

6689+
// the memory allocation handle is no longer needed after mapping
6690+
CU_CHECK(cuMemRelease(handle));
6691+
66906692
// set access
66916693
CUmemAccessDesc access = {};
66926694
access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
@@ -6695,7 +6697,6 @@ static void * ggml_cuda_pool_malloc_vmm(size_t size, size_t * actual_size) {
66956697
CU_CHECK(cuMemSetAccess(g_cuda_pool_addr[id] + g_cuda_pool_size[id], reserve_size, &access, 1));
66966698

66976699
// add to the pool
6698-
g_cuda_pool_handles[id].push_back(handle);
66996700
g_cuda_pool_size[id] += reserve_size;
67006701

67016702
//printf("cuda pool[%d]: size increased to %llu MB (reserved %llu MB)\n",
@@ -8193,13 +8194,13 @@ static void ggml_cuda_op_mul_mat(
81938194
if (id != g_main_device) {
81948195
if (convert_src1_to_q8_1) {
81958196
char * src1_ddq_i_source = src1_ddq[g_main_device] + src1_ddq_i_offset;
8196-
CUDA_CHECK(cudaMemcpyAsync(src1_ddq_i, src1_ddq_i_source, src1_ncols*src1_padded_col_size*q8_1_ts/q8_1_bs,
8197-
cudaMemcpyDeviceToDevice, stream));
8197+
CUDA_CHECK(cudaMemcpyPeerAsync(src1_ddq_i, id, src1_ddq_i_source, g_main_device,
8198+
src1_ncols*src1_padded_col_size*q8_1_ts/q8_1_bs, stream));
81988199
} else {
81998200
float * src1_ddf_i_source = (float *) src1_extra->data_device[g_main_device];
82008201
src1_ddf_i_source += (i0*ne11 + src1_col_0) * ne10;
8201-
CUDA_CHECK(cudaMemcpyAsync(src1_ddf_i, src1_ddf_i_source, src1_ncols*ne10*sizeof(float),
8202-
cudaMemcpyDeviceToDevice, stream));
8202+
CUDA_CHECK(cudaMemcpyPeerAsync(src1_ddf_i, id, src1_ddf_i_source, g_main_device,
8203+
src1_ncols*ne10*sizeof(float), stream));
82038204
}
82048205
}
82058206
} else if (src1->backend == GGML_BACKEND_CPU || (src1_on_device && !src1_is_contiguous)) {

0 commit comments

Comments (0)