
Commit 5f0db95

hip : Add hipGraph and VMM support to ROCM (#11362)

* Add hipGraph support
* Enable VMM on rocm

1 parent: c5d9eff

File tree: 5 files changed, +92 −20 lines

- ggml/CMakeLists.txt
- ggml/src/ggml-cuda/common.cuh
- ggml/src/ggml-cuda/ggml-cuda.cu
- ggml/src/ggml-cuda/vendors/hip.h
- ggml/src/ggml-hip/CMakeLists.txt

ggml/CMakeLists.txt (1 addition, 0 deletions)

@@ -154,6 +154,7 @@ option(GGML_CUDA_FA_ALL_QUANTS "ggml: compile all quants for FlashA
 option(GGML_CUDA_GRAPHS "ggml: use CUDA graphs (llama.cpp only)" ${GGML_CUDA_GRAPHS_DEFAULT})
 
 option(GGML_HIP "ggml: use HIP" OFF)
+option(GGML_HIP_GRAPHS "ggml: use HIP graph, experimental, slow" OFF)
 option(GGML_HIP_UMA "ggml: use HIP unified memory architecture" OFF)
 option(GGML_VULKAN "ggml: use Vulkan" OFF)
 option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF)
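Note: GGML_HIP_GRAPHS is off by default and explicitly flagged experimental/slow. It only matters for HIP builds, where ggml/src/ggml-hip/CMakeLists.txt (last diff below) forwards it as a compile definition, so a typical configure adds -DGGML_HIP=ON -DGGML_HIP_GRAPHS=ON on top of the usual ROCm toolchain settings.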

ggml/src/ggml-cuda/common.cuh (1 addition, 1 deletion)

@@ -588,7 +588,7 @@ struct ggml_tensor_extra_gpu {
 };
 
 
-#if (CUDART_VERSION >= 12000) && defined(GGML_CUDA_USE_GRAPHS)
+#if ((CUDART_VERSION >= 12000) && defined(GGML_CUDA_USE_GRAPHS)) || defined(GGML_HIP_GRAPHS)
 #define USE_CUDA_GRAPH
 #endif
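This makes the existing USE_CUDA_GRAPH gate fire for HIP builds compiled with GGML_HIP_GRAPHS, so the graph code paths in ggml-cuda.cu are shared rather than duplicated. A minimal sketch of the consuming pattern, assuming nothing beyond the macro itself:

#ifdef USE_CUDA_GRAPH
    // capture kernels into a graph once, then replay it (CUDA 12+, or HIP with GGML_HIP_GRAPHS)
#else
    // fall back to launching each kernel on the stream directly
#endif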

ggml/src/ggml-cuda/ggml-cuda.cu (39 additions, 19 deletions)

@@ -62,7 +62,7 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
 [[noreturn]]
 void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg) {
     int id = -1; // in case cudaGetDevice fails
-    cudaGetDevice(&id);
+    (void)cudaGetDevice(&id);
 
     GGML_LOG_ERROR(GGML_CUDA_NAME " error: %s\n", msg);
     GGML_LOG_ERROR("  current device: %d, in function %s at %s:%d\n", id, func, file, line);
@@ -152,7 +152,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
     for (int id = 0; id < info.device_count; ++id) {
         int device_vmm = 0;
 
-#if !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
+#if !defined(GGML_CUDA_NO_VMM)
         CUdevice device;
         CU_CHECK(cuDeviceGet(&device, id));
         CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device));
@@ -164,7 +164,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
             alloc_prop.location.id = id;
             CU_CHECK(cuMemGetAllocationGranularity(&info.devices[id].vmm_granularity, &alloc_prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
         }
-#endif // !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
+#endif // !defined(GGML_CUDA_NO_VMM)
         info.devices[id].vmm = !!device_vmm;
 
         cudaDeviceProp prop;
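With GGML_USE_HIP no longer excluded, device initialization now queries VMM support on ROCm too, through the CU_* names that vendors/hip.h remaps onto HIP. A standalone probe of the same attribute, assuming only a ROCm toolchain (build with hipcc):

#include <hip/hip_runtime.h>
#include <cstdio>

int main() {
    int n_devices = 0;
    if (hipGetDeviceCount(&n_devices) != hipSuccess) {
        return 1;
    }
    for (int id = 0; id < n_devices; ++id) {
        int vmm = 0;
        // the attribute ggml reaches via CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED
        (void)hipDeviceGetAttribute(&vmm, hipDeviceAttributeVirtualMemoryManagementSupported, id);
        printf("device %d: VMM %s\n", id, vmm ? "supported" : "not supported");
    }
    return 0;
}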
@@ -300,7 +300,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
 };
 
 // pool with virtual memory
-#if !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
+#if !defined(GGML_CUDA_NO_VMM)
 struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
     static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB
 
@@ -309,6 +309,9 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
     size_t pool_used = 0;
     size_t pool_size = 0;
     size_t granularity;
+#if defined(GGML_USE_HIP)
+    std::vector<std::pair<CUdeviceptr, size_t>> mappings;
+#endif
 
     explicit ggml_cuda_pool_vmm(int device) :
         device(device),
@@ -317,7 +320,14 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
 
     ~ggml_cuda_pool_vmm() {
         if (pool_addr != 0) {
+#if defined(GGML_USE_HIP)
+            // Workaround for https://github.com/ROCm/ROCR-Runtime/issues/285
+            for (std::pair<CUdeviceptr, size_t> & mapping : mappings) {
+                CU_CHECK(cuMemUnmap(mapping.first, mapping.second));
+            }
+#else
             CU_CHECK(cuMemUnmap(pool_addr, pool_size));
+#endif
             CU_CHECK(cuMemAddressFree(pool_addr, CUDA_POOL_VMM_MAX_SIZE));
         }
     }
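Why the extra bookkeeping: per the workaround comment, unmapping the whole reserved range with a single cuMemUnmap(pool_addr, pool_size) call runs into ROCR-Runtime issue #285 on ROCm. The HIP build therefore records every (address, size) pair it maps and the destructor unmaps them one at a time, while the CUDA build keeps the single whole-pool unmap.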
@@ -350,7 +360,11 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
         }
 
         // map at the end of the pool
-        CU_CHECK(cuMemMap(pool_addr + pool_size, reserve_size, 0, handle, 0));
+        CUdeviceptr start_ptr = (CUdeviceptr)((char *)(pool_addr) + pool_size);
+        CU_CHECK(cuMemMap(start_ptr, reserve_size, 0, handle, 0));
+#if defined(GGML_USE_HIP)
+        mappings.push_back({start_ptr, reserve_size});
+#endif
 
         // the memory allocation handle is no longer needed after mapping
         CU_CHECK(cuMemRelease(handle));
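The casts exist because the pointer arithmetic changes meaning across builds: on CUDA, CUdeviceptr is an integer handle, so pool_addr + pool_size is plain integer math, but under the hip.h mappings CUdeviceptr becomes hipDeviceptr_t, a void * typedef on which arithmetic is not valid standard C++. Routing the offset through char * works for both. A compilable sketch of the idea, using stand-in handle types rather than the real headers:

#include <cstddef>

using cuda_devptr = unsigned long long; // CUDA-style integer handle (stand-in)
using hip_devptr  = void *;             // HIP-style pointer handle (stand-in)

template <typename DevPtr>
DevPtr offset_bytes(DevPtr base, std::size_t bytes) {
    // char * arithmetic gives a byte offset whether DevPtr is an integer or a pointer
    return (DevPtr)((char *)(base) + bytes);
}

// both instantiations compile, which is the point of the cast in the diff:
//   offset_bytes<cuda_devptr>(0x1000, 256);
//   offset_bytes<hip_devptr>(nullptr, 256);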
@@ -360,7 +374,7 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
         access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
         access.location.id = device;
         access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
-        CU_CHECK(cuMemSetAccess(pool_addr + pool_size, reserve_size, &access, 1));
+        CU_CHECK(cuMemSetAccess((CUdeviceptr)((char *)(pool_addr) + pool_size), reserve_size, &access, 1));
 
         // add to the pool
         pool_size += reserve_size;
@@ -372,7 +386,7 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
 
         GGML_ASSERT(pool_addr != 0);
 
-        void * ptr = (void *) (pool_addr + pool_used);
+        void * ptr = (void *) ((CUdeviceptr)((char *)(pool_addr) + pool_used));
         *actual_size = size;
         pool_used += size;
 
@@ -391,17 +405,17 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
         pool_used -= size;
 
         // all deallocations must be in reverse order of the allocations
-        GGML_ASSERT(ptr == (void *) (pool_addr + pool_used));
+        GGML_ASSERT(ptr == (void *) ((char *)(pool_addr) + pool_used));
     }
 };
-#endif // !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
+#endif // !defined(GGML_CUDA_NO_VMM)
 
 std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(int device) {
-#if !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
+#if !defined(GGML_CUDA_NO_VMM)
     if (ggml_cuda_info().devices[device].vmm) {
         return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_vmm(device));
     }
-#endif // !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
+#endif // !defined(GGML_CUDA_NO_VMM)
     return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_leg(device));
 }

@@ -547,7 +561,7 @@ static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_bac
     cudaError_t err = ggml_cuda_device_malloc(&dev_ptr, size, buft_ctx->device);
     if (err != cudaSuccess) {
         // clear the error
-        cudaGetLastError();
+        (void)cudaGetLastError();
         GGML_LOG_ERROR("%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size / 1024.0 / 1024.0, buft_ctx->device, cudaGetErrorString(err));
         return nullptr;
     }
@@ -962,7 +976,7 @@ static void * ggml_cuda_host_malloc(size_t size) {
     cudaError_t err = cudaMallocHost((void **) &ptr, size);
     if (err != cudaSuccess) {
         // clear the error
-        cudaGetLastError();
+        (void)cudaGetLastError();
         GGML_LOG_DEBUG("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
                        size / 1024.0 / 1024.0, cudaGetErrorString(err));
         return nullptr;
@@ -1209,15 +1223,15 @@ static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) {
                 CUDA_CHECK(err);
             } else {
                 // reset the error
-                cudaGetLastError();
+                (void)cudaGetLastError();
             }
         } else {
             cudaError_t err = cudaDeviceDisablePeerAccess(id_other);
             if (err != cudaErrorPeerAccessNotEnabled) {
                 CUDA_CHECK(err);
             } else {
                 // reset the error
-                cudaGetLastError();
+                (void)cudaGetLastError();
             }
         }
     }
@@ -2452,7 +2466,7 @@ static void maintain_cuda_graph(ggml_backend_cuda_context * cuda_ctx, std::vecto
             if (stat == cudaErrorInvalidDeviceFunction) {
                 // Fails due to incorrect handling by CUDA runtime of CUDA BLAS node.
                 // We don't need to update blas nodes, so clear error and move on.
-                cudaGetLastError();
+                (void)cudaGetLastError();
             } else {
                 GGML_ASSERT(stat == cudaSuccess);
             }
@@ -2507,14 +2521,20 @@ static bool is_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx,
 static void update_cuda_graph_executable(ggml_backend_cuda_context * cuda_ctx) {
 
     cudaGraphExecUpdateResultInfo result_info;
+#ifdef __HIP_PLATFORM_AMD__
+    hipGraphNode_t errorNode;
+    hipError_t stat = hipGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &errorNode, &result_info);
+#else
     cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &result_info);
+#endif
     if (stat == cudaErrorGraphExecUpdateFailure) {
 #ifndef NDEBUG
         GGML_LOG_DEBUG("%s: CUDA graph update failed\n", __func__);
 #endif
+
         // The pre-existing graph exec cannot be updated due to violated constraints
         // so instead clear error and re-instantiate
-        cudaGetLastError();
+        (void)cudaGetLastError();
         CUDA_CHECK(cudaGraphExecDestroy(cuda_ctx->cuda_graph->instance));
         cuda_ctx->cuda_graph->instance = nullptr;
         CUDA_CHECK(cudaGraphInstantiate(&cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, NULL, NULL, 0));
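The #ifdef is needed because HIP kept the CUDA 11 shape of this API: hipGraphExecUpdate reports through an error-node out-parameter plus a hipGraphExecUpdateResult, whereas CUDA 12 folds both into cudaGraphExecUpdateResultInfo (hence the hip.h alias mapping the CUDA name onto hipGraphExecUpdateResult). A hypothetical wrapper that hides the difference, relying only on the aliases this commit adds:

static cudaError_t graph_exec_update(cudaGraphExec_t exec, cudaGraph_t graph,
                                     cudaGraphExecUpdateResultInfo * info) {
#ifdef __HIP_PLATFORM_AMD__
    hipGraphNode_t error_node = nullptr; // HIP keeps the CUDA 11 style out-parameter
    return hipGraphExecUpdate(exec, graph, &error_node, info);
#else
    return cudaGraphExecUpdate(exec, graph, info); // CUDA 12 result-info signature
#endif
}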
@@ -2742,7 +2762,7 @@ bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size) {
     cudaError_t err = cudaHostRegister(buffer, size, cudaHostRegisterPortable | cudaHostRegisterReadOnly);
     if (err != cudaSuccess) {
         // clear the error
-        cudaGetLastError();
+        (void)cudaGetLastError();
 
         GGML_LOG_DEBUG("%s: failed to register %.2f MiB of pinned memory: %s\n", __func__,
                        size / 1024.0 / 1024.0, cudaGetErrorString(err));
@@ -2762,7 +2782,7 @@ void ggml_backend_cuda_unregister_host_buffer(void * buffer) {
     cudaError_t err = cudaHostUnregister(buffer);
     if (err != cudaSuccess) {
         // clear the error
-        cudaGetLastError();
+        (void)cudaGetLastError();
     }
 }

ggml/src/ggml-cuda/vendors/hip.h (43 additions, 0 deletions)

@@ -19,6 +19,12 @@
 #define CUBLAS_TF32_TENSOR_OP_MATH 0
 #define CUDA_R_16F HIPBLAS_R_16F
 #define CUDA_R_32F HIPBLAS_R_32F
+#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED hipDeviceAttributeVirtualMemoryManagementSupported
+#define CU_MEM_ALLOC_GRANULARITY_RECOMMENDED hipMemAllocationGranularityRecommended
+#define CU_MEM_ALLOCATION_TYPE_PINNED hipMemAllocationTypePinned
+#define CU_MEM_LOCATION_TYPE_DEVICE hipMemLocationTypeDevice
+#define CU_MEM_ACCESS_FLAGS_PROT_READWRITE hipMemAccessFlagsProtReadWrite
+#define CU_CHECK(fn) {hipError_t err = fn; if(err != hipSuccess) { GGML_ABORT("HipVMM Failure: %s\n", hipGetErrorString(err)); }}
 #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
 #define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
 #define cublasCreate hipblasCreate
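One readability note on CU_CHECK: as a bare brace block it can interact badly with unbraced if/else at call sites. A sketch of the more defensive do/while(0) form, under the same assumptions about GGML_ABORT:

#define CU_CHECK(fn)                                                          \
    do {                                                                      \
        hipError_t err_ = (fn);                                               \
        if (err_ != hipSuccess) {                                             \
            GGML_ABORT("HipVMM Failure: %s\n", hipGetErrorString(err_));      \
        }                                                                     \
    } while (0)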
@@ -74,13 +80,50 @@
 #define cudaMemGetInfo hipMemGetInfo
 #define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
 #define cudaSetDevice hipSetDevice
+#define cuDeviceGet hipDeviceGet
+#define CUdevice hipDevice_t
+#define CUdeviceptr hipDeviceptr_t
+#define cuMemUnmap hipMemUnmap
+#define CUmemAccessDesc hipMemAccessDesc
+#define cuMemAddressFree hipMemAddressFree
+#define cuMemRelease hipMemRelease
+#define CUmemGenericAllocationHandle hipMemGenericAllocationHandle_t
+#define cuMemCreate hipMemCreate
+#define cuMemAddressReserve hipMemAddressReserve
+#define cuMemMap hipMemMap
+#define cuMemSetAccess hipMemSetAccess
+#define cuMemGetAllocationGranularity hipMemGetAllocationGranularity
+#define CUmemAllocationProp hipMemAllocationProp
+#define cuDeviceGetAttribute hipDeviceGetAttribute
 #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
 #define cudaStreamDestroy hipStreamDestroy
 #define cudaStreamFireAndForget hipStreamFireAndForget
 #define cudaStreamNonBlocking hipStreamNonBlocking
 #define cudaStreamPerThread hipStreamPerThread
 #define cudaStreamSynchronize hipStreamSynchronize
 #define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
+#define cudaGraphExec_t hipGraphExec_t
+#define cudaGraphNode_t hipGraphNode_t
+#define cudaKernelNodeParams hipKernelNodeParams
+#define cudaKernelNodeParams hipKernelNodeParams
+#define cudaGraphExecDestroy hipGraphExecDestroy
+#define cudaGraphLaunch hipGraphLaunch
+#define cudaErrorGraphExecUpdateFailure hipErrorGraphExecUpdateFailure
+#define cudaGraphExecUpdateResultInfo hipGraphExecUpdateResult
+#define cudaGraphNodeType hipGraphNodeType
+#define cudaGraphNodeTypeKernel hipGraphNodeTypeKernel
+#define cudaGraphInstantiate hipGraphInstantiate
+#define cudaStreamEndCapture hipStreamEndCapture
+#define cudaGraphDestroy hipGraphDestroy
+#define cudaGraphKernelNodeSetParams hipGraphKernelNodeSetParams
+#define cudaErrorInvalidDeviceFunction hipErrorInvalidDeviceFunction
+#define cudaGraphKernelNodeGetParams hipGraphKernelNodeGetParams
+#define cudaGraphNodeGetType hipGraphNodeGetType
+#define cudaGraphGetNodes hipGraphGetNodes
+#define cudaGraphExecUpdate hipGraphExecUpdate
+#define cudaStreamCaptureModeRelaxed hipStreamCaptureModeRelaxed
+#define cudaStreamBeginCapture hipStreamBeginCapture
+#define cudaGraph_t hipGraph_t
 #define cudaStream_t hipStream_t
 #define cudaSuccess hipSuccess
 #define __trap() do { abort(); __builtin_unreachable(); } while(0)
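Since these are ordinary preprocessor defines, the shared CUDA source compiles unchanged under hipcc: every cudaGraph* name used by the graph path resolves to its HIP counterpart at preprocessing time. For instance, a launch written against the CUDA API (illustrative line, assuming the usual ggml-cuda context object):

CUDA_CHECK(cudaGraphLaunch(cuda_ctx->cuda_graph->instance, cuda_ctx->stream()));

is what the HIP compiler sees as:

CUDA_CHECK(hipGraphLaunch(cuda_ctx->cuda_graph->instance, cuda_ctx->stream()));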

ggml/src/ggml-hip/CMakeLists.txt (8 additions, 0 deletions)

@@ -92,6 +92,14 @@ if (GGML_CUDA_NO_PEER_COPY)
     add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
 endif()
 
+if (GGML_HIP_GRAPHS)
+    add_compile_definitions(GGML_HIP_GRAPHS)
+endif()
+
+if (GGML_CUDA_NO_VMM)
+    add_compile_definitions(GGML_CUDA_NO_VMM)
+endif()
+
 if (CXX_IS_HIPCC)
     set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE CXX)
     target_link_libraries(ggml-hip PRIVATE hip::device)
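Net effect for HIP builds: VMM is now used by default wherever the device reports support, with -DGGML_CUDA_NO_VMM=ON as the opt-out back to the legacy pool, and -DGGML_HIP_GRAPHS=ON as the experimental opt-in to graph execution. An illustrative configure line (toolchain variables and the GPU target are assumptions to adapt for your system):

HIPCXX="$(hipconfig -l)/clang" cmake -S . -B build -DGGML_HIP=ON -DGGML_HIP_GRAPHS=ON -DAMDGPU_TARGETS=gfx1100 -DCMAKE_BUILD_TYPE=Release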
