@@ -62,7 +62,7 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
 [[noreturn]]
 void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg) {
     int id = -1; // in case cudaGetDevice fails
-    cudaGetDevice(&id);
+    (void) cudaGetDevice(&id);
 
     GGML_LOG_ERROR(GGML_CUDA_NAME " error: %s\n", msg);
     GGML_LOG_ERROR("  current device: %d, in function %s at %s:%d\n", id, func, file, line);
@@ -152,7 +152,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
     for (int id = 0; id < info.device_count; ++id) {
         int device_vmm = 0;
 
-#if !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
+#if !defined(GGML_CUDA_NO_VMM)
         CUdevice device;
         CU_CHECK(cuDeviceGet(&device, id));
         CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device));
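Dropping !defined(GGML_USE_HIP) from the guard lets HIP builds take the VMM pool path too. This relies on the backend's vendor header aliasing the CUDA driver VMM entry points onto their HIP equivalents; a sketch of the kind of mapping assumed (names modeled on HIP's VMM API, the exact define list in the tree may differ):

    // Sketch of the assumed CUDA -> HIP aliasing for the VMM path.
    #define CUdeviceptr      hipDeviceptr_t
    #define cuMemCreate      hipMemCreate
    #define cuMemMap         hipMemMap
    #define cuMemUnmap       hipMemUnmap
    #define cuMemSetAccess   hipMemSetAccess
    #define cuMemAddressFree hipMemAddressFree
    #define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED \
            hipDeviceAttributeVirtualMemoryManagementSupported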
@@ -164,7 +164,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
             alloc_prop.location.id = id;
             CU_CHECK(cuMemGetAllocationGranularity(&info.devices[id].vmm_granularity, &alloc_prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
         }
-#endif // !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
+#endif // !defined(GGML_CUDA_NO_VMM)
         info.devices[id].vmm = !!device_vmm;
 
         cudaDeviceProp prop;
@@ -300,7 +300,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
 };
 
 // pool with virtual memory
-#if !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
+#if !defined(GGML_CUDA_NO_VMM)
 struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
     static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB
 
@@ -309,6 +309,9 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
     size_t pool_used = 0;
     size_t pool_size = 0;
     size_t granularity;
+#if defined(GGML_USE_HIP)
+    std::vector<std::pair<CUdeviceptr, size_t>> mappings;
+#endif
 
     explicit ggml_cuda_pool_vmm(int device) :
         device(device),
@@ -317,7 +320,14 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
 
     ~ggml_cuda_pool_vmm() {
         if (pool_addr != 0) {
+#if defined(GGML_USE_HIP)
+            // Workaround for https://github.com/ROCm/ROCR-Runtime/issues/285
+            for (std::pair<CUdeviceptr, size_t> & mapping : mappings) {
+                CU_CHECK(cuMemUnmap(mapping.first, mapping.second));
+            }
+#else
             CU_CHECK(cuMemUnmap(pool_addr, pool_size));
+#endif
             CU_CHECK(cuMemAddressFree(pool_addr, CUDA_POOL_VMM_MAX_SIZE));
         }
     }
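The destructor now special-cases HIP because of the linked ROCR-Runtime issue: unmapping the pool's whole reserved range with a single cuMemUnmap call is not reliable there, so every extent created by cuMemMap is recorded and unmapped individually. A sketch of that record-and-unwind pattern (struct and member names are illustrative; assumes cuda.h, or the HIP aliases, and the file's CU_CHECK macro):

    #include <cstddef>
    #include <utility>
    #include <vector>

    struct vmm_extents {
        std::vector<std::pair<CUdeviceptr, size_t>> extents;

        void on_map(CUdeviceptr ptr, size_t size) {
            extents.push_back({ptr, size}); // remember each mapping for teardown
        }

        void unmap_all() {
            for (auto & e : extents) {
                CU_CHECK(cuMemUnmap(e.first, e.second)); // one extent at a time
            }
            extents.clear();
        }
    };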
@@ -350,7 +360,11 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
         }
 
         // map at the end of the pool
-        CU_CHECK(cuMemMap(pool_addr + pool_size, reserve_size, 0, handle, 0));
+        CUdeviceptr start_ptr = (CUdeviceptr)((char *)(pool_addr) + pool_size);
+        CU_CHECK(cuMemMap(start_ptr, reserve_size, 0, handle, 0));
+#if defined(GGML_USE_HIP)
+        mappings.push_back({start_ptr, reserve_size});
+#endif
 
         // the memory allocation handle is no longer needed after mapping
         CU_CHECK(cuMemRelease(handle));
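The detour through char * in this and the following hunks exists because CUdeviceptr is an unsigned integer type on CUDA but maps to hipDeviceptr_t, i.e. void *, on HIP, and arithmetic on void * is ill-formed C++. Casting to char *, offsetting in bytes, and casting back compiles identically under both toolchains; a sketch (helper name illustrative, assumes cuda.h or the HIP alias header is in scope):

    // Sketch: byte-offset a device pointer whether CUdeviceptr is an
    // integer (CUDA) or void * (HIP).
    static inline CUdeviceptr ptr_offset(CUdeviceptr base, size_t bytes) {
        return (CUdeviceptr)((char *)(base) + bytes);
    }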
@@ -360,7 +374,7 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
         access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
         access.location.id = device;
         access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
-        CU_CHECK(cuMemSetAccess(pool_addr + pool_size, reserve_size, &access, 1));
+        CU_CHECK(cuMemSetAccess((CUdeviceptr)((char *)(pool_addr) + pool_size), reserve_size, &access, 1));
 
         // add to the pool
         pool_size += reserve_size;
@@ -372,7 +386,7 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
 
         GGML_ASSERT(pool_addr != 0);
 
-        void * ptr = (void *) (pool_addr + pool_used);
+        void * ptr = (void *) ((CUdeviceptr)((char *)(pool_addr) + pool_used));
         *actual_size = size;
         pool_used += size;
 
@@ -391,17 +405,17 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
         pool_used -= size;
 
         // all deallocations must be in reverse order of the allocations
-        GGML_ASSERT(ptr == (void *) (pool_addr + pool_used));
+        GGML_ASSERT(ptr == (void *) ((char *)(pool_addr) + pool_used));
     }
 };
-#endif // !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
+#endif // !defined(GGML_CUDA_NO_VMM)
 
 std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(int device) {
-#if !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
+#if !defined(GGML_CUDA_NO_VMM)
     if (ggml_cuda_info().devices[device].vmm) {
         return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_vmm(device));
     }
-#endif // !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
+#endif // !defined(GGML_CUDA_NO_VMM)
     return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_leg(device));
 }
 
@@ -547,7 +561,7 @@ static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_bac
     cudaError_t err = ggml_cuda_device_malloc(&dev_ptr, size, buft_ctx->device);
     if (err != cudaSuccess) {
         // clear the error
-        cudaGetLastError();
+        (void) cudaGetLastError();
         GGML_LOG_ERROR("%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size / 1024.0 / 1024.0, buft_ctx->device, cudaGetErrorString(err));
         return nullptr;
     }
@@ -962,7 +976,7 @@ static void * ggml_cuda_host_malloc(size_t size) {
     cudaError_t err = cudaMallocHost((void **) &ptr, size);
     if (err != cudaSuccess) {
         // clear the error
-        cudaGetLastError();
+        (void) cudaGetLastError();
         GGML_LOG_DEBUG("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
                        size / 1024.0 / 1024.0, cudaGetErrorString(err));
         return nullptr;
@@ -1209,15 +1223,15 @@ static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) {
                     CUDA_CHECK(err);
                 } else {
                     // reset the error
-                    cudaGetLastError();
+                    (void) cudaGetLastError();
                 }
             } else {
                 cudaError_t err = cudaDeviceDisablePeerAccess(id_other);
                 if (err != cudaErrorPeerAccessNotEnabled) {
                     CUDA_CHECK(err);
                 } else {
                     // reset the error
-                    cudaGetLastError();
+                    (void) cudaGetLastError();
                 }
             }
         }
@@ -2452,7 +2466,7 @@ static void maintain_cuda_graph(ggml_backend_cuda_context * cuda_ctx, std::vecto
         if (stat == cudaErrorInvalidDeviceFunction) {
             // Fails due to incorrect handling by CUDA runtime of CUDA BLAS node.
             // We don't need to update blas nodes, so clear error and move on.
-            cudaGetLastError();
+            (void) cudaGetLastError();
         } else {
             GGML_ASSERT(stat == cudaSuccess);
         }
@@ -2507,14 +2521,20 @@ static bool is_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx,
 static void update_cuda_graph_executable(ggml_backend_cuda_context * cuda_ctx) {
 
     cudaGraphExecUpdateResultInfo result_info;
+#ifdef __HIP_PLATFORM_AMD__
+    hipGraphNode_t errorNode;
+    hipError_t stat = hipGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &errorNode, &result_info);
+#else
     cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &result_info);
+#endif
     if (stat == cudaErrorGraphExecUpdateFailure) {
 #ifndef NDEBUG
         GGML_LOG_DEBUG("%s: CUDA graph update failed\n", __func__);
 #endif
+
         // The pre-existing graph exec cannot be updated due to violated constraints
         // so instead clear error and re-instantiate
-        cudaGetLastError();
+        (void) cudaGetLastError();
         CUDA_CHECK(cudaGraphExecDestroy(cuda_ctx->cuda_graph->instance));
         cuda_ctx->cuda_graph->instance = nullptr;
         CUDA_CHECK(cudaGraphInstantiate(&cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, NULL, NULL, 0));
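The #ifdef in this hunk absorbs a signature difference: hipGraphExecUpdate still takes the older four-argument form with an explicit error-node out-parameter, while CUDA 12's cudaGraphExecUpdate folds the failing node into cudaGraphExecUpdateResultInfo. Either way, the shared failure handling below keeps working off stat and result_info. A sketch of a wrapper that would hide the split (helper name illustrative; assumes the backend's usual cuda -> hip type aliases):

    static cudaError_t graph_exec_update(cudaGraphExec_t exec, cudaGraph_t graph,
                                         cudaGraphExecUpdateResultInfo * result_info) {
    #ifdef __HIP_PLATFORM_AMD__
        hipGraphNode_t error_node; // HIP reports the offending node via an out-param
        return hipGraphExecUpdate(exec, graph, &error_node, result_info);
    #else
        return cudaGraphExecUpdate(exec, graph, result_info);
    #endif
    }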
@@ -2742,7 +2762,7 @@ bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size) {
     cudaError_t err = cudaHostRegister(buffer, size, cudaHostRegisterPortable | cudaHostRegisterReadOnly);
     if (err != cudaSuccess) {
         // clear the error
-        cudaGetLastError();
+        (void) cudaGetLastError();
 
         GGML_LOG_DEBUG("%s: failed to register %.2f MiB of pinned memory: %s\n", __func__,
                        size / 1024.0 / 1024.0, cudaGetErrorString(err));
@@ -2762,7 +2782,7 @@ void ggml_backend_cuda_unregister_host_buffer(void * buffer) {
     cudaError_t err = cudaHostUnregister(buffer);
     if (err != cudaSuccess) {
         // clear the error
-        cudaGetLastError();
+        (void) cudaGetLastError();
     }
 }
 