Skip to content

Commit 00ab8a6

Browse files
authored
[flang][cuda] Use cuda runtime API (#103488)
CUDA Fortran is meant to be an equivalent to the runtime API. Therefore, it makes more sense to use the cuda rt API in the allocators for CUF. @bdudleback
1 parent e1b1550 commit 00ab8a6

File tree

4 files changed

+22
-52
lines changed

4 files changed

+22
-52
lines changed

flang/include/flang/Runtime/CUDA/allocator.h

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,11 +13,10 @@
1313
#include "flang/Runtime/entry-names.h"
1414

1515
#define CUDA_REPORT_IF_ERROR(expr) \
16-
[](CUresult result) { \
17-
if (!result) \
16+
[](cudaError_t err) { \
17+
if (err == cudaSuccess) \
1818
return; \
19-
const char *name = nullptr; \
20-
cuGetErrorName(result, &name); \
19+
const char *name = cudaGetErrorName(err); \
2120
if (!name) \
2221
name = "<unknown>"; \
2322
Terminator terminator{__FILE__, __LINE__}; \

flang/runtime/CUDA/CMakeLists.txt

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,14 +7,20 @@
77
#===------------------------------------------------------------------------===#
88

99
include_directories(${CUDAToolkit_INCLUDE_DIRS})
10-
find_library(CUDA_RUNTIME_LIBRARY cuda HINTS ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES} REQUIRED)
1110

1211
add_flang_library(CufRuntime
1312
allocator.cpp
1413
descriptor.cpp
1514
)
15+
16+
if (BUILD_SHARED_LIBS)
17+
set(CUF_LIBRARY ${CUDA_LIBRARIES})
18+
else()
19+
set(CUF_LIBRARY ${CUDA_cudart_static_LIBRARY})
20+
endif()
21+
1622
target_link_libraries(CufRuntime
1723
PRIVATE
1824
FortranRuntime
19-
${CUDA_RUNTIME_LIBRARY}
25+
${CUF_LIBRARY}
2026
)

flang/runtime/CUDA/allocator.cpp

Lines changed: 10 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
#include "flang/ISO_Fortran_binding_wrapper.h"
1616
#include "flang/Runtime/allocator-registry.h"
1717

18-
#include "cuda.h"
18+
#include "cuda_runtime.h"
1919

2020
namespace Fortran::runtime::cuda {
2121
extern "C" {
@@ -34,32 +34,28 @@ void RTDEF(CUFRegisterAllocator)() {
3434

3535
void *CUFAllocPinned(std::size_t sizeInBytes) {
3636
void *p;
37-
CUDA_REPORT_IF_ERROR(cuMemAllocHost(&p, sizeInBytes));
37+
CUDA_REPORT_IF_ERROR(cudaMallocHost((void **)&p, sizeInBytes));
3838
return p;
3939
}
4040

41-
void CUFFreePinned(void *p) { CUDA_REPORT_IF_ERROR(cuMemFreeHost(p)); }
41+
void CUFFreePinned(void *p) { CUDA_REPORT_IF_ERROR(cudaFreeHost(p)); }
4242

4343
void *CUFAllocDevice(std::size_t sizeInBytes) {
44-
CUdeviceptr p = 0;
45-
CUDA_REPORT_IF_ERROR(cuMemAlloc(&p, sizeInBytes));
46-
return reinterpret_cast<void *>(p);
44+
void *p;
45+
CUDA_REPORT_IF_ERROR(cudaMalloc(&p, sizeInBytes));
46+
return p;
4747
}
4848

49-
void CUFFreeDevice(void *p) {
50-
CUDA_REPORT_IF_ERROR(cuMemFree(reinterpret_cast<CUdeviceptr>(p)));
51-
}
49+
void CUFFreeDevice(void *p) { CUDA_REPORT_IF_ERROR(cudaFree(p)); }
5250

5351
void *CUFAllocManaged(std::size_t sizeInBytes) {
54-
CUdeviceptr p = 0;
52+
void *p;
5553
CUDA_REPORT_IF_ERROR(
56-
cuMemAllocManaged(&p, sizeInBytes, CU_MEM_ATTACH_GLOBAL));
54+
cudaMallocManaged((void **)&p, sizeInBytes, cudaMemAttachGlobal));
5755
return reinterpret_cast<void *>(p);
5856
}
5957

60-
void CUFFreeManaged(void *p) {
61-
CUDA_REPORT_IF_ERROR(cuMemFree(reinterpret_cast<CUdeviceptr>(p)));
62-
}
58+
void CUFFreeManaged(void *p) { CUDA_REPORT_IF_ERROR(cudaFree(p)); }
6359

6460
void *CUFAllocUnified(std::size_t sizeInBytes) {
6561
// Call alloc managed for the time being.

flang/unittests/Runtime/CUDA/AllocatorCUF.cpp

Lines changed: 1 addition & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
#include "flang/Runtime/allocatable.h"
1515
#include "flang/Runtime/allocator-registry.h"
1616

17-
#include "cuda.h"
17+
#include "cuda_runtime.h"
1818

1919
using namespace Fortran::runtime;
2020
using namespace Fortran::runtime::cuda;
@@ -25,38 +25,9 @@ static OwningPtr<Descriptor> createAllocatable(
2525
CFI_attribute_allocatable);
2626
}
2727

28-
thread_local static int32_t defaultDevice = 0;
29-
30-
CUdevice getDefaultCuDevice() {
31-
CUdevice device;
32-
CUDA_REPORT_IF_ERROR(cuDeviceGet(&device, /*ordinal=*/defaultDevice));
33-
return device;
34-
}
35-
36-
class ScopedContext {
37-
public:
38-
ScopedContext() {
39-
// Static reference to CUDA primary context for device ordinal
40-
// defaultDevice.
41-
static CUcontext context = [] {
42-
CUDA_REPORT_IF_ERROR(cuInit(/*flags=*/0));
43-
CUcontext ctx;
44-
// Note: this does not affect the current context.
45-
CUDA_REPORT_IF_ERROR(
46-
cuDevicePrimaryCtxRetain(&ctx, getDefaultCuDevice()));
47-
return ctx;
48-
}();
49-
50-
CUDA_REPORT_IF_ERROR(cuCtxPushCurrent(context));
51-
}
52-
53-
~ScopedContext() { CUDA_REPORT_IF_ERROR(cuCtxPopCurrent(nullptr)); }
54-
};
55-
5628
TEST(AllocatableCUFTest, SimpleDeviceAllocate) {
5729
using Fortran::common::TypeCategory;
5830
RTNAME(CUFRegisterAllocator)();
59-
ScopedContext ctx;
6031
// REAL(4), DEVICE, ALLOCATABLE :: a(:)
6132
auto a{createAllocatable(TypeCategory::Real, 4)};
6233
a->SetAllocIdx(kDeviceAllocatorPos);
@@ -74,7 +45,6 @@ TEST(AllocatableCUFTest, SimpleDeviceAllocate) {
7445
TEST(AllocatableCUFTest, SimplePinnedAllocate) {
7546
using Fortran::common::TypeCategory;
7647
RTNAME(CUFRegisterAllocator)();
77-
ScopedContext ctx;
7848
// INTEGER(4), PINNED, ALLOCATABLE :: a(:)
7949
auto a{createAllocatable(TypeCategory::Integer, 4)};
8050
EXPECT_FALSE(a->HasAddendum());
@@ -93,7 +63,6 @@ TEST(AllocatableCUFTest, SimplePinnedAllocate) {
9363
TEST(AllocatableCUFTest, DescriptorAllocationTest) {
9464
using Fortran::common::TypeCategory;
9565
RTNAME(CUFRegisterAllocator)();
96-
ScopedContext ctx;
9766
// REAL(4), DEVICE, ALLOCATABLE :: a(:)
9867
auto a{createAllocatable(TypeCategory::Real, 4)};
9968
Descriptor *desc = nullptr;

0 commit comments

Comments
 (0)