Skip to content

Commit cdf447b

Browse files
authored
[flang][cuda] Add function to allocate and deallocate device module variable (#109213)
This patch adds new runtime entry points that perform the simple allocation/deallocation of module allocatable variable with cuda attributes. When the allocation is initiated on the host, the descriptor on the device is synchronized. Both descriptors point to the same data on the device. This is the first PR of a stack.
1 parent c9aa9d5 commit cdf447b

File tree

12 files changed

+242
-17
lines changed

12 files changed

+242
-17
lines changed
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
//===-- include/flang/Runtime/CUDA/allocatable.h ----------------*- C++ -*-===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#ifndef FORTRAN_RUNTIME_CUDA_ALLOCATABLE_H_
10+
#define FORTRAN_RUNTIME_CUDA_ALLOCATABLE_H_
11+
12+
#include "flang/Runtime/descriptor.h"
13+
#include "flang/Runtime/entry-names.h"
14+
15+
namespace Fortran::runtime::cuda {
16+
17+
extern "C" {
18+
19+
/// Perform allocation of the descriptor with synchronization of it when
20+
/// necessary.
21+
int RTDECL(CUFAllocatableAllocate)(Descriptor &, bool hasStat = false,
22+
const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
23+
int sourceLine = 0);
24+
25+
/// Perform deallocation of the descriptor with synchronization of it when
26+
/// necessary.
27+
int RTDECL(CUFAllocatableDeallocate)(Descriptor &, bool hasStat = false,
28+
const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
29+
int sourceLine = 0);
30+
31+
} // extern "C"
32+
33+
} // namespace Fortran::runtime::cuda
34+
#endif // FORTRAN_RUNTIME_CUDA_ALLOCATABLE_H_

flang/include/flang/Runtime/CUDA/allocator.h

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -12,17 +12,6 @@
1212
#include "flang/Runtime/descriptor.h"
1313
#include "flang/Runtime/entry-names.h"
1414

15-
#define CUDA_REPORT_IF_ERROR(expr) \
16-
[](cudaError_t err) { \
17-
if (err == cudaSuccess) \
18-
return; \
19-
const char *name = cudaGetErrorName(err); \
20-
if (!name) \
21-
name = "<unknown>"; \
22-
Terminator terminator{__FILE__, __LINE__}; \
23-
terminator.Crash("'%s' failed with '%s'", #expr, name); \
24-
}(expr)
25-
2615
namespace Fortran::runtime::cuda {
2716

2817
extern "C" {
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
//===-- include/flang/Runtime/CUDA/common.h ------------------*- C++ -*-===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#ifndef FORTRAN_RUNTIME_CUDA_COMMON_H_
10+
#define FORTRAN_RUNTIME_CUDA_COMMON_H_
11+
12+
#include "flang/Runtime/descriptor.h"
13+
#include "flang/Runtime/entry-names.h"
14+
15+
static constexpr unsigned kHostToDevice = 0;
16+
static constexpr unsigned kDeviceToHost = 1;
17+
static constexpr unsigned kDeviceToDevice = 2;
18+
19+
#define CUDA_REPORT_IF_ERROR(expr) \
20+
[](cudaError_t err) { \
21+
if (err == cudaSuccess) \
22+
return; \
23+
const char *name = cudaGetErrorName(err); \
24+
if (!name) \
25+
name = "<unknown>"; \
26+
Terminator terminator{__FILE__, __LINE__}; \
27+
terminator.Crash("'%s' failed with '%s'", #expr, name); \
28+
}(expr)
29+
30+
#endif // FORTRAN_RUNTIME_CUDA_COMMON_H_

flang/include/flang/Runtime/CUDA/descriptor.h

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,14 +17,23 @@ namespace Fortran::runtime::cuda {
1717

1818
extern "C" {
1919

20-
// Allocate a descriptor in managed.
20+
/// Allocate a descriptor in managed.
2121
Descriptor *RTDECL(CUFAllocDesciptor)(
2222
std::size_t, const char *sourceFile = nullptr, int sourceLine = 0);
2323

24-
// Deallocate a descriptor allocated in managed or unified memory.
24+
/// Deallocate a descriptor allocated in managed or unified memory.
2525
void RTDECL(CUFFreeDesciptor)(
2626
Descriptor *, const char *sourceFile = nullptr, int sourceLine = 0);
2727

28+
/// Retrieve the device pointer from the host one.
29+
void *RTDECL(CUFGetDeviceAddress)(
30+
void *hostPtr, const char *sourceFile = nullptr, int sourceLine = 0);
31+
32+
/// Sync the \p src descriptor to the \p dst descriptor.
33+
void RTDECL(CUFDescriptorSync)(Descriptor *dst, const Descriptor *src,
34+
const char *sourceFile = nullptr, int sourceLine = 0);
35+
2836
} // extern "C"
37+
2938
} // namespace Fortran::runtime::cuda
3039
#endif // FORTRAN_RUNTIME_CUDA_DESCRIPTOR_H_

flang/include/flang/Runtime/CUDA/memory.h

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,6 @@
1313
#include "flang/Runtime/entry-names.h"
1414
#include <cstddef>
1515

16-
static constexpr unsigned kHostToDevice = 0;
17-
static constexpr unsigned kDeviceToHost = 1;
18-
static constexpr unsigned kDeviceToDevice = 2;
19-
2016
namespace Fortran::runtime::cuda {
2117

2218
extern "C" {

flang/lib/Optimizer/Transforms/CufOpConversion.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
#include "flang/Optimizer/Dialect/FIROps.h"
1515
#include "flang/Optimizer/HLFIR/HLFIROps.h"
1616
#include "flang/Optimizer/Support/DataLayout.h"
17+
#include "flang/Runtime/CUDA/common.h"
1718
#include "flang/Runtime/CUDA/descriptor.h"
1819
#include "flang/Runtime/CUDA/memory.h"
1920
#include "flang/Runtime/allocatable.h"

flang/runtime/CUDA/CMakeLists.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,12 @@ set(CUFRT_LIBNAME CufRuntime_cuda_${CUDAToolkit_VERSION_MAJOR})
1515

1616
add_flang_library(${CUFRT_LIBNAME}
1717
allocator.cpp
18+
allocatable.cpp
1819
descriptor.cpp
1920
memory.cpp
21+
22+
LINK_COMPONENTS
23+
Support
2024
)
2125

2226
if (BUILD_SHARED_LIBS)

flang/runtime/CUDA/allocatable.cpp

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
//===-- runtime/CUDA/allocatable.cpp --------------------------------------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#include "flang/Runtime/CUDA/allocatable.h"
10+
#include "../stat.h"
11+
#include "../terminator.h"
12+
#include "flang/Runtime/CUDA/common.h"
13+
#include "flang/Runtime/CUDA/descriptor.h"
14+
#include "flang/Runtime/allocatable.h"
15+
#include "llvm/Support/ErrorHandling.h"
16+
17+
#include "cuda_runtime.h"
18+
19+
namespace Fortran::runtime::cuda {
20+
21+
extern "C" {
22+
RT_EXT_API_GROUP_BEGIN
23+
24+
int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, bool hasStat,
25+
const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
26+
if (desc.HasAddendum()) {
27+
Terminator terminator{sourceFile, sourceLine};
28+
// TODO: This require a bit more work to set the correct type descriptor
29+
// address
30+
terminator.Crash(
31+
"not yet implemented: CUDA descriptor allocation with addendum");
32+
}
33+
// Perform the standard allocation.
34+
int stat{RTNAME(AllocatableAllocate)(
35+
desc, hasStat, errMsg, sourceFile, sourceLine)};
36+
#ifndef RT_DEVICE_COMPILATION
37+
// Descriptor synchronization is only done when the allocation is done
38+
// from the host.
39+
if (stat == StatOk) {
40+
void *deviceAddr{
41+
RTNAME(CUFGetDeviceAddress)((void *)&desc, sourceFile, sourceLine)};
42+
RTNAME(CUFDescriptorSync)
43+
((Descriptor *)deviceAddr, &desc, sourceFile, sourceLine);
44+
}
45+
#endif
46+
return stat;
47+
}
48+
49+
int RTDEF(CUFAllocatableDeallocate)(Descriptor &desc, bool hasStat,
50+
const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
51+
// Perform the standard allocation.
52+
int stat{RTNAME(AllocatableDeallocate)(
53+
desc, hasStat, errMsg, sourceFile, sourceLine)};
54+
#ifndef RT_DEVICE_COMPILATION
55+
// Descriptor synchronization is only done when the deallocation is done
56+
// from the host.
57+
if (stat == StatOk) {
58+
void *deviceAddr{
59+
RTNAME(CUFGetDeviceAddress)((void *)&desc, sourceFile, sourceLine)};
60+
RTNAME(CUFDescriptorSync)
61+
((Descriptor *)deviceAddr, &desc, sourceFile, sourceLine);
62+
}
63+
#endif
64+
return stat;
65+
}
66+
67+
RT_EXT_API_GROUP_END
68+
69+
} // extern "C"
70+
71+
} // namespace Fortran::runtime::cuda

flang/runtime/CUDA/allocator.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
#include "../type-info.h"
1414
#include "flang/Common/Fortran.h"
1515
#include "flang/ISO_Fortran_binding_wrapper.h"
16+
#include "flang/Runtime/CUDA/common.h"
1617
#include "flang/Runtime/allocator-registry.h"
1718

1819
#include "cuda_runtime.h"

flang/runtime/CUDA/descriptor.cpp

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,11 @@
77
//===----------------------------------------------------------------------===//
88

99
#include "flang/Runtime/CUDA/descriptor.h"
10+
#include "../terminator.h"
1011
#include "flang/Runtime/CUDA/allocator.h"
12+
#include "flang/Runtime/CUDA/common.h"
13+
14+
#include "cuda_runtime.h"
1115

1216
namespace Fortran::runtime::cuda {
1317
extern "C" {
@@ -23,6 +27,24 @@ void RTDEF(CUFFreeDesciptor)(
2327
CUFFreeManaged(reinterpret_cast<void *>(desc));
2428
}
2529

30+
void *RTDEF(CUFGetDeviceAddress)(
31+
void *hostPtr, const char *sourceFile, int sourceLine) {
32+
Terminator terminator{sourceFile, sourceLine};
33+
void *p;
34+
CUDA_REPORT_IF_ERROR(cudaGetSymbolAddress((void **)&p, hostPtr));
35+
if (!p) {
36+
terminator.Crash("Could not retrieve symbol's address");
37+
}
38+
return p;
39+
}
40+
41+
void RTDEF(CUFDescriptorSync)(Descriptor *dst, const Descriptor *src,
42+
const char *sourceFile, int sourceLine) {
43+
std::size_t count{src->SizeInBytes()};
44+
CUDA_REPORT_IF_ERROR(cudaMemcpy(
45+
(void *)dst, (const void *)src, count, cudaMemcpyHostToDevice));
46+
}
47+
2648
RT_EXT_API_GROUP_END
2749
}
2850
} // namespace Fortran::runtime::cuda
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
//===-- flang/unittests/Runtime/Allocatable.cpp ------------------*- C++-*-===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#include "flang/Runtime/allocatable.h"
10+
#include "gtest/gtest.h"
11+
#include "../../../runtime/terminator.h"
12+
#include "flang/Common/Fortran.h"
13+
#include "flang/Runtime/CUDA/allocator.h"
14+
#include "flang/Runtime/CUDA/common.h"
15+
#include "flang/Runtime/CUDA/descriptor.h"
16+
#include "flang/Runtime/allocator-registry.h"
17+
18+
#include "cuda_runtime.h"
19+
20+
using namespace Fortran::runtime;
21+
using namespace Fortran::runtime::cuda;
22+
23+
static OwningPtr<Descriptor> createAllocatable(
24+
Fortran::common::TypeCategory tc, int kind, int rank = 1) {
25+
return Descriptor::Create(TypeCode{tc, kind}, kind, nullptr, rank, nullptr,
26+
CFI_attribute_allocatable);
27+
}
28+
29+
TEST(AllocatableCUFTest, SimpleDeviceAllocatable) {
30+
using Fortran::common::TypeCategory;
31+
RTNAME(CUFRegisterAllocator)();
32+
// REAL(4), DEVICE, ALLOCATABLE :: a(:)
33+
auto a{createAllocatable(TypeCategory::Real, 4)};
34+
a->SetAllocIdx(kDeviceAllocatorPos);
35+
EXPECT_EQ((int)kDeviceAllocatorPos, a->GetAllocIdx());
36+
EXPECT_FALSE(a->HasAddendum());
37+
RTNAME(AllocatableSetBounds)(*a, 0, 1, 10);
38+
39+
// Emulate a device descriptor for the purpose of unit testing part of the
40+
// code.
41+
Descriptor *device_desc;
42+
CUDA_REPORT_IF_ERROR(cudaMalloc(&device_desc, a->SizeInBytes()));
43+
44+
RTNAME(AllocatableAllocate)
45+
(*a, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__);
46+
EXPECT_TRUE(a->IsAllocated());
47+
RTNAME(CUFDescriptorSync)(device_desc, a.get(), __FILE__, __LINE__);
48+
cudaDeviceSynchronize();
49+
50+
EXPECT_EQ(cudaSuccess, cudaGetLastError());
51+
52+
RTNAME(AllocatableDeallocate)
53+
(*a, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__);
54+
EXPECT_FALSE(a->IsAllocated());
55+
56+
RTNAME(CUFDescriptorSync)(device_desc, a.get(), __FILE__, __LINE__);
57+
cudaDeviceSynchronize();
58+
59+
EXPECT_EQ(cudaSuccess, cudaGetLastError());
60+
}

flang/unittests/Runtime/CUDA/CMakeLists.txt

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,19 @@
11
if (FLANG_CUF_RUNTIME)
22

33
add_flang_unittest(FlangCufRuntimeTests
4+
Allocatable.cpp
45
AllocatorCUF.cpp
56
)
67

8+
if (BUILD_SHARED_LIBS)
9+
set(CUDA_RT_TARGET CUDA::cudart)
10+
else()
11+
set(CUDA_RT_TARGET CUDA::cudart_static)
12+
endif()
13+
714
target_link_libraries(FlangCufRuntimeTests
815
PRIVATE
16+
${CUDA_RT_TARGET}
917
CufRuntime_cuda_${CUDAToolkit_VERSION_MAJOR}
1018
FortranRuntime
1119
)

0 commit comments

Comments
 (0)