Skip to content

Commit 83ccaad

Browse files
authored
[flang][cuda] Use async id for device stream allocation (#118733)
When stream is specified use cudaMallocAsync with the specified stream
1 parent b6c0f1b commit 83ccaad

File tree

3 files changed

+32
-13
lines changed

3 files changed

+32
-13
lines changed

flang/include/flang/Runtime/CUDA/allocator.h

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
#ifndef FORTRAN_RUNTIME_CUDA_ALLOCATOR_H_
1010
#define FORTRAN_RUNTIME_CUDA_ALLOCATOR_H_
1111

12+
#include "common.h"
1213
#include "flang/Runtime/descriptor.h"
1314
#include "flang/Runtime/entry-names.h"
1415

@@ -19,16 +20,16 @@ extern "C" {
1920
void RTDECL(CUFRegisterAllocator)();
2021
}
2122

22-
void *CUFAllocPinned(std::size_t, std::int64_t);
23+
void *CUFAllocPinned(std::size_t, std::int64_t = kCudaNoStream);
2324
void CUFFreePinned(void *);
2425

2526
void *CUFAllocDevice(std::size_t, std::int64_t);
2627
void CUFFreeDevice(void *);
2728

28-
void *CUFAllocManaged(std::size_t, std::int64_t);
29+
void *CUFAllocManaged(std::size_t, std::int64_t = kCudaNoStream);
2930
void CUFFreeManaged(void *);
3031

31-
void *CUFAllocUnified(std::size_t, std::int64_t);
32+
void *CUFAllocUnified(std::size_t, std::int64_t = kCudaNoStream);
3233
void CUFFreeUnified(void *);
3334

3435
} // namespace Fortran::runtime::cuda

flang/runtime/CUDA/allocator.cpp

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -33,26 +33,28 @@ void RTDEF(CUFRegisterAllocator)() {
3333
}
3434
}
3535

36-
void *CUFAllocPinned(
37-
std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) {
36+
void *CUFAllocPinned(std::size_t sizeInBytes, std::int64_t) {
3837
void *p;
3938
CUDA_REPORT_IF_ERROR(cudaMallocHost((void **)&p, sizeInBytes));
4039
return p;
4140
}
4241

4342
void CUFFreePinned(void *p) { CUDA_REPORT_IF_ERROR(cudaFreeHost(p)); }
4443

45-
void *CUFAllocDevice(
46-
std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) {
44+
void *CUFAllocDevice(std::size_t sizeInBytes, std::int64_t stream) {
4745
void *p;
48-
CUDA_REPORT_IF_ERROR(cudaMalloc(&p, sizeInBytes));
46+
if (stream >= 0) {
47+
CUDA_REPORT_IF_ERROR(
48+
cudaMallocAsync(&p, sizeInBytes, (cudaStream_t)stream));
49+
} else {
50+
CUDA_REPORT_IF_ERROR(cudaMalloc(&p, sizeInBytes));
51+
}
4952
return p;
5053
}
5154

5255
void CUFFreeDevice(void *p) { CUDA_REPORT_IF_ERROR(cudaFree(p)); }
5356

54-
void *CUFAllocManaged(
55-
std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) {
57+
void *CUFAllocManaged(std::size_t sizeInBytes, std::int64_t) {
5658
void *p;
5759
CUDA_REPORT_IF_ERROR(
5860
cudaMallocManaged((void **)&p, sizeInBytes, cudaMemAttachGlobal));
@@ -61,10 +63,9 @@ void *CUFAllocManaged(
6163

6264
void CUFFreeManaged(void *p) { CUDA_REPORT_IF_ERROR(cudaFree(p)); }
6365

64-
void *CUFAllocUnified(
65-
std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) {
66+
void *CUFAllocUnified(std::size_t sizeInBytes, std::int64_t) {
6667
// Call alloc managed for the time being.
67-
return CUFAllocManaged(sizeInBytes, asyncId);
68+
return CUFAllocManaged(sizeInBytes);
6869
}
6970

7071
void CUFFreeUnified(void *p) {

flang/unittests/Runtime/CUDA/AllocatorCUF.cpp

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,23 @@ TEST(AllocatableCUFTest, SimpleDeviceAllocate) {
4343
EXPECT_FALSE(a->IsAllocated());
4444
}
4545

46+
TEST(AllocatableCUFTest, SimpleStreamDeviceAllocate) {
47+
using Fortran::common::TypeCategory;
48+
RTNAME(CUFRegisterAllocator)();
49+
// REAL(4), DEVICE, ALLOCATABLE :: a(:)
50+
auto a{createAllocatable(TypeCategory::Real, 4)};
51+
a->SetAllocIdx(kDeviceAllocatorPos);
52+
EXPECT_EQ((int)kDeviceAllocatorPos, a->GetAllocIdx());
53+
EXPECT_FALSE(a->HasAddendum());
54+
RTNAME(AllocatableSetBounds)(*a, 0, 1, 10);
55+
RTNAME(AllocatableAllocate)
56+
(*a, 1, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__);
57+
EXPECT_TRUE(a->IsAllocated());
58+
RTNAME(AllocatableDeallocate)
59+
(*a, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__);
60+
EXPECT_FALSE(a->IsAllocated());
61+
}
62+
4663
TEST(AllocatableCUFTest, SimplePinnedAllocate) {
4764
using Fortran::common::TypeCategory;
4865
RTNAME(CUFRegisterAllocator)();

0 commit comments

Comments
 (0)