Skip to content

Commit 7d1c661

Browse files
authored
[flang] Allow to pass an async id to allocate the descriptor (#118713)
This is a patch in preparation for the support stream ordered memory allocator in CUDA Fortran. This patch adds an asynchronous id to the AllocatableAllocate runtime function and to Descriptor::Allocate so it can be passed down to the registered allocator. It is up to the allocator to use this value or not. A follow up patch will implement that asynchronous allocator for CUDA Fortran.
1 parent 970d6d2 commit 7d1c661

22 files changed

+88
-64
lines changed

flang/include/flang/Runtime/CUDA/allocator.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,16 +19,16 @@ extern "C" {
1919
void RTDECL(CUFRegisterAllocator)();
2020
}
2121

22-
void *CUFAllocPinned(std::size_t);
22+
void *CUFAllocPinned(std::size_t, std::int64_t);
2323
void CUFFreePinned(void *);
2424

25-
void *CUFAllocDevice(std::size_t);
25+
void *CUFAllocDevice(std::size_t, std::int64_t);
2626
void CUFFreeDevice(void *);
2727

28-
void *CUFAllocManaged(std::size_t);
28+
void *CUFAllocManaged(std::size_t, std::int64_t);
2929
void CUFFreeManaged(void *);
3030

31-
void *CUFAllocUnified(std::size_t);
31+
void *CUFAllocUnified(std::size_t, std::int64_t);
3232
void CUFFreeUnified(void *);
3333

3434
} // namespace Fortran::runtime::cuda

flang/include/flang/Runtime/CUDA/common.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,9 @@ static constexpr unsigned kHostToDevice = 0;
2323
static constexpr unsigned kDeviceToHost = 1;
2424
static constexpr unsigned kDeviceToDevice = 2;
2525

26+
/// Value used for asyncId when no specific stream is specified.
27+
static constexpr std::int64_t kCudaNoStream = -1;
28+
2629
#define CUDA_REPORT_IF_ERROR(expr) \
2730
[](cudaError_t err) { \
2831
if (err == cudaSuccess) \

flang/include/flang/Runtime/allocatable.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -94,9 +94,9 @@ int RTDECL(AllocatableCheckLengthParameter)(Descriptor &,
9494
// Successfully allocated memory is initialized if the allocatable has a
9595
// derived type, and is always initialized by AllocatableAllocateSource().
9696
// Performs all necessary coarray synchronization and validation actions.
97-
int RTDECL(AllocatableAllocate)(Descriptor &, bool hasStat = false,
98-
const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
99-
int sourceLine = 0);
97+
int RTDECL(AllocatableAllocate)(Descriptor &, std::int64_t asyncId = -1,
98+
bool hasStat = false, const Descriptor *errMsg = nullptr,
99+
const char *sourceFile = nullptr, int sourceLine = 0);
100100
int RTDECL(AllocatableAllocateSource)(Descriptor &, const Descriptor &source,
101101
bool hasStat = false, const Descriptor *errMsg = nullptr,
102102
const char *sourceFile = nullptr, int sourceLine = 0);

flang/include/flang/Runtime/allocator-registry.h

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
#define FORTRAN_RUNTIME_ALLOCATOR_REGISTRY_H_
1111

1212
#include "flang/Common/api-attrs.h"
13+
#include <cstdint>
1314
#include <cstdlib>
1415
#include <vector>
1516

@@ -25,18 +26,19 @@ static constexpr unsigned kUnifiedAllocatorPos = 4;
2526

2627
namespace Fortran::runtime {
2728

28-
using AllocFct = void *(*)(std::size_t);
29+
using AllocFct = void *(*)(std::size_t, std::int64_t);
2930
using FreeFct = void (*)(void *);
3031

3132
typedef struct Allocator_t {
3233
AllocFct alloc{nullptr};
3334
FreeFct free{nullptr};
3435
} Allocator_t;
3536

36-
#ifdef RT_DEVICE_COMPILATION
37-
static RT_API_ATTRS void *MallocWrapper(std::size_t size) {
37+
static RT_API_ATTRS void *MallocWrapper(
38+
std::size_t size, [[maybe_unused]] std::int64_t) {
3839
return std::malloc(size);
3940
}
41+
#ifdef RT_DEVICE_COMPILATION
4042
static RT_API_ATTRS void FreeWrapper(void *p) { return std::free(p); }
4143
#endif
4244

@@ -46,7 +48,7 @@ struct AllocatorRegistry {
4648
: allocators{{&MallocWrapper, &FreeWrapper}} {}
4749
#else
4850
constexpr AllocatorRegistry() {
49-
allocators[kDefaultAllocator] = {&std::malloc, &std::free};
51+
allocators[kDefaultAllocator] = {&MallocWrapper, &std::free};
5052
};
5153
#endif
5254
RT_API_ATTRS void Register(int, Allocator_t);

flang/include/flang/Runtime/descriptor.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -374,7 +374,7 @@ class Descriptor {
374374
// before calling. It (re)computes the byte strides after
375375
// allocation. Does not allocate automatic components or
376376
// perform default component initialization.
377-
RT_API_ATTRS int Allocate();
377+
RT_API_ATTRS int Allocate(std::int64_t asyncId = -1);
378378
RT_API_ATTRS void SetByteStrides();
379379

380380
// Deallocates storage; does not call FINAL subroutines or

flang/lib/Lower/Allocatable.cpp

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -184,9 +184,14 @@ static mlir::Value genRuntimeAllocate(fir::FirOpBuilder &builder,
184184
? fir::runtime::getRuntimeFunc<mkRTKey(PointerAllocate)>(loc, builder)
185185
: fir::runtime::getRuntimeFunc<mkRTKey(AllocatableAllocate)>(loc,
186186
builder);
187-
llvm::SmallVector<mlir::Value> args{
188-
box.getAddr(), errorManager.hasStat, errorManager.errMsgAddr,
189-
errorManager.sourceFile, errorManager.sourceLine};
187+
llvm::SmallVector<mlir::Value> args{box.getAddr()};
188+
if (!box.isPointer())
189+
args.push_back(
190+
builder.createIntegerConstant(loc, builder.getI64Type(), -1));
191+
args.push_back(errorManager.hasStat);
192+
args.push_back(errorManager.errMsgAddr);
193+
args.push_back(errorManager.sourceFile);
194+
args.push_back(errorManager.sourceLine);
190195
llvm::SmallVector<mlir::Value> operands;
191196
for (auto [fst, snd] : llvm::zip(args, callee.getFunctionType().getInputs()))
192197
operands.emplace_back(builder.createConvert(loc, snd, fst));

flang/lib/Optimizer/Builder/Runtime/Allocatable.cpp

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -76,16 +76,19 @@ void fir::runtime::genAllocatableAllocate(fir::FirOpBuilder &builder,
7676
mlir::func::FuncOp func{
7777
fir::runtime::getRuntimeFunc<mkRTKey(AllocatableAllocate)>(loc, builder)};
7878
mlir::FunctionType fTy{func.getFunctionType()};
79+
mlir::Value asyncId =
80+
builder.createIntegerConstant(loc, builder.getI64Type(), -1);
7981
mlir::Value sourceFile{fir::factory::locationToFilename(builder, loc)};
8082
mlir::Value sourceLine{
81-
fir::factory::locationToLineNo(builder, loc, fTy.getInput(4))};
83+
fir::factory::locationToLineNo(builder, loc, fTy.getInput(5))};
8284
if (!hasStat)
8385
hasStat = builder.createBool(loc, false);
8486
if (!errMsg) {
8587
mlir::Type boxNoneTy = fir::BoxType::get(builder.getNoneType());
8688
errMsg = builder.create<fir::AbsentOp>(loc, boxNoneTy).getResult();
8789
}
88-
llvm::SmallVector<mlir::Value> args{fir::runtime::createArguments(
89-
builder, loc, fTy, desc, hasStat, errMsg, sourceFile, sourceLine)};
90+
llvm::SmallVector<mlir::Value> args{
91+
fir::runtime::createArguments(builder, loc, fTy, desc, asyncId, hasStat,
92+
errMsg, sourceFile, sourceLine)};
9093
builder.create<fir::CallOp>(loc, func, args);
9194
}

flang/runtime/CUDA/allocatable.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, int64_t stream,
5252
}
5353
// Perform the standard allocation.
5454
int stat{RTNAME(AllocatableAllocate)(
55-
desc, hasStat, errMsg, sourceFile, sourceLine)};
55+
desc, stream, hasStat, errMsg, sourceFile, sourceLine)};
5656
return stat;
5757
}
5858

flang/runtime/CUDA/allocator.cpp

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -33,23 +33,26 @@ void RTDEF(CUFRegisterAllocator)() {
3333
}
3434
}
3535

36-
void *CUFAllocPinned(std::size_t sizeInBytes) {
36+
void *CUFAllocPinned(
37+
std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) {
3738
void *p;
3839
CUDA_REPORT_IF_ERROR(cudaMallocHost((void **)&p, sizeInBytes));
3940
return p;
4041
}
4142

4243
void CUFFreePinned(void *p) { CUDA_REPORT_IF_ERROR(cudaFreeHost(p)); }
4344

44-
void *CUFAllocDevice(std::size_t sizeInBytes) {
45+
void *CUFAllocDevice(
46+
std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) {
4547
void *p;
4648
CUDA_REPORT_IF_ERROR(cudaMalloc(&p, sizeInBytes));
4749
return p;
4850
}
4951

5052
void CUFFreeDevice(void *p) { CUDA_REPORT_IF_ERROR(cudaFree(p)); }
5153

52-
void *CUFAllocManaged(std::size_t sizeInBytes) {
54+
void *CUFAllocManaged(
55+
std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) {
5356
void *p;
5457
CUDA_REPORT_IF_ERROR(
5558
cudaMallocManaged((void **)&p, sizeInBytes, cudaMemAttachGlobal));
@@ -58,9 +61,10 @@ void *CUFAllocManaged(std::size_t sizeInBytes) {
5861

5962
void CUFFreeManaged(void *p) { CUDA_REPORT_IF_ERROR(cudaFree(p)); }
6063

61-
void *CUFAllocUnified(std::size_t sizeInBytes) {
64+
void *CUFAllocUnified(
65+
std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) {
6266
// Call alloc managed for the time being.
63-
return CUFAllocManaged(sizeInBytes);
67+
return CUFAllocManaged(sizeInBytes, asyncId);
6468
}
6569

6670
void CUFFreeUnified(void *p) {

flang/runtime/CUDA/descriptor.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,8 @@ RT_EXT_API_GROUP_BEGIN
1919

2020
Descriptor *RTDEF(CUFAllocDesciptor)(
2121
std::size_t sizeInBytes, const char *sourceFile, int sourceLine) {
22-
return reinterpret_cast<Descriptor *>(CUFAllocManaged(sizeInBytes));
22+
return reinterpret_cast<Descriptor *>(
23+
CUFAllocManaged(sizeInBytes, kCudaNoStream));
2324
}
2425

2526
void RTDEF(CUFFreeDesciptor)(

flang/runtime/allocatable.cpp

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -133,15 +133,17 @@ void RTDEF(AllocatableApplyMold)(
133133
}
134134
}
135135

136-
int RTDEF(AllocatableAllocate)(Descriptor &descriptor, bool hasStat,
137-
const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
136+
int RTDEF(AllocatableAllocate)(Descriptor &descriptor, std::int64_t asyncId,
137+
bool hasStat, const Descriptor *errMsg, const char *sourceFile,
138+
int sourceLine) {
138139
Terminator terminator{sourceFile, sourceLine};
139140
if (!descriptor.IsAllocatable()) {
140141
return ReturnError(terminator, StatInvalidDescriptor, errMsg, hasStat);
141142
} else if (descriptor.IsAllocated()) {
142143
return ReturnError(terminator, StatBaseNotNull, errMsg, hasStat);
143144
} else {
144-
int stat{ReturnError(terminator, descriptor.Allocate(), errMsg, hasStat)};
145+
int stat{
146+
ReturnError(terminator, descriptor.Allocate(asyncId), errMsg, hasStat)};
145147
if (stat == StatOk) {
146148
if (const DescriptorAddendum * addendum{descriptor.Addendum()}) {
147149
if (const auto *derived{addendum->derivedType()}) {
@@ -160,7 +162,7 @@ int RTDEF(AllocatableAllocateSource)(Descriptor &alloc,
160162
const Descriptor &source, bool hasStat, const Descriptor *errMsg,
161163
const char *sourceFile, int sourceLine) {
162164
int stat{RTNAME(AllocatableAllocate)(
163-
alloc, hasStat, errMsg, sourceFile, sourceLine)};
165+
alloc, /*asyncId=*/-1, hasStat, errMsg, sourceFile, sourceLine)};
164166
if (stat == StatOk) {
165167
Terminator terminator{sourceFile, sourceLine};
166168
DoFromSourceAssign(alloc, source, terminator);

flang/runtime/array-constructor.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -50,17 +50,17 @@ static RT_API_ATTRS void AllocateOrReallocateVectorIfNeeded(
5050
initialAllocationSize(fromElements, to.ElementBytes())};
5151
to.GetDimension(0).SetBounds(1, allocationSize);
5252
RTNAME(AllocatableAllocate)
53-
(to, /*hasStat=*/false, /*errMsg=*/nullptr, vector.sourceFile,
54-
vector.sourceLine);
53+
(to, /*asyncId=*/-1, /*hasStat=*/false, /*errMsg=*/nullptr,
54+
vector.sourceFile, vector.sourceLine);
5555
to.GetDimension(0).SetBounds(1, fromElements);
5656
vector.actualAllocationSize = allocationSize;
5757
} else {
5858
// Do not over-allocate if the final extent was known before pushing the
5959
// first value: there should be no reallocation.
6060
RUNTIME_CHECK(terminator, previousToElements >= fromElements);
6161
RTNAME(AllocatableAllocate)
62-
(to, /*hasStat=*/false, /*errMsg=*/nullptr, vector.sourceFile,
63-
vector.sourceLine);
62+
(to, /*asyncId=*/-1, /*hasStat=*/false, /*errMsg=*/nullptr,
63+
vector.sourceFile, vector.sourceLine);
6464
vector.actualAllocationSize = previousToElements;
6565
}
6666
} else {

flang/runtime/descriptor.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -163,7 +163,7 @@ RT_API_ATTRS static inline int MapAllocIdx(const Descriptor &desc) {
163163
#endif
164164
}
165165

166-
RT_API_ATTRS int Descriptor::Allocate() {
166+
RT_API_ATTRS int Descriptor::Allocate(std::int64_t asyncId) {
167167
std::size_t elementBytes{ElementBytes()};
168168
if (static_cast<std::int64_t>(elementBytes) < 0) {
169169
// F'2023 7.4.4.2 p5: "If the character length parameter value evaluates
@@ -175,7 +175,7 @@ RT_API_ATTRS int Descriptor::Allocate() {
175175
// Zero size allocation is possible in Fortran and the resulting
176176
// descriptor must be allocated/associated. Since std::malloc(0)
177177
// result is implementation defined, always allocate at least one byte.
178-
void *p{alloc(byteSize ? byteSize : 1)};
178+
void *p{alloc(byteSize ? byteSize : 1, asyncId)};
179179
if (!p) {
180180
return CFI_ERROR_MEM_ALLOCATION;
181181
}

flang/test/HLFIR/elemental-codegen.fir

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -192,7 +192,7 @@ func.func @test_polymorphic(%arg0: !fir.class<!fir.type<_QMtypesTt>> {fir.bindc_
192192
// CHECK: %[[VAL_35:.*]] = fir.absent !fir.box<none>
193193
// CHECK: %[[VAL_36:.*]] = fir.convert %[[VAL_4]] : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x?x!fir.type<_QMtypesTt>>>>>) -> !fir.ref<!fir.box<none>>
194194
// CHECK: %[[VAL_37:.*]] = fir.convert %[[VAL_31]] : (!fir.ref<!fir.char<1,{{.*}}>>) -> !fir.ref<i8>
195-
// CHECK: %[[VAL_38:.*]] = fir.call @_FortranAAllocatableAllocate(%[[VAL_36]], %[[VAL_34]], %[[VAL_35]], %[[VAL_37]], %[[VAL_33]]) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
195+
// CHECK: %[[VAL_38:.*]] = fir.call @_FortranAAllocatableAllocate(%[[VAL_36]], %{{.*}}, %[[VAL_34]], %[[VAL_35]], %[[VAL_37]], %[[VAL_33]]) : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
196196
// CHECK: %[[VAL_39:.*]] = fir.load %[[VAL_13]]#0 : !fir.ref<!fir.class<!fir.heap<!fir.array<?x?x!fir.type<_QMtypesTt>>>>>
197197
// CHECK: %[[VAL_40:.*]] = arith.constant 1 : index
198198
// CHECK: fir.do_loop %[[VAL_41:.*]] = %[[VAL_40]] to %[[EX1]] step %[[VAL_40]] unordered {
@@ -276,7 +276,7 @@ func.func @test_polymorphic_expr(%arg0: !fir.class<!fir.type<_QMtypesTt>> {fir.b
276276
// CHECK: %[[VAL_36:.*]] = fir.absent !fir.box<none>
277277
// CHECK: %[[VAL_37:.*]] = fir.convert %[[VAL_5]] : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x?x!fir.type<_QMtypesTt>>>>>) -> !fir.ref<!fir.box<none>>
278278
// CHECK: %[[VAL_38:.*]] = fir.convert %[[VAL_32]] : (!fir.ref<!fir.char<1,{{.*}}>>) -> !fir.ref<i8>
279-
// CHECK: %[[VAL_39:.*]] = fir.call @_FortranAAllocatableAllocate(%[[VAL_37]], %[[VAL_35]], %[[VAL_36]], %[[VAL_38]], %[[VAL_34]]) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
279+
// CHECK: %[[VAL_39:.*]] = fir.call @_FortranAAllocatableAllocate(%[[VAL_37]], %{{.*}}, %[[VAL_35]], %[[VAL_36]], %[[VAL_38]], %[[VAL_34]]) : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
280280
// CHECK: %[[VAL_40:.*]] = fir.load %[[VAL_14]]#0 : !fir.ref<!fir.class<!fir.heap<!fir.array<?x?x!fir.type<_QMtypesTt>>>>>
281281
// CHECK: %[[VAL_41:.*]] = arith.constant 1 : index
282282
// CHECK: fir.do_loop %[[VAL_42:.*]] = %[[VAL_41]] to %[[VAL_3]] step %[[VAL_41]] unordered {
@@ -329,7 +329,7 @@ func.func @test_polymorphic_expr(%arg0: !fir.class<!fir.type<_QMtypesTt>> {fir.b
329329
// CHECK: %[[VAL_85:.*]] = fir.absent !fir.box<none>
330330
// CHECK: %[[VAL_86:.*]] = fir.convert %[[VAL_4]] : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x?x!fir.type<_QMtypesTt>>>>>) -> !fir.ref<!fir.box<none>>
331331
// CHECK: %[[VAL_87:.*]] = fir.convert %[[VAL_81]] : (!fir.ref<!fir.char<1,{{.*}}>>) -> !fir.ref<i8>
332-
// CHECK: %[[VAL_88:.*]] = fir.call @_FortranAAllocatableAllocate(%[[VAL_86]], %[[VAL_84]], %[[VAL_85]], %[[VAL_87]], %[[VAL_83]]) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
332+
// CHECK: %[[VAL_88:.*]] = fir.call @_FortranAAllocatableAllocate(%[[VAL_86]], %{{.*}}, %[[VAL_84]], %[[VAL_85]], %[[VAL_87]], %[[VAL_83]]) : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
333333
// CHECK: %[[VAL_89:.*]] = fir.load %[[VAL_63]]#0 : !fir.ref<!fir.class<!fir.heap<!fir.array<?x?x!fir.type<_QMtypesTt>>>>>
334334
// CHECK: %[[VAL_90:.*]] = arith.constant 1 : index
335335
// CHECK: fir.do_loop %[[VAL_91:.*]] = %[[VAL_90]] to %[[VAL_3]] step %[[VAL_90]] unordered {

flang/test/Lower/OpenACC/acc-declare.f90

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -469,6 +469,6 @@ subroutine init()
469469
end module
470470

471471
! CHECK-LABEL: func.func @_QMacc_declare_post_action_statPinit()
472-
! CHECK: fir.call @_FortranAAllocatableAllocate({{.*}}) fastmath<contract> {acc.declare_action = #acc.declare_action<postAlloc = @_QMacc_declare_post_action_statEx_acc_declare_update_desc_post_alloc>} : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
472+
! CHECK: fir.call @_FortranAAllocatableAllocate({{.*}}) fastmath<contract> {acc.declare_action = #acc.declare_action<postAlloc = @_QMacc_declare_post_action_statEx_acc_declare_update_desc_post_alloc>} : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
473473
! CHECK: fir.if
474-
! CHECK: fir.call @_FortranAAllocatableAllocate({{.*}}) fastmath<contract> {acc.declare_action = #acc.declare_action<postAlloc = @_QMacc_declare_post_action_statEy_acc_declare_update_desc_post_alloc>} : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
474+
! CHECK: fir.call @_FortranAAllocatableAllocate({{.*}}) fastmath<contract> {acc.declare_action = #acc.declare_action<postAlloc = @_QMacc_declare_post_action_statEy_acc_declare_update_desc_post_alloc>} : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32

0 commit comments

Comments
 (0)