Skip to content

Commit eb5cda4

Browse files
authored
[flang][cuda] cuf.allocate: Carry over stream to the runtime call (#117631)
- Update the runtime entry points to accept stream information - Update the conversion of `cuf.allocate` to correctly pass the stream information when present. Note that the stream is not currently used in the runtime. This will be done in a separate patch, as a design/solution needs to be worked out together with the allocators.
1 parent ae719f0 commit eb5cda4

File tree

4 files changed

+81
-48
lines changed

4 files changed

+81
-48
lines changed

flang/include/flang/Runtime/CUDA/allocatable.h

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -16,23 +16,28 @@ namespace Fortran::runtime::cuda {
1616

1717
extern "C" {
1818

19+
/// Perform allocation of the descriptor.
20+
int RTDECL(CUFAllocatableAllocate)(Descriptor &, int64_t stream = -1,
21+
bool hasStat = false, const Descriptor *errMsg = nullptr,
22+
const char *sourceFile = nullptr, int sourceLine = 0);
23+
1924
/// Perform allocation of the descriptor with synchronization of it when
2025
/// necessary.
21-
int RTDECL(CUFAllocatableAllocate)(Descriptor &, bool hasStat = false,
22-
const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
23-
int sourceLine = 0);
26+
int RTDECL(CUFAllocatableAllocateSync)(Descriptor &, int64_t stream = -1,
27+
bool hasStat = false, const Descriptor *errMsg = nullptr,
28+
const char *sourceFile = nullptr, int sourceLine = 0);
2429

2530
/// Perform allocation of the descriptor without synchronization. Assign data
2631
/// from source.
2732
int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc,
28-
const Descriptor &source, bool hasStat = false,
33+
const Descriptor &source, int64_t stream = -1, bool hasStat = false,
2934
const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
3035
int sourceLine = 0);
3136

3237
/// Perform allocation of the descriptor with synchronization of it when
3338
/// necessary. Assign data from source.
3439
int RTDEF(CUFAllocatableAllocateSourceSync)(Descriptor &alloc,
35-
const Descriptor &source, bool hasStat = false,
40+
const Descriptor &source, int64_t stream = -1, bool hasStat = false,
3641
const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
3742
int sourceLine = 0);
3843

flang/lib/Optimizer/Transforms/CUFOpConversion.cpp

Lines changed: 21 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -158,7 +158,7 @@ static mlir::LogicalResult convertOpToCall(OpTy op,
158158
mlir::Value sourceLine;
159159
if constexpr (std::is_same_v<OpTy, cuf::AllocateOp>)
160160
sourceLine = fir::factory::locationToLineNo(
161-
builder, loc, op.getSource() ? fTy.getInput(5) : fTy.getInput(4));
161+
builder, loc, op.getSource() ? fTy.getInput(6) : fTy.getInput(5));
162162
else
163163
sourceLine = fir::factory::locationToLineNo(builder, loc, fTy.getInput(4));
164164

@@ -174,14 +174,23 @@ static mlir::LogicalResult convertOpToCall(OpTy op,
174174
}
175175
llvm::SmallVector<mlir::Value> args;
176176
if constexpr (std::is_same_v<OpTy, cuf::AllocateOp>) {
177-
if (op.getSource())
177+
if (op.getSource()) {
178+
mlir::Value stream =
179+
op.getStream()
180+
? op.getStream()
181+
: builder.createIntegerConstant(loc, fTy.getInput(2), -1);
178182
args = fir::runtime::createArguments(builder, loc, fTy, op.getBox(),
179-
op.getSource(), hasStat, errmsg,
180-
sourceFile, sourceLine);
181-
else
182-
args =
183-
fir::runtime::createArguments(builder, loc, fTy, op.getBox(), hasStat,
184-
errmsg, sourceFile, sourceLine);
183+
op.getSource(), stream, hasStat,
184+
errmsg, sourceFile, sourceLine);
185+
} else {
186+
mlir::Value stream =
187+
op.getStream()
188+
? op.getStream()
189+
: builder.createIntegerConstant(loc, fTy.getInput(1), -1);
190+
args = fir::runtime::createArguments(builder, loc, fTy, op.getBox(),
191+
stream, hasStat, errmsg, sourceFile,
192+
sourceLine);
193+
}
185194
} else {
186195
args =
187196
fir::runtime::createArguments(builder, loc, fTy, op.getBox(), hasStat,
@@ -199,10 +208,6 @@ struct CUFAllocateOpConversion
199208
mlir::LogicalResult
200209
matchAndRewrite(cuf::AllocateOp op,
201210
mlir::PatternRewriter &rewriter) const override {
202-
// TODO: Allocation using different stream.
203-
if (op.getStream())
204-
return mlir::failure();
205-
206211
// TODO: Pinned is a reference to a logical value that can be set to true
207212
// when pinned allocation succeeds. This will require a new entry point.
208213
if (op.getPinned())
@@ -220,8 +225,9 @@ struct CUFAllocateOpConversion
220225
func = fir::runtime::getRuntimeFunc<mkRTKey(
221226
CUFAllocatableAllocateSourceSync)>(loc, builder);
222227
else
223-
func = fir::runtime::getRuntimeFunc<mkRTKey(CUFAllocatableAllocate)>(
224-
loc, builder);
228+
func =
229+
fir::runtime::getRuntimeFunc<mkRTKey(CUFAllocatableAllocateSync)>(
230+
loc, builder);
225231
return convertOpToCall<cuf::AllocateOp>(op, rewriter, func);
226232
}
227233

@@ -231,10 +237,7 @@ struct CUFAllocateOpConversion
231237
fir::runtime::getRuntimeFunc<mkRTKey(CUFAllocatableAllocateSource)>(
232238
loc, builder);
233239
else
234-
// Allocation for local descriptor falls back on the standard runtime
235-
// AllocatableAllocate as the dedicated allocator is set in the descriptor
236-
// before the call.
237-
func = fir::runtime::getRuntimeFunc<mkRTKey(AllocatableAllocate)>(
240+
func = fir::runtime::getRuntimeFunc<mkRTKey(CUFAllocatableAllocate)>(
238241
loc, builder);
239242

240243
return convertOpToCall<cuf::AllocateOp>(op, rewriter, func);

flang/runtime/CUDA/allocatable.cpp

Lines changed: 29 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -22,18 +22,11 @@ namespace Fortran::runtime::cuda {
2222
extern "C" {
2323
RT_EXT_API_GROUP_BEGIN
2424

25-
int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, bool hasStat,
26-
const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
27-
if (desc.HasAddendum()) {
28-
Terminator terminator{sourceFile, sourceLine};
29-
// TODO: This require a bit more work to set the correct type descriptor
30-
// address
31-
terminator.Crash(
32-
"not yet implemented: CUDA descriptor allocation with addendum");
33-
}
34-
// Perform the standard allocation.
35-
int stat{RTNAME(AllocatableAllocate)(
36-
desc, hasStat, errMsg, sourceFile, sourceLine)};
25+
int RTDEF(CUFAllocatableAllocateSync)(Descriptor &desc, int64_t stream,
26+
bool hasStat, const Descriptor *errMsg, const char *sourceFile,
27+
int sourceLine) {
28+
int stat{RTNAME(CUFAllocatableAllocate)(
29+
desc, stream, hasStat, errMsg, sourceFile, sourceLine)};
3730
#ifndef RT_DEVICE_COMPILATION
3831
// Descriptor synchronization is only done when the allocation is done
3932
// from the host.
@@ -47,11 +40,27 @@ int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, bool hasStat,
4740
return stat;
4841
}
4942

50-
int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc,
51-
const Descriptor &source, bool hasStat, const Descriptor *errMsg,
52-
const char *sourceFile, int sourceLine) {
43+
int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, int64_t stream,
44+
bool hasStat, const Descriptor *errMsg, const char *sourceFile,
45+
int sourceLine) {
46+
if (desc.HasAddendum()) {
47+
Terminator terminator{sourceFile, sourceLine};
48+
// TODO: This require a bit more work to set the correct type descriptor
49+
// address
50+
terminator.Crash(
51+
"not yet implemented: CUDA descriptor allocation with addendum");
52+
}
53+
// Perform the standard allocation.
5354
int stat{RTNAME(AllocatableAllocate)(
54-
alloc, hasStat, errMsg, sourceFile, sourceLine)};
55+
desc, hasStat, errMsg, sourceFile, sourceLine)};
56+
return stat;
57+
}
58+
59+
int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc,
60+
const Descriptor &source, int64_t stream, bool hasStat,
61+
const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
62+
int stat{RTNAME(CUFAllocatableAllocate)(
63+
alloc, stream, hasStat, errMsg, sourceFile, sourceLine)};
5564
if (stat == StatOk) {
5665
Terminator terminator{sourceFile, sourceLine};
5766
Fortran::runtime::DoFromSourceAssign(
@@ -61,10 +70,10 @@ int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc,
6170
}
6271

6372
int RTDEF(CUFAllocatableAllocateSourceSync)(Descriptor &alloc,
64-
const Descriptor &source, bool hasStat, const Descriptor *errMsg,
65-
const char *sourceFile, int sourceLine) {
66-
int stat{RTNAME(AllocatableAllocate)(
67-
alloc, hasStat, errMsg, sourceFile, sourceLine)};
73+
const Descriptor &source, int64_t stream, bool hasStat,
74+
const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
75+
int stat{RTNAME(CUFAllocatableAllocateSync)(
76+
alloc, stream, hasStat, errMsg, sourceFile, sourceLine)};
6877
if (stat == StatOk) {
6978
Terminator terminator{sourceFile, sourceLine};
7079
Fortran::runtime::DoFromSourceAssign(

flang/test/Fir/CUDA/cuda-allocate.fir

Lines changed: 21 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ func.func @_QPsub1() {
1919
// CHECK: %[[DESC:.*]] = fir.convert %[[DESC_RT_CALL]] : (!fir.ref<!fir.box<none>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
2020
// CHECK: %[[DECL_DESC:.*]]:2 = hlfir.declare %[[DESC]] {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
2121
// CHECK: %[[BOX_NONE:.*]] = fir.convert %[[DECL_DESC]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
22-
// CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
22+
// CHECK: %{{.*}} = fir.call @_FortranACUFAllocatableAllocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
2323

2424
// CHECK: %[[BOX_NONE:.*]] = fir.convert %[[DECL_DESC]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
2525
// CHECK: %{{.*}} = fir.call @_FortranAAllocatableDeallocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
@@ -47,7 +47,7 @@ func.func @_QPsub3() {
4747
// CHECK: %[[A:.*]]:2 = hlfir.declare %[[A_ADDR]] {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMmod1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
4848

4949
// CHECK: %[[A_BOX:.*]] = fir.convert %[[A]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
50-
// CHECK: fir.call @_FortranACUFAllocatableAllocate(%[[A_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
50+
// CHECK: fir.call @_FortranACUFAllocatableAllocateSync(%[[A_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
5151

5252
// CHECK: %[[A_BOX:.*]] = fir.convert %[[A]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
5353
// CHECK: fir.call @_FortranACUFAllocatableDeallocate(%[[A_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
@@ -87,7 +87,7 @@ func.func @_QPsub5() {
8787
}
8888

8989
// CHECK-LABEL: func.func @_QPsub5()
90-
// CHECK: fir.call @_FortranAAllocatableAllocate({{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
90+
// CHECK: fir.call @_FortranACUFAllocatableAllocate({{.*}}) : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
9191
// CHECK: fir.call @_FortranAAllocatableDeallocate({{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
9292

9393

@@ -118,7 +118,7 @@ func.func @_QQsub6() attributes {fir.bindc_name = "test"} {
118118
// CHECK: %[[B:.*]]:2 = hlfir.declare %[[B_ADDR]] {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMdataEb"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>)
119119
// CHECK: _FortranAAllocatableSetBounds
120120
// CHECK: %[[B_BOX:.*]] = fir.convert %[[B]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> !fir.ref<!fir.box<none>>
121-
// CHECK: fir.call @_FortranACUFAllocatableAllocate(%[[B_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
121+
// CHECK: fir.call @_FortranACUFAllocatableAllocateSync(%[[B_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
122122

123123

124124
func.func @_QPallocate_source() {
@@ -142,7 +142,7 @@ func.func @_QPallocate_source() {
142142
// CHECK: %[[SOURCE:.*]] = fir.load %[[DECL_HOST]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>
143143
// CHECK: %[[DEV_CONV:.*]] = fir.convert %[[DECL_DEV]] : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>) -> !fir.ref<!fir.box<none>>
144144
// CHECK: %[[SOURCE_CONV:.*]] = fir.convert %[[SOURCE]] : (!fir.box<!fir.heap<!fir.array<?x?xf32>>>) -> !fir.box<none>
145-
// CHECK: %{{.*}} = fir.call @_FortranACUFAllocatableAllocateSource(%[[DEV_CONV]], %[[SOURCE_CONV]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.box<none>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
145+
// CHECK: %{{.*}} = fir.call @_FortranACUFAllocatableAllocateSource(%[[DEV_CONV]], %[[SOURCE_CONV]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.box<none>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
146146

147147

148148
fir.global @_QMmod1Ea_d {data_attr = #cuf.cuda<device>} : !fir.box<!fir.heap<!fir.array<?x?xf32>>> {
@@ -165,4 +165,20 @@ func.func @_QMmod1Pallocate_source_global() {
165165
// CHECK-LABEL: func.func @_QMmod1Pallocate_source_global()
166166
// CHECK: fir.call @_FortranACUFAllocatableAllocateSourceSync
167167

168+
func.func @_QQallocate_stream() {
169+
%0 = cuf.alloc !fir.box<!fir.heap<!fir.array<?xi32>>> {bindc_name = "a", data_attr = #cuf.cuda<device>, uniq_name = "_QFEa"} -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
170+
%1 = fir.declare %0 {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFEa"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
171+
%2 = fir.alloca i64 {bindc_name = "stream1", uniq_name = "_QFEstream1"}
172+
%3 = fir.declare %2 {uniq_name = "_QFEstream1"} : (!fir.ref<i64>) -> !fir.ref<i64>
173+
%4 = fir.load %3 : !fir.ref<i64>
174+
%5 = cuf.allocate %1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> stream(%4 : i64) {data_attr = #cuf.cuda<device>} -> i32
175+
return
176+
}
177+
178+
// CHECK-LABEL: func.func @_QQallocate_stream()
179+
// CHECK: %[[STREAM_ALLOCA:.*]] = fir.alloca i64 {bindc_name = "stream1", uniq_name = "_QFEstream1"}
180+
// CHECK: %[[STREAM:.*]] = fir.declare %[[STREAM_ALLOCA]] {uniq_name = "_QFEstream1"} : (!fir.ref<i64>) -> !fir.ref<i64>
181+
// CHECK: %[[STREAM_LOAD:.*]] = fir.load %[[STREAM]] : !fir.ref<i64>
182+
// CHECK: fir.call @_FortranACUFAllocatableAllocate(%{{.*}}, %[[STREAM_LOAD]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
183+
168184
} // end of module

0 commit comments

Comments
 (0)