-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[flang][cuda] cuf.allocate: Carry over stream to the runtime call #117631
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-flang-runtime Author: Valentin Clement (バレンタイン クレメン) (clementval) Changes
Note that the stream is not currently used in the runtime. This will be done in a separate patch as a design/solution needs to be down together with the allocators. Full diff: https://github.com/llvm/llvm-project/pull/117631.diff 4 Files Affected:
diff --git a/flang/include/flang/Runtime/CUDA/allocatable.h b/flang/include/flang/Runtime/CUDA/allocatable.h
index be18b2b705bbcd..0ef8a142ee7ccb 100644
--- a/flang/include/flang/Runtime/CUDA/allocatable.h
+++ b/flang/include/flang/Runtime/CUDA/allocatable.h
@@ -16,23 +16,28 @@ namespace Fortran::runtime::cuda {
extern "C" {
+/// Perform allocation of the descriptor.
+int RTDECL(CUFAllocatableAllocate)(Descriptor &, long stream = -1,
+ bool hasStat = false, const Descriptor *errMsg = nullptr,
+ const char *sourceFile = nullptr, int sourceLine = 0);
+
/// Perform allocation of the descriptor with synchronization of it when
/// necessary.
-int RTDECL(CUFAllocatableAllocate)(Descriptor &, bool hasStat = false,
- const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
- int sourceLine = 0);
+int RTDECL(CUFAllocatableAllocateSync)(Descriptor &, long stream = -1,
+ bool hasStat = false, const Descriptor *errMsg = nullptr,
+ const char *sourceFile = nullptr, int sourceLine = 0);
/// Perform allocation of the descriptor without synchronization. Assign data
/// from source.
int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc,
- const Descriptor &source, bool hasStat = false,
+ const Descriptor &source, long stream = -1, bool hasStat = false,
const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
int sourceLine = 0);
/// Perform allocation of the descriptor with synchronization of it when
/// necessary. Assign data from source.
int RTDEF(CUFAllocatableAllocateSourceSync)(Descriptor &alloc,
- const Descriptor &source, bool hasStat = false,
+ const Descriptor &source, long stream = -1, bool hasStat = false,
const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
int sourceLine = 0);
diff --git a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
index 3983336516db9e..5056c48c91cfaa 100644
--- a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
@@ -158,7 +158,7 @@ static mlir::LogicalResult convertOpToCall(OpTy op,
mlir::Value sourceLine;
if constexpr (std::is_same_v<OpTy, cuf::AllocateOp>)
sourceLine = fir::factory::locationToLineNo(
- builder, loc, op.getSource() ? fTy.getInput(5) : fTy.getInput(4));
+ builder, loc, op.getSource() ? fTy.getInput(6) : fTy.getInput(5));
else
sourceLine = fir::factory::locationToLineNo(builder, loc, fTy.getInput(4));
@@ -174,14 +174,23 @@ static mlir::LogicalResult convertOpToCall(OpTy op,
}
llvm::SmallVector<mlir::Value> args;
if constexpr (std::is_same_v<OpTy, cuf::AllocateOp>) {
- if (op.getSource())
+ if (op.getSource()) {
+ mlir::Value stream =
+ op.getStream()
+ ? op.getStream()
+ : builder.createIntegerConstant(loc, fTy.getInput(2), -1);
args = fir::runtime::createArguments(builder, loc, fTy, op.getBox(),
- op.getSource(), hasStat, errmsg,
- sourceFile, sourceLine);
- else
- args =
- fir::runtime::createArguments(builder, loc, fTy, op.getBox(), hasStat,
- errmsg, sourceFile, sourceLine);
+ op.getSource(), stream, hasStat,
+ errmsg, sourceFile, sourceLine);
+ } else {
+ mlir::Value stream =
+ op.getStream()
+ ? op.getStream()
+ : builder.createIntegerConstant(loc, fTy.getInput(1), -1);
+ args = fir::runtime::createArguments(builder, loc, fTy, op.getBox(),
+ stream, hasStat, errmsg, sourceFile,
+ sourceLine);
+ }
} else {
args =
fir::runtime::createArguments(builder, loc, fTy, op.getBox(), hasStat,
@@ -199,10 +208,6 @@ struct CUFAllocateOpConversion
mlir::LogicalResult
matchAndRewrite(cuf::AllocateOp op,
mlir::PatternRewriter &rewriter) const override {
- // TODO: Allocation using different stream.
- if (op.getStream())
- return mlir::failure();
-
// TODO: Pinned is a reference to a logical value that can be set to true
// when pinned allocation succeed. This will require a new entry point.
if (op.getPinned())
@@ -220,8 +225,9 @@ struct CUFAllocateOpConversion
func = fir::runtime::getRuntimeFunc<mkRTKey(
CUFAllocatableAllocateSourceSync)>(loc, builder);
else
- func = fir::runtime::getRuntimeFunc<mkRTKey(CUFAllocatableAllocate)>(
- loc, builder);
+ func =
+ fir::runtime::getRuntimeFunc<mkRTKey(CUFAllocatableAllocateSync)>(
+ loc, builder);
return convertOpToCall<cuf::AllocateOp>(op, rewriter, func);
}
@@ -231,10 +237,7 @@ struct CUFAllocateOpConversion
fir::runtime::getRuntimeFunc<mkRTKey(CUFAllocatableAllocateSource)>(
loc, builder);
else
- // Allocation for local descriptor falls back on the standard runtime
- // AllocatableAllocate as the dedicated allocator is set in the descriptor
- // before the call.
- func = fir::runtime::getRuntimeFunc<mkRTKey(AllocatableAllocate)>(
+ func = fir::runtime::getRuntimeFunc<mkRTKey(CUFAllocatableAllocate)>(
loc, builder);
return convertOpToCall<cuf::AllocateOp>(op, rewriter, func);
diff --git a/flang/runtime/CUDA/allocatable.cpp b/flang/runtime/CUDA/allocatable.cpp
index 9fed50c859a9cf..acd7bd3fe77d5e 100644
--- a/flang/runtime/CUDA/allocatable.cpp
+++ b/flang/runtime/CUDA/allocatable.cpp
@@ -22,18 +22,10 @@ namespace Fortran::runtime::cuda {
extern "C" {
RT_EXT_API_GROUP_BEGIN
-int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, bool hasStat,
+int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, long stream, bool hasStat,
const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
- if (desc.HasAddendum()) {
- Terminator terminator{sourceFile, sourceLine};
- // TODO: This require a bit more work to set the correct type descriptor
- // address
- terminator.Crash(
- "not yet implemented: CUDA descriptor allocation with addendum");
- }
- // Perform the standard allocation.
- int stat{RTNAME(AllocatableAllocate)(
- desc, hasStat, errMsg, sourceFile, sourceLine)};
+ int stat{RTNAME(CUFAllocatableAllocate)(
+ desc, stream, hasStat, errMsg, sourceFile, sourceLine)};
#ifndef RT_DEVICE_COMPILATION
// Descriptor synchronization is only done when the allocation is done
// from the host.
@@ -47,9 +39,25 @@ int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, bool hasStat,
return stat;
}
+int RTDEF(CUFAllocatableAllocateSync)(Descriptor &desc, long stream,
+ bool hasStat, const Descriptor *errMsg, const char *sourceFile,
+ int sourceLine) {
+ if (desc.HasAddendum()) {
+ Terminator terminator{sourceFile, sourceLine};
+ // TODO: This require a bit more work to set the correct type descriptor
+ // address
+ terminator.Crash(
+ "not yet implemented: CUDA descriptor allocation with addendum");
+ }
+ // Perform the standard allocation.
+ int stat{RTNAME(AllocatableAllocate)(
+ desc, hasStat, errMsg, sourceFile, sourceLine)};
+ return stat;
+}
+
int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc,
- const Descriptor &source, bool hasStat, const Descriptor *errMsg,
- const char *sourceFile, int sourceLine) {
+ const Descriptor &source, long stream, bool hasStat,
+ const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
int stat{RTNAME(AllocatableAllocate)(
alloc, hasStat, errMsg, sourceFile, sourceLine)};
if (stat == StatOk) {
@@ -61,8 +69,8 @@ int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc,
}
int RTDEF(CUFAllocatableAllocateSourceSync)(Descriptor &alloc,
- const Descriptor &source, bool hasStat, const Descriptor *errMsg,
- const char *sourceFile, int sourceLine) {
+ const Descriptor &source, long stream, bool hasStat,
+ const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
int stat{RTNAME(AllocatableAllocate)(
alloc, hasStat, errMsg, sourceFile, sourceLine)};
if (stat == StatOk) {
diff --git a/flang/test/Fir/CUDA/cuda-allocate.fir b/flang/test/Fir/CUDA/cuda-allocate.fir
index 47d75b16b7a2d2..9b87c7546d1e9c 100644
--- a/flang/test/Fir/CUDA/cuda-allocate.fir
+++ b/flang/test/Fir/CUDA/cuda-allocate.fir
@@ -19,7 +19,7 @@ func.func @_QPsub1() {
// CHECK: %[[DESC:.*]] = fir.convert %[[DESC_RT_CALL]] : (!fir.ref<!fir.box<none>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
// CHECK: %[[DECL_DESC:.*]]:2 = hlfir.declare %[[DESC]] {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
// CHECK: %[[BOX_NONE:.*]] = fir.convert %[[DECL_DESC]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
-// CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: %{{.*}} = fir.call @_FortranACUFAllocatableAllocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
// CHECK: %[[BOX_NONE:.*]] = fir.convert %[[DECL_DESC]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
// CHECK: %{{.*}} = fir.call @_FortranAAllocatableDeallocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
@@ -47,7 +47,7 @@ func.func @_QPsub3() {
// CHECK: %[[A:.*]]:2 = hlfir.declare %[[A_ADDR]] {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMmod1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
// CHECK: %[[A_BOX:.*]] = fir.convert %[[A]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
-// CHECK: fir.call @_FortranACUFAllocatableAllocate(%[[A_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: fir.call @_FortranACUFAllocatableAllocateSync(%[[A_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
// CHECK: %[[A_BOX:.*]] = fir.convert %[[A]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
// CHECK: fir.call @_FortranACUFAllocatableDeallocate(%[[A_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
@@ -87,7 +87,7 @@ func.func @_QPsub5() {
}
// CHECK-LABEL: func.func @_QPsub5()
-// CHECK: fir.call @_FortranAAllocatableAllocate({{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: fir.call @_FortranACUFAllocatableAllocate({{.*}}) : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
// CHECK: fir.call @_FortranAAllocatableDeallocate({{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
@@ -118,7 +118,7 @@ func.func @_QQsub6() attributes {fir.bindc_name = "test"} {
// CHECK: %[[B:.*]]:2 = hlfir.declare %[[B_ADDR]] {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMdataEb"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>)
// CHECK: _FortranAAllocatableSetBounds
// CHECK: %[[B_BOX:.*]] = fir.convert %[[B]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> !fir.ref<!fir.box<none>>
-// CHECK: fir.call @_FortranACUFAllocatableAllocate(%[[B_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: fir.call @_FortranACUFAllocatableAllocateSync(%[[B_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
func.func @_QPallocate_source() {
@@ -142,7 +142,7 @@ func.func @_QPallocate_source() {
// CHECK: %[[SOURCE:.*]] = fir.load %[[DECL_HOST]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>
// CHECK: %[[DEV_CONV:.*]] = fir.convert %[[DECL_DEV]] : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>) -> !fir.ref<!fir.box<none>>
// CHECK: %[[SOURCE_CONV:.*]] = fir.convert %[[SOURCE]] : (!fir.box<!fir.heap<!fir.array<?x?xf32>>>) -> !fir.box<none>
-// CHECK: %{{.*}} = fir.call @_FortranACUFAllocatableAllocateSource(%[[DEV_CONV]], %[[SOURCE_CONV]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.box<none>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: %{{.*}} = fir.call @_FortranACUFAllocatableAllocateSource(%[[DEV_CONV]], %[[SOURCE_CONV]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.box<none>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
fir.global @_QMmod1Ea_d {data_attr = #cuf.cuda<device>} : !fir.box<!fir.heap<!fir.array<?x?xf32>>> {
@@ -165,4 +165,20 @@ func.func @_QMmod1Pallocate_source_global() {
// CHECK-LABEL: func.func @_QMmod1Pallocate_source_global()
// CHECK: fir.call @_FortranACUFAllocatableAllocateSourceSync
+func.func @_QQallocate_stream() {
+ %0 = cuf.alloc !fir.box<!fir.heap<!fir.array<?xi32>>> {bindc_name = "a", data_attr = #cuf.cuda<device>, uniq_name = "_QFEa"} -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+ %1 = fir.declare %0 {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFEa"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+ %2 = fir.alloca i64 {bindc_name = "stream1", uniq_name = "_QFEstream1"}
+ %3 = fir.declare %2 {uniq_name = "_QFEstream1"} : (!fir.ref<i64>) -> !fir.ref<i64>
+ %4 = fir.load %3 : !fir.ref<i64>
+ %5 = cuf.allocate %1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> stream(%4 : i64) {data_attr = #cuf.cuda<device>} -> i32
+ return
+}
+
+// CHECK-LABEL: func.func @_QQallocate_stream()
+// CHECK: %[[STREAM_ALLOCA:.*]] = fir.alloca i64 {bindc_name = "stream1", uniq_name = "_QFEstream1"}
+// CHECK: %[[STREAM:.*]] = fir.declare %[[STREAM_ALLOCA]] {uniq_name = "_QFEstream1"} : (!fir.ref<i64>) -> !fir.ref<i64>
+// CHECK: %[[STREAM_LOAD:.*]] = fir.load %[[STREAM]] : !fir.ref<i64>
+// CHECK: fir.call @_FortranACUFAllocatableAllocate(%{{.*}}, %[[STREAM_LOAD]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+
} // end of module
|
@llvm/pr-subscribers-flang-fir-hlfir Author: Valentin Clement (バレンタイン クレメン) (clementval) Changes
Note that the stream is not currently used in the runtime. This will be done in a separate patch as a design/solution needs to be down together with the allocators. Full diff: https://github.com/llvm/llvm-project/pull/117631.diff 4 Files Affected:
diff --git a/flang/include/flang/Runtime/CUDA/allocatable.h b/flang/include/flang/Runtime/CUDA/allocatable.h
index be18b2b705bbcd..0ef8a142ee7ccb 100644
--- a/flang/include/flang/Runtime/CUDA/allocatable.h
+++ b/flang/include/flang/Runtime/CUDA/allocatable.h
@@ -16,23 +16,28 @@ namespace Fortran::runtime::cuda {
extern "C" {
+/// Perform allocation of the descriptor.
+int RTDECL(CUFAllocatableAllocate)(Descriptor &, long stream = -1,
+ bool hasStat = false, const Descriptor *errMsg = nullptr,
+ const char *sourceFile = nullptr, int sourceLine = 0);
+
/// Perform allocation of the descriptor with synchronization of it when
/// necessary.
-int RTDECL(CUFAllocatableAllocate)(Descriptor &, bool hasStat = false,
- const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
- int sourceLine = 0);
+int RTDECL(CUFAllocatableAllocateSync)(Descriptor &, long stream = -1,
+ bool hasStat = false, const Descriptor *errMsg = nullptr,
+ const char *sourceFile = nullptr, int sourceLine = 0);
/// Perform allocation of the descriptor without synchronization. Assign data
/// from source.
int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc,
- const Descriptor &source, bool hasStat = false,
+ const Descriptor &source, long stream = -1, bool hasStat = false,
const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
int sourceLine = 0);
/// Perform allocation of the descriptor with synchronization of it when
/// necessary. Assign data from source.
int RTDEF(CUFAllocatableAllocateSourceSync)(Descriptor &alloc,
- const Descriptor &source, bool hasStat = false,
+ const Descriptor &source, long stream = -1, bool hasStat = false,
const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
int sourceLine = 0);
diff --git a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
index 3983336516db9e..5056c48c91cfaa 100644
--- a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
@@ -158,7 +158,7 @@ static mlir::LogicalResult convertOpToCall(OpTy op,
mlir::Value sourceLine;
if constexpr (std::is_same_v<OpTy, cuf::AllocateOp>)
sourceLine = fir::factory::locationToLineNo(
- builder, loc, op.getSource() ? fTy.getInput(5) : fTy.getInput(4));
+ builder, loc, op.getSource() ? fTy.getInput(6) : fTy.getInput(5));
else
sourceLine = fir::factory::locationToLineNo(builder, loc, fTy.getInput(4));
@@ -174,14 +174,23 @@ static mlir::LogicalResult convertOpToCall(OpTy op,
}
llvm::SmallVector<mlir::Value> args;
if constexpr (std::is_same_v<OpTy, cuf::AllocateOp>) {
- if (op.getSource())
+ if (op.getSource()) {
+ mlir::Value stream =
+ op.getStream()
+ ? op.getStream()
+ : builder.createIntegerConstant(loc, fTy.getInput(2), -1);
args = fir::runtime::createArguments(builder, loc, fTy, op.getBox(),
- op.getSource(), hasStat, errmsg,
- sourceFile, sourceLine);
- else
- args =
- fir::runtime::createArguments(builder, loc, fTy, op.getBox(), hasStat,
- errmsg, sourceFile, sourceLine);
+ op.getSource(), stream, hasStat,
+ errmsg, sourceFile, sourceLine);
+ } else {
+ mlir::Value stream =
+ op.getStream()
+ ? op.getStream()
+ : builder.createIntegerConstant(loc, fTy.getInput(1), -1);
+ args = fir::runtime::createArguments(builder, loc, fTy, op.getBox(),
+ stream, hasStat, errmsg, sourceFile,
+ sourceLine);
+ }
} else {
args =
fir::runtime::createArguments(builder, loc, fTy, op.getBox(), hasStat,
@@ -199,10 +208,6 @@ struct CUFAllocateOpConversion
mlir::LogicalResult
matchAndRewrite(cuf::AllocateOp op,
mlir::PatternRewriter &rewriter) const override {
- // TODO: Allocation using different stream.
- if (op.getStream())
- return mlir::failure();
-
// TODO: Pinned is a reference to a logical value that can be set to true
// when pinned allocation succeed. This will require a new entry point.
if (op.getPinned())
@@ -220,8 +225,9 @@ struct CUFAllocateOpConversion
func = fir::runtime::getRuntimeFunc<mkRTKey(
CUFAllocatableAllocateSourceSync)>(loc, builder);
else
- func = fir::runtime::getRuntimeFunc<mkRTKey(CUFAllocatableAllocate)>(
- loc, builder);
+ func =
+ fir::runtime::getRuntimeFunc<mkRTKey(CUFAllocatableAllocateSync)>(
+ loc, builder);
return convertOpToCall<cuf::AllocateOp>(op, rewriter, func);
}
@@ -231,10 +237,7 @@ struct CUFAllocateOpConversion
fir::runtime::getRuntimeFunc<mkRTKey(CUFAllocatableAllocateSource)>(
loc, builder);
else
- // Allocation for local descriptor falls back on the standard runtime
- // AllocatableAllocate as the dedicated allocator is set in the descriptor
- // before the call.
- func = fir::runtime::getRuntimeFunc<mkRTKey(AllocatableAllocate)>(
+ func = fir::runtime::getRuntimeFunc<mkRTKey(CUFAllocatableAllocate)>(
loc, builder);
return convertOpToCall<cuf::AllocateOp>(op, rewriter, func);
diff --git a/flang/runtime/CUDA/allocatable.cpp b/flang/runtime/CUDA/allocatable.cpp
index 9fed50c859a9cf..acd7bd3fe77d5e 100644
--- a/flang/runtime/CUDA/allocatable.cpp
+++ b/flang/runtime/CUDA/allocatable.cpp
@@ -22,18 +22,10 @@ namespace Fortran::runtime::cuda {
extern "C" {
RT_EXT_API_GROUP_BEGIN
-int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, bool hasStat,
+int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, long stream, bool hasStat,
const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
- if (desc.HasAddendum()) {
- Terminator terminator{sourceFile, sourceLine};
- // TODO: This require a bit more work to set the correct type descriptor
- // address
- terminator.Crash(
- "not yet implemented: CUDA descriptor allocation with addendum");
- }
- // Perform the standard allocation.
- int stat{RTNAME(AllocatableAllocate)(
- desc, hasStat, errMsg, sourceFile, sourceLine)};
+ int stat{RTNAME(CUFAllocatableAllocate)(
+ desc, stream, hasStat, errMsg, sourceFile, sourceLine)};
#ifndef RT_DEVICE_COMPILATION
// Descriptor synchronization is only done when the allocation is done
// from the host.
@@ -47,9 +39,25 @@ int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, bool hasStat,
return stat;
}
+int RTDEF(CUFAllocatableAllocateSync)(Descriptor &desc, long stream,
+ bool hasStat, const Descriptor *errMsg, const char *sourceFile,
+ int sourceLine) {
+ if (desc.HasAddendum()) {
+ Terminator terminator{sourceFile, sourceLine};
+ // TODO: This require a bit more work to set the correct type descriptor
+ // address
+ terminator.Crash(
+ "not yet implemented: CUDA descriptor allocation with addendum");
+ }
+ // Perform the standard allocation.
+ int stat{RTNAME(AllocatableAllocate)(
+ desc, hasStat, errMsg, sourceFile, sourceLine)};
+ return stat;
+}
+
int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc,
- const Descriptor &source, bool hasStat, const Descriptor *errMsg,
- const char *sourceFile, int sourceLine) {
+ const Descriptor &source, long stream, bool hasStat,
+ const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
int stat{RTNAME(AllocatableAllocate)(
alloc, hasStat, errMsg, sourceFile, sourceLine)};
if (stat == StatOk) {
@@ -61,8 +69,8 @@ int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc,
}
int RTDEF(CUFAllocatableAllocateSourceSync)(Descriptor &alloc,
- const Descriptor &source, bool hasStat, const Descriptor *errMsg,
- const char *sourceFile, int sourceLine) {
+ const Descriptor &source, long stream, bool hasStat,
+ const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
int stat{RTNAME(AllocatableAllocate)(
alloc, hasStat, errMsg, sourceFile, sourceLine)};
if (stat == StatOk) {
diff --git a/flang/test/Fir/CUDA/cuda-allocate.fir b/flang/test/Fir/CUDA/cuda-allocate.fir
index 47d75b16b7a2d2..9b87c7546d1e9c 100644
--- a/flang/test/Fir/CUDA/cuda-allocate.fir
+++ b/flang/test/Fir/CUDA/cuda-allocate.fir
@@ -19,7 +19,7 @@ func.func @_QPsub1() {
// CHECK: %[[DESC:.*]] = fir.convert %[[DESC_RT_CALL]] : (!fir.ref<!fir.box<none>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
// CHECK: %[[DECL_DESC:.*]]:2 = hlfir.declare %[[DESC]] {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
// CHECK: %[[BOX_NONE:.*]] = fir.convert %[[DECL_DESC]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
-// CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: %{{.*}} = fir.call @_FortranACUFAllocatableAllocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
// CHECK: %[[BOX_NONE:.*]] = fir.convert %[[DECL_DESC]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
// CHECK: %{{.*}} = fir.call @_FortranAAllocatableDeallocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
@@ -47,7 +47,7 @@ func.func @_QPsub3() {
// CHECK: %[[A:.*]]:2 = hlfir.declare %[[A_ADDR]] {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMmod1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
// CHECK: %[[A_BOX:.*]] = fir.convert %[[A]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
-// CHECK: fir.call @_FortranACUFAllocatableAllocate(%[[A_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: fir.call @_FortranACUFAllocatableAllocateSync(%[[A_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
// CHECK: %[[A_BOX:.*]] = fir.convert %[[A]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
// CHECK: fir.call @_FortranACUFAllocatableDeallocate(%[[A_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
@@ -87,7 +87,7 @@ func.func @_QPsub5() {
}
// CHECK-LABEL: func.func @_QPsub5()
-// CHECK: fir.call @_FortranAAllocatableAllocate({{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: fir.call @_FortranACUFAllocatableAllocate({{.*}}) : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
// CHECK: fir.call @_FortranAAllocatableDeallocate({{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
@@ -118,7 +118,7 @@ func.func @_QQsub6() attributes {fir.bindc_name = "test"} {
// CHECK: %[[B:.*]]:2 = hlfir.declare %[[B_ADDR]] {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMdataEb"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>)
// CHECK: _FortranAAllocatableSetBounds
// CHECK: %[[B_BOX:.*]] = fir.convert %[[B]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> !fir.ref<!fir.box<none>>
-// CHECK: fir.call @_FortranACUFAllocatableAllocate(%[[B_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: fir.call @_FortranACUFAllocatableAllocateSync(%[[B_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
func.func @_QPallocate_source() {
@@ -142,7 +142,7 @@ func.func @_QPallocate_source() {
// CHECK: %[[SOURCE:.*]] = fir.load %[[DECL_HOST]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>
// CHECK: %[[DEV_CONV:.*]] = fir.convert %[[DECL_DEV]] : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>) -> !fir.ref<!fir.box<none>>
// CHECK: %[[SOURCE_CONV:.*]] = fir.convert %[[SOURCE]] : (!fir.box<!fir.heap<!fir.array<?x?xf32>>>) -> !fir.box<none>
-// CHECK: %{{.*}} = fir.call @_FortranACUFAllocatableAllocateSource(%[[DEV_CONV]], %[[SOURCE_CONV]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.box<none>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: %{{.*}} = fir.call @_FortranACUFAllocatableAllocateSource(%[[DEV_CONV]], %[[SOURCE_CONV]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.box<none>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
fir.global @_QMmod1Ea_d {data_attr = #cuf.cuda<device>} : !fir.box<!fir.heap<!fir.array<?x?xf32>>> {
@@ -165,4 +165,20 @@ func.func @_QMmod1Pallocate_source_global() {
// CHECK-LABEL: func.func @_QMmod1Pallocate_source_global()
// CHECK: fir.call @_FortranACUFAllocatableAllocateSourceSync
+func.func @_QQallocate_stream() {
+ %0 = cuf.alloc !fir.box<!fir.heap<!fir.array<?xi32>>> {bindc_name = "a", data_attr = #cuf.cuda<device>, uniq_name = "_QFEa"} -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+ %1 = fir.declare %0 {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFEa"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+ %2 = fir.alloca i64 {bindc_name = "stream1", uniq_name = "_QFEstream1"}
+ %3 = fir.declare %2 {uniq_name = "_QFEstream1"} : (!fir.ref<i64>) -> !fir.ref<i64>
+ %4 = fir.load %3 : !fir.ref<i64>
+ %5 = cuf.allocate %1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> stream(%4 : i64) {data_attr = #cuf.cuda<device>} -> i32
+ return
+}
+
+// CHECK-LABEL: func.func @_QQallocate_stream()
+// CHECK: %[[STREAM_ALLOCA:.*]] = fir.alloca i64 {bindc_name = "stream1", uniq_name = "_QFEstream1"}
+// CHECK: %[[STREAM:.*]] = fir.declare %[[STREAM_ALLOCA]] {uniq_name = "_QFEstream1"} : (!fir.ref<i64>) -> !fir.ref<i64>
+// CHECK: %[[STREAM_LOAD:.*]] = fir.load %[[STREAM]] : !fir.ref<i64>
+// CHECK: fir.call @_FortranACUFAllocatableAllocate(%{{.*}}, %[[STREAM_LOAD]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+
} // end of module
|
cuf.allocate
to pass correctly the stream information when present.Note that the stream is not currently used in the runtime. This will be done in a separate patch as a design/solution needs to be down together with the allocators.