[flang][cuda] cuf.allocate: Carry over stream to the runtime call #117631

clementval · 2024-11-25T21:02:47Z

Update the runtime entry points to accept a stream information
Update the conversion of cuf.allocate to pass correctly the stream information when present.

Note that the stream is not currently used in the runtime. This will be done in a separate patch as a design/solution needs to be down together with the allocators.

llvmbot · 2024-11-25T21:03:25Z

@llvm/pr-subscribers-flang-runtime

Author: Valentin Clement (バレンタインクレメン) (clementval)

Changes

Update the runtime entry points to accept a stream information
Update the conversion of cuf.allocate to pass correctly the stream information when present.

Note that the stream is not currently used in the runtime. This will be done in a separate patch as a design/solution needs to be down together with the allocators.

Full diff: https://github.com/llvm/llvm-project/pull/117631.diff

4 Files Affected:

(modified) flang/include/flang/Runtime/CUDA/allocatable.h (+10-5)
(modified) flang/lib/Optimizer/Transforms/CUFOpConversion.cpp (+21-18)
(modified) flang/runtime/CUDA/allocatable.cpp (+23-15)
(modified) flang/test/Fir/CUDA/cuda-allocate.fir (+21-5)

diff --git a/flang/include/flang/Runtime/CUDA/allocatable.h b/flang/include/flang/Runtime/CUDA/allocatable.h
index be18b2b705bbcd..0ef8a142ee7ccb 100644
--- a/flang/include/flang/Runtime/CUDA/allocatable.h
+++ b/flang/include/flang/Runtime/CUDA/allocatable.h
@@ -16,23 +16,28 @@ namespace Fortran::runtime::cuda {
 
 extern "C" {
 
+/// Perform allocation of the descriptor.
+int RTDECL(CUFAllocatableAllocate)(Descriptor &, long stream = -1,
+    bool hasStat = false, const Descriptor *errMsg = nullptr,
+    const char *sourceFile = nullptr, int sourceLine = 0);
+
 /// Perform allocation of the descriptor with synchronization of it when
 /// necessary.
-int RTDECL(CUFAllocatableAllocate)(Descriptor &, bool hasStat = false,
-    const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
-    int sourceLine = 0);
+int RTDECL(CUFAllocatableAllocateSync)(Descriptor &, long stream = -1,
+    bool hasStat = false, const Descriptor *errMsg = nullptr,
+    const char *sourceFile = nullptr, int sourceLine = 0);
 
 /// Perform allocation of the descriptor without synchronization. Assign data
 /// from source.
 int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc,
-    const Descriptor &source, bool hasStat = false,
+    const Descriptor &source, long stream = -1, bool hasStat = false,
     const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
     int sourceLine = 0);
 
 /// Perform allocation of the descriptor with synchronization of it when
 /// necessary. Assign data from source.
 int RTDEF(CUFAllocatableAllocateSourceSync)(Descriptor &alloc,
-    const Descriptor &source, bool hasStat = false,
+    const Descriptor &source, long stream = -1, bool hasStat = false,
     const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
     int sourceLine = 0);
 
diff --git a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
index 3983336516db9e..5056c48c91cfaa 100644
--- a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
@@ -158,7 +158,7 @@ static mlir::LogicalResult convertOpToCall(OpTy op,
   mlir::Value sourceLine;
   if constexpr (std::is_same_v<OpTy, cuf::AllocateOp>)
     sourceLine = fir::factory::locationToLineNo(
-        builder, loc, op.getSource() ? fTy.getInput(5) : fTy.getInput(4));
+        builder, loc, op.getSource() ? fTy.getInput(6) : fTy.getInput(5));
   else
     sourceLine = fir::factory::locationToLineNo(builder, loc, fTy.getInput(4));
 
@@ -174,14 +174,23 @@ static mlir::LogicalResult convertOpToCall(OpTy op,
   }
   llvm::SmallVector<mlir::Value> args;
   if constexpr (std::is_same_v<OpTy, cuf::AllocateOp>) {
-    if (op.getSource())
+    if (op.getSource()) {
+      mlir::Value stream =
+          op.getStream()
+              ? op.getStream()
+              : builder.createIntegerConstant(loc, fTy.getInput(2), -1);
       args = fir::runtime::createArguments(builder, loc, fTy, op.getBox(),
-                                           op.getSource(), hasStat, errmsg,
-                                           sourceFile, sourceLine);
-    else
-      args =
-          fir::runtime::createArguments(builder, loc, fTy, op.getBox(), hasStat,
-                                        errmsg, sourceFile, sourceLine);
+                                           op.getSource(), stream, hasStat,
+                                           errmsg, sourceFile, sourceLine);
+    } else {
+      mlir::Value stream =
+          op.getStream()
+              ? op.getStream()
+              : builder.createIntegerConstant(loc, fTy.getInput(1), -1);
+      args = fir::runtime::createArguments(builder, loc, fTy, op.getBox(),
+                                           stream, hasStat, errmsg, sourceFile,
+                                           sourceLine);
+    }
   } else {
     args =
         fir::runtime::createArguments(builder, loc, fTy, op.getBox(), hasStat,
@@ -199,10 +208,6 @@ struct CUFAllocateOpConversion
   mlir::LogicalResult
   matchAndRewrite(cuf::AllocateOp op,
                   mlir::PatternRewriter &rewriter) const override {
-    // TODO: Allocation using different stream.
-    if (op.getStream())
-      return mlir::failure();
-
     // TODO: Pinned is a reference to a logical value that can be set to true
     // when pinned allocation succeed. This will require a new entry point.
     if (op.getPinned())
@@ -220,8 +225,9 @@ struct CUFAllocateOpConversion
         func = fir::runtime::getRuntimeFunc<mkRTKey(
             CUFAllocatableAllocateSourceSync)>(loc, builder);
       else
-        func = fir::runtime::getRuntimeFunc<mkRTKey(CUFAllocatableAllocate)>(
-            loc, builder);
+        func =
+            fir::runtime::getRuntimeFunc<mkRTKey(CUFAllocatableAllocateSync)>(
+                loc, builder);
       return convertOpToCall<cuf::AllocateOp>(op, rewriter, func);
     }
 
@@ -231,10 +237,7 @@ struct CUFAllocateOpConversion
           fir::runtime::getRuntimeFunc<mkRTKey(CUFAllocatableAllocateSource)>(
               loc, builder);
     else
-      // Allocation for local descriptor falls back on the standard runtime
-      // AllocatableAllocate as the dedicated allocator is set in the descriptor
-      // before the call.
-      func = fir::runtime::getRuntimeFunc<mkRTKey(AllocatableAllocate)>(
+      func = fir::runtime::getRuntimeFunc<mkRTKey(CUFAllocatableAllocate)>(
           loc, builder);
 
     return convertOpToCall<cuf::AllocateOp>(op, rewriter, func);
diff --git a/flang/runtime/CUDA/allocatable.cpp b/flang/runtime/CUDA/allocatable.cpp
index 9fed50c859a9cf..acd7bd3fe77d5e 100644
--- a/flang/runtime/CUDA/allocatable.cpp
+++ b/flang/runtime/CUDA/allocatable.cpp
@@ -22,18 +22,10 @@ namespace Fortran::runtime::cuda {
 extern "C" {
 RT_EXT_API_GROUP_BEGIN
 
-int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, bool hasStat,
+int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, long stream, bool hasStat,
     const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
-  if (desc.HasAddendum()) {
-    Terminator terminator{sourceFile, sourceLine};
-    // TODO: This require a bit more work to set the correct type descriptor
-    // address
-    terminator.Crash(
-        "not yet implemented: CUDA descriptor allocation with addendum");
-  }
-  // Perform the standard allocation.
-  int stat{RTNAME(AllocatableAllocate)(
-      desc, hasStat, errMsg, sourceFile, sourceLine)};
+  int stat{RTNAME(CUFAllocatableAllocate)(
+      desc, stream, hasStat, errMsg, sourceFile, sourceLine)};
 #ifndef RT_DEVICE_COMPILATION
   // Descriptor synchronization is only done when the allocation is done
   // from the host.
@@ -47,9 +39,25 @@ int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, bool hasStat,
   return stat;
 }
 
+int RTDEF(CUFAllocatableAllocateSync)(Descriptor &desc, long stream,
+    bool hasStat, const Descriptor *errMsg, const char *sourceFile,
+    int sourceLine) {
+  if (desc.HasAddendum()) {
+    Terminator terminator{sourceFile, sourceLine};
+    // TODO: This require a bit more work to set the correct type descriptor
+    // address
+    terminator.Crash(
+        "not yet implemented: CUDA descriptor allocation with addendum");
+  }
+  // Perform the standard allocation.
+  int stat{RTNAME(AllocatableAllocate)(
+      desc, hasStat, errMsg, sourceFile, sourceLine)};
+  return stat;
+}
+
 int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc,
-    const Descriptor &source, bool hasStat, const Descriptor *errMsg,
-    const char *sourceFile, int sourceLine) {
+    const Descriptor &source, long stream, bool hasStat,
+    const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
   int stat{RTNAME(AllocatableAllocate)(
       alloc, hasStat, errMsg, sourceFile, sourceLine)};
   if (stat == StatOk) {
@@ -61,8 +69,8 @@ int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc,
 }
 
 int RTDEF(CUFAllocatableAllocateSourceSync)(Descriptor &alloc,
-    const Descriptor &source, bool hasStat, const Descriptor *errMsg,
-    const char *sourceFile, int sourceLine) {
+    const Descriptor &source, long stream, bool hasStat,
+    const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
   int stat{RTNAME(AllocatableAllocate)(
       alloc, hasStat, errMsg, sourceFile, sourceLine)};
   if (stat == StatOk) {
diff --git a/flang/test/Fir/CUDA/cuda-allocate.fir b/flang/test/Fir/CUDA/cuda-allocate.fir
index 47d75b16b7a2d2..9b87c7546d1e9c 100644
--- a/flang/test/Fir/CUDA/cuda-allocate.fir
+++ b/flang/test/Fir/CUDA/cuda-allocate.fir
@@ -19,7 +19,7 @@ func.func @_QPsub1() {
 // CHECK: %[[DESC:.*]] = fir.convert %[[DESC_RT_CALL]] : (!fir.ref<!fir.box<none>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
 // CHECK: %[[DECL_DESC:.*]]:2 = hlfir.declare %[[DESC]] {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
 // CHECK: %[[BOX_NONE:.*]] = fir.convert %[[DECL_DESC]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
-// CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: %{{.*}} = fir.call @_FortranACUFAllocatableAllocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 
 // CHECK: %[[BOX_NONE:.*]] = fir.convert %[[DECL_DESC]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
 // CHECK: %{{.*}} = fir.call @_FortranAAllocatableDeallocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
@@ -47,7 +47,7 @@ func.func @_QPsub3() {
 // CHECK: %[[A:.*]]:2 = hlfir.declare %[[A_ADDR]] {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMmod1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
 
 // CHECK: %[[A_BOX:.*]] = fir.convert %[[A]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
-// CHECK: fir.call @_FortranACUFAllocatableAllocate(%[[A_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: fir.call @_FortranACUFAllocatableAllocateSync(%[[A_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 
 // CHECK: %[[A_BOX:.*]] = fir.convert %[[A]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
 // CHECK: fir.call @_FortranACUFAllocatableDeallocate(%[[A_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
@@ -87,7 +87,7 @@ func.func @_QPsub5() {
 }
 
 // CHECK-LABEL: func.func @_QPsub5()
-// CHECK: fir.call @_FortranAAllocatableAllocate({{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: fir.call @_FortranACUFAllocatableAllocate({{.*}}) : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 // CHECK: fir.call @_FortranAAllocatableDeallocate({{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 
 
@@ -118,7 +118,7 @@ func.func @_QQsub6() attributes {fir.bindc_name = "test"} {
 // CHECK: %[[B:.*]]:2 = hlfir.declare %[[B_ADDR]] {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMdataEb"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>)
 // CHECK: _FortranAAllocatableSetBounds
 // CHECK: %[[B_BOX:.*]] = fir.convert %[[B]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> !fir.ref<!fir.box<none>>
-// CHECK: fir.call @_FortranACUFAllocatableAllocate(%[[B_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: fir.call @_FortranACUFAllocatableAllocateSync(%[[B_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 
 
 func.func @_QPallocate_source() {
@@ -142,7 +142,7 @@ func.func @_QPallocate_source() {
 // CHECK: %[[SOURCE:.*]] = fir.load %[[DECL_HOST]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>
 // CHECK: %[[DEV_CONV:.*]] = fir.convert %[[DECL_DEV]] : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>) -> !fir.ref<!fir.box<none>>
 // CHECK: %[[SOURCE_CONV:.*]] = fir.convert %[[SOURCE]] : (!fir.box<!fir.heap<!fir.array<?x?xf32>>>) -> !fir.box<none>
-// CHECK: %{{.*}} = fir.call @_FortranACUFAllocatableAllocateSource(%[[DEV_CONV]], %[[SOURCE_CONV]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.box<none>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: %{{.*}} = fir.call @_FortranACUFAllocatableAllocateSource(%[[DEV_CONV]], %[[SOURCE_CONV]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.box<none>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 
 
 fir.global @_QMmod1Ea_d {data_attr = #cuf.cuda<device>} : !fir.box<!fir.heap<!fir.array<?x?xf32>>> {
@@ -165,4 +165,20 @@ func.func @_QMmod1Pallocate_source_global() {
 // CHECK-LABEL: func.func @_QMmod1Pallocate_source_global()
 // CHECK: fir.call @_FortranACUFAllocatableAllocateSourceSync
 
+func.func @_QQallocate_stream() {
+  %0 = cuf.alloc !fir.box<!fir.heap<!fir.array<?xi32>>> {bindc_name = "a", data_attr = #cuf.cuda<device>, uniq_name = "_QFEa"} -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+  %1 = fir.declare %0 {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFEa"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+  %2 = fir.alloca i64 {bindc_name = "stream1", uniq_name = "_QFEstream1"}
+  %3 = fir.declare %2 {uniq_name = "_QFEstream1"} : (!fir.ref<i64>) -> !fir.ref<i64>
+  %4 = fir.load %3 : !fir.ref<i64>
+  %5 = cuf.allocate %1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> stream(%4 : i64) {data_attr = #cuf.cuda<device>} -> i32
+  return
+}
+
+// CHECK-LABEL: func.func @_QQallocate_stream() 
+// CHECK: %[[STREAM_ALLOCA:.*]] = fir.alloca i64 {bindc_name = "stream1", uniq_name = "_QFEstream1"}
+// CHECK: %[[STREAM:.*]] = fir.declare %[[STREAM_ALLOCA]] {uniq_name = "_QFEstream1"} : (!fir.ref<i64>) -> !fir.ref<i64>
+// CHECK: %[[STREAM_LOAD:.*]] = fir.load %[[STREAM]] : !fir.ref<i64>
+// CHECK: fir.call @_FortranACUFAllocatableAllocate(%{{.*}}, %[[STREAM_LOAD]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+
 } // end of module

llvmbot · 2024-11-25T21:03:25Z

@llvm/pr-subscribers-flang-fir-hlfir

Author: Valentin Clement (バレンタインクレメン) (clementval)

Changes

Update the runtime entry points to accept a stream information
Update the conversion of cuf.allocate to pass correctly the stream information when present.

Note that the stream is not currently used in the runtime. This will be done in a separate patch as a design/solution needs to be down together with the allocators.

Full diff: https://github.com/llvm/llvm-project/pull/117631.diff

4 Files Affected:

(modified) flang/include/flang/Runtime/CUDA/allocatable.h (+10-5)
(modified) flang/lib/Optimizer/Transforms/CUFOpConversion.cpp (+21-18)
(modified) flang/runtime/CUDA/allocatable.cpp (+23-15)
(modified) flang/test/Fir/CUDA/cuda-allocate.fir (+21-5)

diff --git a/flang/include/flang/Runtime/CUDA/allocatable.h b/flang/include/flang/Runtime/CUDA/allocatable.h
index be18b2b705bbcd..0ef8a142ee7ccb 100644
--- a/flang/include/flang/Runtime/CUDA/allocatable.h
+++ b/flang/include/flang/Runtime/CUDA/allocatable.h
@@ -16,23 +16,28 @@ namespace Fortran::runtime::cuda {
 
 extern "C" {
 
+/// Perform allocation of the descriptor.
+int RTDECL(CUFAllocatableAllocate)(Descriptor &, long stream = -1,
+    bool hasStat = false, const Descriptor *errMsg = nullptr,
+    const char *sourceFile = nullptr, int sourceLine = 0);
+
 /// Perform allocation of the descriptor with synchronization of it when
 /// necessary.
-int RTDECL(CUFAllocatableAllocate)(Descriptor &, bool hasStat = false,
-    const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
-    int sourceLine = 0);
+int RTDECL(CUFAllocatableAllocateSync)(Descriptor &, long stream = -1,
+    bool hasStat = false, const Descriptor *errMsg = nullptr,
+    const char *sourceFile = nullptr, int sourceLine = 0);
 
 /// Perform allocation of the descriptor without synchronization. Assign data
 /// from source.
 int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc,
-    const Descriptor &source, bool hasStat = false,
+    const Descriptor &source, long stream = -1, bool hasStat = false,
     const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
     int sourceLine = 0);
 
 /// Perform allocation of the descriptor with synchronization of it when
 /// necessary. Assign data from source.
 int RTDEF(CUFAllocatableAllocateSourceSync)(Descriptor &alloc,
-    const Descriptor &source, bool hasStat = false,
+    const Descriptor &source, long stream = -1, bool hasStat = false,
     const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
     int sourceLine = 0);
 
diff --git a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
index 3983336516db9e..5056c48c91cfaa 100644
--- a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
@@ -158,7 +158,7 @@ static mlir::LogicalResult convertOpToCall(OpTy op,
   mlir::Value sourceLine;
   if constexpr (std::is_same_v<OpTy, cuf::AllocateOp>)
     sourceLine = fir::factory::locationToLineNo(
-        builder, loc, op.getSource() ? fTy.getInput(5) : fTy.getInput(4));
+        builder, loc, op.getSource() ? fTy.getInput(6) : fTy.getInput(5));
   else
     sourceLine = fir::factory::locationToLineNo(builder, loc, fTy.getInput(4));
 
@@ -174,14 +174,23 @@ static mlir::LogicalResult convertOpToCall(OpTy op,
   }
   llvm::SmallVector<mlir::Value> args;
   if constexpr (std::is_same_v<OpTy, cuf::AllocateOp>) {
-    if (op.getSource())
+    if (op.getSource()) {
+      mlir::Value stream =
+          op.getStream()
+              ? op.getStream()
+              : builder.createIntegerConstant(loc, fTy.getInput(2), -1);
       args = fir::runtime::createArguments(builder, loc, fTy, op.getBox(),
-                                           op.getSource(), hasStat, errmsg,
-                                           sourceFile, sourceLine);
-    else
-      args =
-          fir::runtime::createArguments(builder, loc, fTy, op.getBox(), hasStat,
-                                        errmsg, sourceFile, sourceLine);
+                                           op.getSource(), stream, hasStat,
+                                           errmsg, sourceFile, sourceLine);
+    } else {
+      mlir::Value stream =
+          op.getStream()
+              ? op.getStream()
+              : builder.createIntegerConstant(loc, fTy.getInput(1), -1);
+      args = fir::runtime::createArguments(builder, loc, fTy, op.getBox(),
+                                           stream, hasStat, errmsg, sourceFile,
+                                           sourceLine);
+    }
   } else {
     args =
         fir::runtime::createArguments(builder, loc, fTy, op.getBox(), hasStat,
@@ -199,10 +208,6 @@ struct CUFAllocateOpConversion
   mlir::LogicalResult
   matchAndRewrite(cuf::AllocateOp op,
                   mlir::PatternRewriter &rewriter) const override {
-    // TODO: Allocation using different stream.
-    if (op.getStream())
-      return mlir::failure();
-
     // TODO: Pinned is a reference to a logical value that can be set to true
     // when pinned allocation succeed. This will require a new entry point.
     if (op.getPinned())
@@ -220,8 +225,9 @@ struct CUFAllocateOpConversion
         func = fir::runtime::getRuntimeFunc<mkRTKey(
             CUFAllocatableAllocateSourceSync)>(loc, builder);
       else
-        func = fir::runtime::getRuntimeFunc<mkRTKey(CUFAllocatableAllocate)>(
-            loc, builder);
+        func =
+            fir::runtime::getRuntimeFunc<mkRTKey(CUFAllocatableAllocateSync)>(
+                loc, builder);
       return convertOpToCall<cuf::AllocateOp>(op, rewriter, func);
     }
 
@@ -231,10 +237,7 @@ struct CUFAllocateOpConversion
           fir::runtime::getRuntimeFunc<mkRTKey(CUFAllocatableAllocateSource)>(
               loc, builder);
     else
-      // Allocation for local descriptor falls back on the standard runtime
-      // AllocatableAllocate as the dedicated allocator is set in the descriptor
-      // before the call.
-      func = fir::runtime::getRuntimeFunc<mkRTKey(AllocatableAllocate)>(
+      func = fir::runtime::getRuntimeFunc<mkRTKey(CUFAllocatableAllocate)>(
           loc, builder);
 
     return convertOpToCall<cuf::AllocateOp>(op, rewriter, func);
diff --git a/flang/runtime/CUDA/allocatable.cpp b/flang/runtime/CUDA/allocatable.cpp
index 9fed50c859a9cf..acd7bd3fe77d5e 100644
--- a/flang/runtime/CUDA/allocatable.cpp
+++ b/flang/runtime/CUDA/allocatable.cpp
@@ -22,18 +22,10 @@ namespace Fortran::runtime::cuda {
 extern "C" {
 RT_EXT_API_GROUP_BEGIN
 
-int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, bool hasStat,
+int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, long stream, bool hasStat,
     const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
-  if (desc.HasAddendum()) {
-    Terminator terminator{sourceFile, sourceLine};
-    // TODO: This require a bit more work to set the correct type descriptor
-    // address
-    terminator.Crash(
-        "not yet implemented: CUDA descriptor allocation with addendum");
-  }
-  // Perform the standard allocation.
-  int stat{RTNAME(AllocatableAllocate)(
-      desc, hasStat, errMsg, sourceFile, sourceLine)};
+  int stat{RTNAME(CUFAllocatableAllocate)(
+      desc, stream, hasStat, errMsg, sourceFile, sourceLine)};
 #ifndef RT_DEVICE_COMPILATION
   // Descriptor synchronization is only done when the allocation is done
   // from the host.
@@ -47,9 +39,25 @@ int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, bool hasStat,
   return stat;
 }
 
+int RTDEF(CUFAllocatableAllocateSync)(Descriptor &desc, long stream,
+    bool hasStat, const Descriptor *errMsg, const char *sourceFile,
+    int sourceLine) {
+  if (desc.HasAddendum()) {
+    Terminator terminator{sourceFile, sourceLine};
+    // TODO: This require a bit more work to set the correct type descriptor
+    // address
+    terminator.Crash(
+        "not yet implemented: CUDA descriptor allocation with addendum");
+  }
+  // Perform the standard allocation.
+  int stat{RTNAME(AllocatableAllocate)(
+      desc, hasStat, errMsg, sourceFile, sourceLine)};
+  return stat;
+}
+
 int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc,
-    const Descriptor &source, bool hasStat, const Descriptor *errMsg,
-    const char *sourceFile, int sourceLine) {
+    const Descriptor &source, long stream, bool hasStat,
+    const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
   int stat{RTNAME(AllocatableAllocate)(
       alloc, hasStat, errMsg, sourceFile, sourceLine)};
   if (stat == StatOk) {
@@ -61,8 +69,8 @@ int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc,
 }
 
 int RTDEF(CUFAllocatableAllocateSourceSync)(Descriptor &alloc,
-    const Descriptor &source, bool hasStat, const Descriptor *errMsg,
-    const char *sourceFile, int sourceLine) {
+    const Descriptor &source, long stream, bool hasStat,
+    const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
   int stat{RTNAME(AllocatableAllocate)(
       alloc, hasStat, errMsg, sourceFile, sourceLine)};
   if (stat == StatOk) {
diff --git a/flang/test/Fir/CUDA/cuda-allocate.fir b/flang/test/Fir/CUDA/cuda-allocate.fir
index 47d75b16b7a2d2..9b87c7546d1e9c 100644
--- a/flang/test/Fir/CUDA/cuda-allocate.fir
+++ b/flang/test/Fir/CUDA/cuda-allocate.fir
@@ -19,7 +19,7 @@ func.func @_QPsub1() {
 // CHECK: %[[DESC:.*]] = fir.convert %[[DESC_RT_CALL]] : (!fir.ref<!fir.box<none>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
 // CHECK: %[[DECL_DESC:.*]]:2 = hlfir.declare %[[DESC]] {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
 // CHECK: %[[BOX_NONE:.*]] = fir.convert %[[DECL_DESC]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
-// CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: %{{.*}} = fir.call @_FortranACUFAllocatableAllocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 
 // CHECK: %[[BOX_NONE:.*]] = fir.convert %[[DECL_DESC]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
 // CHECK: %{{.*}} = fir.call @_FortranAAllocatableDeallocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
@@ -47,7 +47,7 @@ func.func @_QPsub3() {
 // CHECK: %[[A:.*]]:2 = hlfir.declare %[[A_ADDR]] {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMmod1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
 
 // CHECK: %[[A_BOX:.*]] = fir.convert %[[A]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
-// CHECK: fir.call @_FortranACUFAllocatableAllocate(%[[A_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: fir.call @_FortranACUFAllocatableAllocateSync(%[[A_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 
 // CHECK: %[[A_BOX:.*]] = fir.convert %[[A]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
 // CHECK: fir.call @_FortranACUFAllocatableDeallocate(%[[A_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
@@ -87,7 +87,7 @@ func.func @_QPsub5() {
 }
 
 // CHECK-LABEL: func.func @_QPsub5()
-// CHECK: fir.call @_FortranAAllocatableAllocate({{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: fir.call @_FortranACUFAllocatableAllocate({{.*}}) : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 // CHECK: fir.call @_FortranAAllocatableDeallocate({{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 
 
@@ -118,7 +118,7 @@ func.func @_QQsub6() attributes {fir.bindc_name = "test"} {
 // CHECK: %[[B:.*]]:2 = hlfir.declare %[[B_ADDR]] {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMdataEb"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>)
 // CHECK: _FortranAAllocatableSetBounds
 // CHECK: %[[B_BOX:.*]] = fir.convert %[[B]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> !fir.ref<!fir.box<none>>
-// CHECK: fir.call @_FortranACUFAllocatableAllocate(%[[B_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: fir.call @_FortranACUFAllocatableAllocateSync(%[[B_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 
 
 func.func @_QPallocate_source() {
@@ -142,7 +142,7 @@ func.func @_QPallocate_source() {
 // CHECK: %[[SOURCE:.*]] = fir.load %[[DECL_HOST]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>
 // CHECK: %[[DEV_CONV:.*]] = fir.convert %[[DECL_DEV]] : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>) -> !fir.ref<!fir.box<none>>
 // CHECK: %[[SOURCE_CONV:.*]] = fir.convert %[[SOURCE]] : (!fir.box<!fir.heap<!fir.array<?x?xf32>>>) -> !fir.box<none>
-// CHECK: %{{.*}} = fir.call @_FortranACUFAllocatableAllocateSource(%[[DEV_CONV]], %[[SOURCE_CONV]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.box<none>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: %{{.*}} = fir.call @_FortranACUFAllocatableAllocateSource(%[[DEV_CONV]], %[[SOURCE_CONV]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.box<none>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 
 
 fir.global @_QMmod1Ea_d {data_attr = #cuf.cuda<device>} : !fir.box<!fir.heap<!fir.array<?x?xf32>>> {
@@ -165,4 +165,20 @@ func.func @_QMmod1Pallocate_source_global() {
 // CHECK-LABEL: func.func @_QMmod1Pallocate_source_global()
 // CHECK: fir.call @_FortranACUFAllocatableAllocateSourceSync
 
+func.func @_QQallocate_stream() {
+  %0 = cuf.alloc !fir.box<!fir.heap<!fir.array<?xi32>>> {bindc_name = "a", data_attr = #cuf.cuda<device>, uniq_name = "_QFEa"} -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+  %1 = fir.declare %0 {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFEa"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+  %2 = fir.alloca i64 {bindc_name = "stream1", uniq_name = "_QFEstream1"}
+  %3 = fir.declare %2 {uniq_name = "_QFEstream1"} : (!fir.ref<i64>) -> !fir.ref<i64>
+  %4 = fir.load %3 : !fir.ref<i64>
+  %5 = cuf.allocate %1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> stream(%4 : i64) {data_attr = #cuf.cuda<device>} -> i32
+  return
+}
+
+// CHECK-LABEL: func.func @_QQallocate_stream() 
+// CHECK: %[[STREAM_ALLOCA:.*]] = fir.alloca i64 {bindc_name = "stream1", uniq_name = "_QFEstream1"}
+// CHECK: %[[STREAM:.*]] = fir.declare %[[STREAM_ALLOCA]] {uniq_name = "_QFEstream1"} : (!fir.ref<i64>) -> !fir.ref<i64>
+// CHECK: %[[STREAM_LOAD:.*]] = fir.load %[[STREAM]] : !fir.ref<i64>
+// CHECK: fir.call @_FortranACUFAllocatableAllocate(%{{.*}}, %[[STREAM_LOAD]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+
 } // end of module

flang/include/flang/Runtime/CUDA/allocatable.h

flang/runtime/CUDA/allocatable.cpp

[flang][cuda] cuf.allocate: Carry over stream to the runtime call

a7409db

clementval requested review from wangzpgi and Renaud-K November 25, 2024 21:02

llvmbot added flang:runtime flang Flang issues not falling into any other category flang:fir-hlfir labels Nov 25, 2024

Renaud-K reviewed Nov 25, 2024

View reviewed changes

flang/include/flang/Runtime/CUDA/allocatable.h Outdated Show resolved Hide resolved

Renaud-K reviewed Nov 25, 2024

View reviewed changes

flang/runtime/CUDA/allocatable.cpp Outdated Show resolved Hide resolved

clementval commented Nov 25, 2024

View reviewed changes

flang/runtime/CUDA/allocatable.cpp Outdated Show resolved Hide resolved

clementval added 2 commits November 25, 2024 14:32

Call CUFAllocatableAllocate with stream

61405b4

Update stream type to i64 and update calls to use stream

5cff96c

Renaud-K approved these changes Nov 25, 2024

View reviewed changes

clementval merged commit eb5cda4 into llvm:main Nov 26, 2024
8 checks passed

clementval deleted the cuf_allocatable_stream branch November 26, 2024 04:46

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

[flang][cuda] cuf.allocate: Carry over stream to the runtime call #117631

[flang][cuda] cuf.allocate: Carry over stream to the runtime call #117631

Uh oh!

clementval commented Nov 25, 2024

Uh oh!

llvmbot commented Nov 25, 2024

Uh oh!

llvmbot commented Nov 25, 2024

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

[flang][cuda] cuf.allocate: Carry over stream to the runtime call #117631

[flang][cuda] cuf.allocate: Carry over stream to the runtime call #117631

Uh oh!

Conversation

clementval commented Nov 25, 2024

Uh oh!

llvmbot commented Nov 25, 2024

Uh oh!

llvmbot commented Nov 25, 2024

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!