[flang][cuda] Allocate local descriptor in managed memory #102060

clementval · 2024-08-05T21:00:37Z

This patch adds entry points to the runtime so that descriptors can be allocated in managed memory. These entry points currently only call CUFAllocManaged and CUFFreeManaged, but they could become more complicated in the future.

cuf.alloc and cuf.free operations on local descriptors are converted into runtime calls.

llvmbot · 2024-08-05T21:01:10Z

@llvm/pr-subscribers-flang-fir-hlfir

@llvm/pr-subscribers-flang-runtime

Author: Valentin Clement (バレンタインクレメン) (clementval)

Changes

This patch adds entry points to the runtime so that descriptors can be allocated in managed memory. These entry points currently only call CUFAllocManaged and CUFFreeManaged, but they could become more complicated in the future.

cuf.alloc and cuf.free operations on local descriptors are converted into runtime calls.

Full diff: https://github.com/llvm/llvm-project/pull/102060.diff

6 Files Affected:

(added) flang/include/flang/Runtime/CUDA/descriptor.h (+30)
(modified) flang/lib/Optimizer/Transforms/CufOpConversion.cpp (+103-8)
(modified) flang/runtime/CUDA/CMakeLists.txt (+1)
(added) flang/runtime/CUDA/descriptor.cpp (+28)
(modified) flang/test/Fir/CUDA/cuda-allocate.fir (+10-1)
(modified) flang/unittests/Runtime/CUDA/AllocatorCUF.cpp (+14)

diff --git a/flang/include/flang/Runtime/CUDA/descriptor.h b/flang/include/flang/Runtime/CUDA/descriptor.h
new file mode 100644
index 0000000000000..6b8b4c555a728
--- /dev/null
+++ b/flang/include/flang/Runtime/CUDA/descriptor.h
@@ -0,0 +1,30 @@
+//===-- include/flang/Runtime/CUDA/descriptor.h -----------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef FORTRAN_RUNTIME_CUDA_DESCRIPTOR_H_
+#define FORTRAN_RUNTIME_CUDA_DESCRIPTOR_H_
+
+#include "flang/Runtime/descriptor.h"
+#include "flang/Runtime/entry-names.h"
+#include <cstddef>
+
+namespace Fortran::runtime::cuf {
+
+extern "C" {
+
+// Allocate a descriptor in managed or unified memory.
+Descriptor *RTDECL(CUFAllocDesciptor)(
+    std::size_t, const char *sourceFile = nullptr, int sourceLine = 0);
+
+// Deallocate a descriptor allocated in managed or unified memory.
+void RTDECL(CUFFreeDesciptor)(
+    Descriptor *, const char *sourceFile = nullptr, int sourceLine = 0);
+
+} // extern "C"
+} // namespace Fortran::runtime::cuf
+#endif // FORTRAN_RUNTIME_CUDA_DESCRIPTOR_H_
diff --git a/flang/lib/Optimizer/Transforms/CufOpConversion.cpp b/flang/lib/Optimizer/Transforms/CufOpConversion.cpp
index bdeaaab9f9d1d..61c95843a3431 100644
--- a/flang/lib/Optimizer/Transforms/CufOpConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/CufOpConversion.cpp
@@ -8,10 +8,13 @@
 
 #include "flang/Common/Fortran.h"
 #include "flang/Optimizer/Builder/Runtime/RTBuilder.h"
+#include "flang/Optimizer/CodeGen/TypeConverter.h"
 #include "flang/Optimizer/Dialect/CUF/CUFOps.h"
 #include "flang/Optimizer/Dialect/FIRDialect.h"
 #include "flang/Optimizer/Dialect/FIROps.h"
 #include "flang/Optimizer/HLFIR/HLFIROps.h"
+#include "flang/Optimizer/Support/DataLayout.h"
+#include "flang/Runtime/CUDA/descriptor.h"
 #include "flang/Runtime/allocatable.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Transforms/DialectConversion.h"
@@ -25,6 +28,7 @@ namespace fir {
 using namespace fir;
 using namespace mlir;
 using namespace Fortran::runtime;
+using namespace Fortran::runtime::cuf;
 
 namespace {
 
@@ -75,11 +79,11 @@ static mlir::LogicalResult convertOpToCall(OpTy op,
 }
 
 struct CufAllocateOpConversion
-    : public mlir::OpRewritePattern<cuf::AllocateOp> {
+    : public mlir::OpRewritePattern<::cuf::AllocateOp> {
   using OpRewritePattern::OpRewritePattern;
 
   mlir::LogicalResult
-  matchAndRewrite(cuf::AllocateOp op,
+  matchAndRewrite(::cuf::AllocateOp op,
                   mlir::PatternRewriter &rewriter) const override {
     // TODO: Allocation with source will need a new entry point in the runtime.
     if (op.getSource())
@@ -108,16 +112,16 @@ struct CufAllocateOpConversion
     mlir::func::FuncOp func =
         fir::runtime::getRuntimeFunc<mkRTKey(AllocatableAllocate)>(loc,
                                                                    builder);
-    return convertOpToCall<cuf::AllocateOp>(op, rewriter, func);
+    return convertOpToCall<::cuf::AllocateOp>(op, rewriter, func);
   }
 };
 
 struct CufDeallocateOpConversion
-    : public mlir::OpRewritePattern<cuf::DeallocateOp> {
+    : public mlir::OpRewritePattern<::cuf::DeallocateOp> {
   using OpRewritePattern::OpRewritePattern;
 
   mlir::LogicalResult
-  matchAndRewrite(cuf::DeallocateOp op,
+  matchAndRewrite(::cuf::DeallocateOp op,
                   mlir::PatternRewriter &rewriter) const override {
     // TODO: Allocation of module variable will need more work as the descriptor
     // will be duplicated and needs to be synced after allocation.
@@ -133,7 +137,84 @@ struct CufDeallocateOpConversion
     mlir::func::FuncOp func =
         fir::runtime::getRuntimeFunc<mkRTKey(AllocatableDeallocate)>(loc,
                                                                      builder);
-    return convertOpToCall<cuf::DeallocateOp>(op, rewriter, func);
+    return convertOpToCall<::cuf::DeallocateOp>(op, rewriter, func);
+  }
+};
+
+struct CufAllocOpConversion : public mlir::OpRewritePattern<::cuf::AllocOp> {
+  using OpRewritePattern::OpRewritePattern;
+
+  CufAllocOpConversion(mlir::MLIRContext *context, mlir::DataLayout *dl,
+                       fir::LLVMTypeConverter *typeConverter)
+      : OpRewritePattern(context), dl{dl}, typeConverter{typeConverter} {}
+
+  mlir::LogicalResult
+  matchAndRewrite(::cuf::AllocOp op,
+                  mlir::PatternRewriter &rewriter) const override {
+    // Only a cuf.alloc that allocates a descriptor (box) is rewritten.
+    auto descTy = mlir::dyn_cast_or_null<fir::BaseBoxType>(op.getInType());
+    if (!descTy)
+      return mlir::failure();
+
+    // Descriptor size in bytes, from the data layout of its struct form.
+    mlir::Type descStructTy = typeConverter->convertBoxTypeAsStruct(descTy);
+    const std::size_t descBytes = dl->getTypeSizeInBits(descStructTy) / 8;
+
+    mlir::Location loc = op.getLoc();
+    mlir::ModuleOp parentModule = op->getParentOfType<mlir::ModuleOp>();
+    fir::FirOpBuilder builder(rewriter, parentModule);
+    mlir::func::FuncOp allocFn =
+        fir::runtime::getRuntimeFunc<mkRTKey(CUFAllocDesciptor)>(loc, builder);
+    mlir::FunctionType allocFnTy = allocFn.getFunctionType();
+    mlir::Value fileName = fir::factory::locationToFilename(builder, loc);
+    mlir::Value lineNo =
+        fir::factory::locationToLineNo(builder, loc, allocFnTy.getInput(2));
+    mlir::Value byteCount =
+        builder.createIntegerConstant(loc, builder.getIndexType(), descBytes);
+    llvm::SmallVector<mlir::Value> operands{fir::runtime::createArguments(
+        builder, loc, allocFnTy, byteCount, fileName, lineNo)};
+
+    // Convert the call result to the type produced by the replaced op.
+    auto call = builder.create<fir::CallOp>(loc, allocFn, operands);
+    mlir::Value typedDesc =
+        builder.createConvert(loc, op.getResult().getType(), call.getResult(0));
+    rewriter.replaceOp(op, typedDesc);
+    return mlir::success();
+  }
+
+private:
+  mlir::DataLayout *dl;
+  fir::LLVMTypeConverter *typeConverter;
+};
+
+struct CufFreeOpConversion : public mlir::OpRewritePattern<::cuf::FreeOp> {
+  using OpRewritePattern::OpRewritePattern;
+
+  mlir::LogicalResult
+  matchAndRewrite(::cuf::FreeOp op,
+                  mlir::PatternRewriter &rewriter) const override {
+    // Only a cuf.free whose operand is a reference to a box is rewritten.
+    mlir::Type devPtrTy = op.getDevptr().getType();
+    auto boxRefTy = mlir::dyn_cast<fir::ReferenceType>(devPtrTy);
+    if (!boxRefTy || !mlir::isa<fir::BaseBoxType>(boxRefTy.getEleTy()))
+      return mlir::failure();
+
+    mlir::Location loc = op.getLoc();
+    mlir::ModuleOp parentModule = op->getParentOfType<mlir::ModuleOp>();
+    fir::FirOpBuilder builder(rewriter, parentModule);
+    mlir::func::FuncOp freeFn =
+        fir::runtime::getRuntimeFunc<mkRTKey(CUFFreeDesciptor)>(loc, builder);
+    mlir::FunctionType freeFnTy = freeFn.getFunctionType();
+
+    // Call the runtime with the freed reference and the op's source position.
+    mlir::Value fileName = fir::factory::locationToFilename(builder, loc);
+    mlir::Value lineNo =
+        fir::factory::locationToLineNo(builder, loc, freeFnTy.getInput(2));
+    llvm::SmallVector<mlir::Value> operands{fir::runtime::createArguments(
+        builder, loc, freeFnTy, op.getDevptr(), fileName, lineNo)};
+    builder.create<fir::CallOp>(loc, freeFn, operands);
+    rewriter.eraseOp(op);
+    return mlir::success();
   }
 };
 
@@ -143,8 +224,22 @@ class CufOpConversion : public fir::impl::CufOpConversionBase<CufOpConversion> {
     auto *ctx = &getContext();
     mlir::RewritePatternSet patterns(ctx);
     mlir::ConversionTarget target(*ctx);
-    target.addIllegalOp<cuf::AllocateOp, cuf::DeallocateOp>();
-    patterns.insert<CufAllocateOpConversion, CufDeallocateOpConversion>(ctx);
+
+    mlir::Operation *op = getOperation();
+    mlir::ModuleOp module = mlir::dyn_cast<mlir::ModuleOp>(op);
+    if (!module)
+      return signalPassFailure();
+
+    std::optional<mlir::DataLayout> dl =
+        fir::support::getOrSetDataLayout(module, /*allowDefaultLayout=*/false);
+    fir::LLVMTypeConverter typeConverter(module, /*applyTBAA=*/false,
+                                         /*forceUnifiedTBAATree=*/false, *dl);
+
+    target.addIllegalOp<::cuf::AllocOp, ::cuf::AllocateOp, ::cuf::DeallocateOp,
+                        ::cuf::FreeOp>();
+    patterns.insert<CufAllocOpConversion>(ctx, &*dl, &typeConverter);
+    patterns.insert<CufAllocateOpConversion, CufDeallocateOpConversion,
+                    CufFreeOpConversion>(ctx);
     if (mlir::failed(mlir::applyPartialConversion(getOperation(), target,
                                                   std::move(patterns)))) {
       mlir::emitError(mlir::UnknownLoc::get(ctx),
diff --git a/flang/runtime/CUDA/CMakeLists.txt b/flang/runtime/CUDA/CMakeLists.txt
index de1104f07ce6c..88243536139e4 100644
--- a/flang/runtime/CUDA/CMakeLists.txt
+++ b/flang/runtime/CUDA/CMakeLists.txt
@@ -11,6 +11,7 @@ find_library(CUDA_RUNTIME_LIBRARY cuda HINTS ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTOR
 
 add_flang_library(CufRuntime
   allocator.cpp
+  descriptor.cpp
 )
 target_link_libraries(CufRuntime
   PRIVATE
diff --git a/flang/runtime/CUDA/descriptor.cpp b/flang/runtime/CUDA/descriptor.cpp
new file mode 100644
index 0000000000000..bfac336b5b681
--- /dev/null
+++ b/flang/runtime/CUDA/descriptor.cpp
@@ -0,0 +1,28 @@
+//===-- runtime/CUDA/descriptor.cpp ---------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang/Runtime/CUDA/descriptor.h"
+#include "flang/Runtime/CUDA/allocator.h"
+
+namespace Fortran::runtime::cuf {
+extern "C" {
+RT_EXT_API_GROUP_BEGIN
+
+Descriptor *RTDEF(CUFAllocDesciptor)(
+    std::size_t sizeInBytes, const char *sourceFile, int sourceLine) {
+  return static_cast<Descriptor *>(CUFAllocManaged(sizeInBytes));
+}
+
+void RTDEF(CUFFreeDesciptor)(
+    Descriptor *desc, const char *sourceFile, int sourceLine) {
+  CUFFreeManaged(static_cast<void *>(desc)); // source position is unused
+}
+
+RT_EXT_API_GROUP_END
+}
+} // namespace Fortran::runtime::cuf
diff --git a/flang/test/Fir/CUDA/cuda-allocate.fir b/flang/test/Fir/CUDA/cuda-allocate.fir
index ab4a253f33dd8..1274d3921dd85 100644
--- a/flang/test/Fir/CUDA/cuda-allocate.fir
+++ b/flang/test/Fir/CUDA/cuda-allocate.fir
@@ -1,5 +1,7 @@
 // RUN: fir-opt --cuf-convert %s | FileCheck %s
 
+module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>} {
+
 func.func @_QPsub1() {
   %0 = cuf.alloc !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "a", data_attr = #cuf.cuda<device>, uniq_name = "_QFsub1Ea"} -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
   %4:2 = hlfir.declare %0 {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
@@ -8,14 +10,21 @@ func.func @_QPsub1() {
   %c0_i32 = arith.constant 0 : i32
   %9 = cuf.allocate %4#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {data_attr = #cuf.cuda<device>} -> i32
   %10 = cuf.deallocate %4#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {data_attr = #cuf.cuda<device>} -> i32
+  cuf.free %4#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {data_attr = #cuf.cuda<device>}
   return
 }
 
+
 // CHECK-LABEL: func.func @_QPsub1()
-// CHECK: %[[DESC:.*]] = cuf.alloc !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "a", data_attr = #cuf.cuda<device>, uniq_name = "_QFsub1Ea"} -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
+// CHECK: %[[DESC_RT_CALL:.*]] = fir.call @_FortranACUFAllocDesciptor(%{{.*}}, %{{.*}}, %{{.*}}) : (i64, !fir.ref<i8>, i32) -> !fir.ref<!fir.box<none>>
+// CHECK: %[[DESC:.*]] = fir.convert %[[DESC_RT_CALL]] : (!fir.ref<!fir.box<none>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
 // CHECK: %[[DECL_DESC:.*]]:2 = hlfir.declare %[[DESC]] {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
 // CHECK: %[[BOX_NONE:.*]] = fir.convert %[[DECL_DESC]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
 // CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 
 // CHECK: %[[BOX_NONE:.*]] = fir.convert %[[DECL_DESC]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
 // CHECK: %{{.*}} = fir.call @_FortranAAllocatableDeallocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: %[[BOX_NONE:.*]] = fir.convert %[[DECL_DESC]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
+// CHECK: fir.call @_FortranACUFFreeDesciptor(%[[BOX_NONE]], %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.ref<i8>, i32) -> none
+
+}
diff --git a/flang/unittests/Runtime/CUDA/AllocatorCUF.cpp b/flang/unittests/Runtime/CUDA/AllocatorCUF.cpp
index f372ae18c202f..2355c47778cca 100644
--- a/flang/unittests/Runtime/CUDA/AllocatorCUF.cpp
+++ b/flang/unittests/Runtime/CUDA/AllocatorCUF.cpp
@@ -10,12 +10,14 @@
 #include "../../../runtime/terminator.h"
 #include "flang/Common/Fortran.h"
 #include "flang/Runtime/CUDA/allocator.h"
+#include "flang/Runtime/CUDA/descriptor.h"
 #include "flang/Runtime/allocatable.h"
 #include "flang/Runtime/allocator-registry.h"
 
 #include "cuda.h"
 
 using namespace Fortran::runtime;
+using namespace Fortran::runtime::cuf;
 
 static OwningPtr<Descriptor> createAllocatable(
     Fortran::common::TypeCategory tc, int kind, int rank = 1) {
@@ -87,3 +89,15 @@ TEST(AllocatableCUFTest, SimplePinnedAllocate) {
   (*a, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__);
   EXPECT_FALSE(a->IsAllocated());
 }
+
+TEST(AllocatableCUFTest, DescriptorAllocationTest) {
+  Fortran::runtime::cuf::CUFRegisterAllocator();
+  ScopedContext ctx;
+  // Model descriptor for: REAL(4), DEVICE, ALLOCATABLE :: a(:)
+  OwningPtr<Descriptor> model{
+      createAllocatable(Fortran::common::TypeCategory::Real, 4)};
+  // Request storage of the model descriptor's size, then release it.
+  Descriptor *managedDesc{RTNAME(CUFAllocDesciptor)(model->SizeInBytes())};
+  EXPECT_TRUE(managedDesc != nullptr);
+  RTNAME(CUFFreeDesciptor)(managedDesc);
+}

flang/runtime/CUDA/descriptor.cpp

flang/include/flang/Runtime/CUDA/descriptor.h

vzakhari

Thank you, Valentin!

[flang][cuda] Add entry point to allocate descriptor in managed memory

c1c0921

clementval requested review from wangzpgi and vzakhari August 5, 2024 21:00

llvmbot added flang:runtime flang Flang issues not falling into any other category flang:fir-hlfir labels Aug 5, 2024

wangzpgi approved these changes Aug 5, 2024

View reviewed changes

vzakhari reviewed Aug 5, 2024

View reviewed changes

flang/runtime/CUDA/descriptor.cpp Outdated Show resolved Hide resolved

flang/include/flang/Runtime/CUDA/descriptor.h Outdated Show resolved Hide resolved

Switch to RTDEF

e7bd654

clementval commented Aug 6, 2024

View reviewed changes

flang/include/flang/Runtime/CUDA/descriptor.h Outdated Show resolved Hide resolved

Update flang/include/flang/Runtime/CUDA/descriptor.h

99a71a0

vzakhari approved these changes Aug 6, 2024

View reviewed changes

clementval merged commit a3ccaed into llvm:main Aug 6, 2024
5 of 6 checks passed

clementval deleted the cuf_alloc_descriptor branch August 6, 2024 18:28

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

[flang][cuda] Allocate local descriptor in managed memory #102060

[flang][cuda] Allocate local descriptor in managed memory #102060

Uh oh!

clementval commented Aug 5, 2024

Uh oh!

llvmbot commented Aug 5, 2024 •

edited

Loading

Uh oh!

Uh oh!

Uh oh!

Uh oh!

vzakhari left a comment

Uh oh!

Uh oh!

Uh oh!

[flang][cuda] Allocate local descriptor in managed memory #102060

[flang][cuda] Allocate local descriptor in managed memory #102060

Uh oh!

Conversation

clementval commented Aug 5, 2024

Uh oh!

llvmbot commented Aug 5, 2024 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

vzakhari left a comment

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Uh oh!

llvmbot commented Aug 5, 2024 •

edited

Loading