Skip to content

[flang][cuda] Allocate local descriptor in managed memory #102060

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Aug 6, 2024

Conversation

clementval
Copy link
Contributor

This patch adds entry point in the runtime to be able to allocate descriptors in managed memory. These entry points currently only call CUFAllocManaged and CUFFreeManaged but could be more complicated in the future.

cuf.alloc and cuf.free related to local descriptors are converted into runtime calls.

@clementval clementval requested review from wangzpgi and vzakhari August 5, 2024 21:00
@llvmbot llvmbot added flang:runtime flang Flang issues not falling into any other category flang:fir-hlfir labels Aug 5, 2024
@llvmbot
Copy link
Member

llvmbot commented Aug 5, 2024

@llvm/pr-subscribers-flang-fir-hlfir

@llvm/pr-subscribers-flang-runtime

Author: Valentin Clement (バレンタイン クレメン) (clementval)

Changes

This patch adds entry point in the runtime to be able to allocate descriptors in managed memory. These entry points currently only call CUFAllocManaged and CUFFreeManaged but could be more complicated in the future.

cuf.alloc and cuf.free related to local descriptors are converted into runtime calls.


Full diff: https://github.com/llvm/llvm-project/pull/102060.diff

6 Files Affected:

  • (added) flang/include/flang/Runtime/CUDA/descriptor.h (+30)
  • (modified) flang/lib/Optimizer/Transforms/CufOpConversion.cpp (+103-8)
  • (modified) flang/runtime/CUDA/CMakeLists.txt (+1)
  • (added) flang/runtime/CUDA/descriptor.cpp (+28)
  • (modified) flang/test/Fir/CUDA/cuda-allocate.fir (+10-1)
  • (modified) flang/unittests/Runtime/CUDA/AllocatorCUF.cpp (+14)
diff --git a/flang/include/flang/Runtime/CUDA/descriptor.h b/flang/include/flang/Runtime/CUDA/descriptor.h
new file mode 100644
index 0000000000000..6b8b4c555a728
--- /dev/null
+++ b/flang/include/flang/Runtime/CUDA/descriptor.h
@@ -0,0 +1,30 @@
+//===-- include/flang/Runtime/CUDA/descriptor.h -----------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef FORTRAN_RUNTIME_CUDA_DESCRIPTOR_H_
+#define FORTRAN_RUNTIME_CUDA_DESCRIPTOR_H_
+
+#include "flang/Runtime/descriptor.h"
+#include "flang/Runtime/entry-names.h"
+#include <cstddef>
+
+namespace Fortran::runtime::cuf {
+
+extern "C" {
+
+// Allocate a descriptor in managed or unified memory.
+Descriptor *RTDECL(CUFAllocDesciptor)(
+    std::size_t, const char *sourceFile = nullptr, int sourceLine = 0);
+
+// Deallocate a descriptor allocated in managed or unified memory.
+void RTDECL(CUFFreeDesciptor)(
+    Descriptor *, const char *sourceFile = nullptr, int sourceLine = 0);
+
+} // extern "C"
+} // namespace Fortran::runtime::cuf
+#endif // FORTRAN_RUNTIME_CUDA_DESCRIPTOR_H_
diff --git a/flang/lib/Optimizer/Transforms/CufOpConversion.cpp b/flang/lib/Optimizer/Transforms/CufOpConversion.cpp
index bdeaaab9f9d1d..61c95843a3431 100644
--- a/flang/lib/Optimizer/Transforms/CufOpConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/CufOpConversion.cpp
@@ -8,10 +8,13 @@
 
 #include "flang/Common/Fortran.h"
 #include "flang/Optimizer/Builder/Runtime/RTBuilder.h"
+#include "flang/Optimizer/CodeGen/TypeConverter.h"
 #include "flang/Optimizer/Dialect/CUF/CUFOps.h"
 #include "flang/Optimizer/Dialect/FIRDialect.h"
 #include "flang/Optimizer/Dialect/FIROps.h"
 #include "flang/Optimizer/HLFIR/HLFIROps.h"
+#include "flang/Optimizer/Support/DataLayout.h"
+#include "flang/Runtime/CUDA/descriptor.h"
 #include "flang/Runtime/allocatable.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Transforms/DialectConversion.h"
@@ -25,6 +28,7 @@ namespace fir {
 using namespace fir;
 using namespace mlir;
 using namespace Fortran::runtime;
+using namespace Fortran::runtime::cuf;
 
 namespace {
 
@@ -75,11 +79,11 @@ static mlir::LogicalResult convertOpToCall(OpTy op,
 }
 
 struct CufAllocateOpConversion
-    : public mlir::OpRewritePattern<cuf::AllocateOp> {
+    : public mlir::OpRewritePattern<::cuf::AllocateOp> {
   using OpRewritePattern::OpRewritePattern;
 
   mlir::LogicalResult
-  matchAndRewrite(cuf::AllocateOp op,
+  matchAndRewrite(::cuf::AllocateOp op,
                   mlir::PatternRewriter &rewriter) const override {
     // TODO: Allocation with source will need a new entry point in the runtime.
     if (op.getSource())
@@ -108,16 +112,16 @@ struct CufAllocateOpConversion
     mlir::func::FuncOp func =
         fir::runtime::getRuntimeFunc<mkRTKey(AllocatableAllocate)>(loc,
                                                                    builder);
-    return convertOpToCall<cuf::AllocateOp>(op, rewriter, func);
+    return convertOpToCall<::cuf::AllocateOp>(op, rewriter, func);
   }
 };
 
 struct CufDeallocateOpConversion
-    : public mlir::OpRewritePattern<cuf::DeallocateOp> {
+    : public mlir::OpRewritePattern<::cuf::DeallocateOp> {
   using OpRewritePattern::OpRewritePattern;
 
   mlir::LogicalResult
-  matchAndRewrite(cuf::DeallocateOp op,
+  matchAndRewrite(::cuf::DeallocateOp op,
                   mlir::PatternRewriter &rewriter) const override {
     // TODO: Allocation of module variable will need more work as the descriptor
     // will be duplicated and needs to be synced after allocation.
@@ -133,7 +137,84 @@ struct CufDeallocateOpConversion
     mlir::func::FuncOp func =
         fir::runtime::getRuntimeFunc<mkRTKey(AllocatableDeallocate)>(loc,
                                                                      builder);
-    return convertOpToCall<cuf::DeallocateOp>(op, rewriter, func);
+    return convertOpToCall<::cuf::DeallocateOp>(op, rewriter, func);
+  }
+};
+
+struct CufAllocOpConversion : public mlir::OpRewritePattern<::cuf::AllocOp> {
+  using OpRewritePattern::OpRewritePattern;
+
+  CufAllocOpConversion(mlir::MLIRContext *context, mlir::DataLayout *dl,
+                       fir::LLVMTypeConverter *typeConverter)
+      : OpRewritePattern(context), dl{dl}, typeConverter{typeConverter} {}
+
+  mlir::LogicalResult
+  matchAndRewrite(::cuf::AllocOp op,
+                  mlir::PatternRewriter &rewriter) const override {
+    auto boxTy = mlir::dyn_cast_or_null<fir::BaseBoxType>(op.getInType());
+
+    // Only convert cuf.alloc that allocates a descriptor.
+    if (!boxTy)
+      return failure();
+
+    auto mod = op->getParentOfType<mlir::ModuleOp>();
+    fir::FirOpBuilder builder(rewriter, mod);
+    mlir::Location loc = op.getLoc();
+    mlir::func::FuncOp func =
+        fir::runtime::getRuntimeFunc<mkRTKey(CUFAllocDesciptor)>(loc, builder);
+
+    auto fTy = func.getFunctionType();
+    mlir::Value sourceFile = fir::factory::locationToFilename(builder, loc);
+    mlir::Value sourceLine =
+        fir::factory::locationToLineNo(builder, loc, fTy.getInput(2));
+
+    mlir::Type structTy = typeConverter->convertBoxTypeAsStruct(boxTy);
+    std::size_t boxSize = dl->getTypeSizeInBits(structTy) / 8;
+    mlir::Value sizeInBytes =
+        builder.createIntegerConstant(loc, builder.getIndexType(), boxSize);
+
+    llvm::SmallVector<mlir::Value> args{fir::runtime::createArguments(
+        builder, loc, fTy, sizeInBytes, sourceFile, sourceLine)};
+    auto callOp = builder.create<fir::CallOp>(loc, func, args);
+    auto convOp = builder.createConvert(loc, op.getResult().getType(),
+                                        callOp.getResult(0));
+    rewriter.replaceOp(op, convOp);
+    return mlir::success();
+  }
+
+private:
+  mlir::DataLayout *dl;
+  fir::LLVMTypeConverter *typeConverter;
+};
+
+struct CufFreeOpConversion : public mlir::OpRewritePattern<::cuf::FreeOp> {
+  using OpRewritePattern::OpRewritePattern;
+
+  mlir::LogicalResult
+  matchAndRewrite(::cuf::FreeOp op,
+                  mlir::PatternRewriter &rewriter) const override {
+    // Only convert cuf.free on descriptor.
+    if (!mlir::isa<fir::ReferenceType>(op.getDevptr().getType()))
+      return failure();
+    auto refTy = mlir::dyn_cast<fir::ReferenceType>(op.getDevptr().getType());
+    if (!mlir::isa<fir::BaseBoxType>(refTy.getEleTy()))
+      return failure();
+
+    auto mod = op->getParentOfType<mlir::ModuleOp>();
+    fir::FirOpBuilder builder(rewriter, mod);
+    mlir::Location loc = op.getLoc();
+    mlir::func::FuncOp func =
+        fir::runtime::getRuntimeFunc<mkRTKey(CUFFreeDesciptor)>(loc, builder);
+
+    auto fTy = func.getFunctionType();
+    mlir::Value sourceFile = fir::factory::locationToFilename(builder, loc);
+    mlir::Value sourceLine =
+        fir::factory::locationToLineNo(builder, loc, fTy.getInput(2));
+    llvm::SmallVector<mlir::Value> args{fir::runtime::createArguments(
+        builder, loc, fTy, op.getDevptr(), sourceFile, sourceLine)};
+    builder.create<fir::CallOp>(loc, func, args);
+    rewriter.eraseOp(op);
+    return mlir::success();
   }
 };
 
@@ -143,8 +224,22 @@ class CufOpConversion : public fir::impl::CufOpConversionBase<CufOpConversion> {
     auto *ctx = &getContext();
     mlir::RewritePatternSet patterns(ctx);
     mlir::ConversionTarget target(*ctx);
-    target.addIllegalOp<cuf::AllocateOp, cuf::DeallocateOp>();
-    patterns.insert<CufAllocateOpConversion, CufDeallocateOpConversion>(ctx);
+
+    mlir::Operation *op = getOperation();
+    mlir::ModuleOp module = mlir::dyn_cast<mlir::ModuleOp>(op);
+    if (!module)
+      return signalPassFailure();
+
+    std::optional<mlir::DataLayout> dl =
+        fir::support::getOrSetDataLayout(module, /*allowDefaultLayout=*/false);
+    fir::LLVMTypeConverter typeConverter(module, /*applyTBAA=*/false,
+                                         /*forceUnifiedTBAATree=*/false, *dl);
+
+    target.addIllegalOp<::cuf::AllocOp, ::cuf::AllocateOp, ::cuf::DeallocateOp,
+                        ::cuf::FreeOp>();
+    patterns.insert<CufAllocOpConversion>(ctx, &*dl, &typeConverter);
+    patterns.insert<CufAllocateOpConversion, CufDeallocateOpConversion,
+                    CufFreeOpConversion>(ctx);
     if (mlir::failed(mlir::applyPartialConversion(getOperation(), target,
                                                   std::move(patterns)))) {
       mlir::emitError(mlir::UnknownLoc::get(ctx),
diff --git a/flang/runtime/CUDA/CMakeLists.txt b/flang/runtime/CUDA/CMakeLists.txt
index de1104f07ce6c..88243536139e4 100644
--- a/flang/runtime/CUDA/CMakeLists.txt
+++ b/flang/runtime/CUDA/CMakeLists.txt
@@ -11,6 +11,7 @@ find_library(CUDA_RUNTIME_LIBRARY cuda HINTS ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTOR
 
 add_flang_library(CufRuntime
   allocator.cpp
+  descriptor.cpp
 )
 target_link_libraries(CufRuntime
   PRIVATE
diff --git a/flang/runtime/CUDA/descriptor.cpp b/flang/runtime/CUDA/descriptor.cpp
new file mode 100644
index 0000000000000..bfac336b5b681
--- /dev/null
+++ b/flang/runtime/CUDA/descriptor.cpp
@@ -0,0 +1,28 @@
+//===-- runtime/CUDA/descriptor.cpp ---------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang/Runtime/CUDA/descriptor.h"
+#include "flang/Runtime/CUDA/allocator.h"
+
+namespace Fortran::runtime::cuf {
+extern "C" {
+RT_EXT_API_GROUP_BEGIN
+
+Descriptor *RTDEF(CUFAllocDesciptor)(
+    std::size_t sizeInBytes, const char *sourceFile, int sourceLine) {
+  return reinterpret_cast<Descriptor *>(CUFAllocManaged(sizeInBytes));
+}
+
+void RTDECL(CUFFreeDesciptor)(
+    Descriptor *desc, const char *sourceFile, int sourceLine) {
+  CUFFreeManaged(reinterpret_cast<void *>(desc));
+}
+
+RT_EXT_API_GROUP_END
+}
+} // namespace Fortran::runtime::cuf
diff --git a/flang/test/Fir/CUDA/cuda-allocate.fir b/flang/test/Fir/CUDA/cuda-allocate.fir
index ab4a253f33dd8..1274d3921dd85 100644
--- a/flang/test/Fir/CUDA/cuda-allocate.fir
+++ b/flang/test/Fir/CUDA/cuda-allocate.fir
@@ -1,5 +1,7 @@
 // RUN: fir-opt --cuf-convert %s | FileCheck %s
 
+module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>} {
+
 func.func @_QPsub1() {
   %0 = cuf.alloc !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "a", data_attr = #cuf.cuda<device>, uniq_name = "_QFsub1Ea"} -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
   %4:2 = hlfir.declare %0 {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
@@ -8,14 +10,21 @@ func.func @_QPsub1() {
   %c0_i32 = arith.constant 0 : i32
   %9 = cuf.allocate %4#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {data_attr = #cuf.cuda<device>} -> i32
   %10 = cuf.deallocate %4#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {data_attr = #cuf.cuda<device>} -> i32
+  cuf.free %4#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {data_attr = #cuf.cuda<device>}
   return
 }
 
+
 // CHECK-LABEL: func.func @_QPsub1()
-// CHECK: %[[DESC:.*]] = cuf.alloc !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "a", data_attr = #cuf.cuda<device>, uniq_name = "_QFsub1Ea"} -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
+// CHECK: %[[DESC_RT_CALL:.*]] = fir.call @_FortranACUFAllocDesciptor(%{{.*}}, %{{.*}}, %{{.*}}) : (i64, !fir.ref<i8>, i32) -> !fir.ref<!fir.box<none>>
+// CHECK: %[[DESC:.*]] = fir.convert %[[DESC_RT_CALL]] : (!fir.ref<!fir.box<none>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
 // CHECK: %[[DECL_DESC:.*]]:2 = hlfir.declare %[[DESC]] {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
 // CHECK: %[[BOX_NONE:.*]] = fir.convert %[[DECL_DESC]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
 // CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 
 // CHECK: %[[BOX_NONE:.*]] = fir.convert %[[DECL_DESC]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
 // CHECK: %{{.*}} = fir.call @_FortranAAllocatableDeallocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: %[[BOX_NONE:.*]] = fir.convert %[[DECL_DESC]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
+// CHECK: fir.call @_FortranACUFFreeDesciptor(%[[BOX_NONE]], %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.ref<i8>, i32) -> none
+
+}
diff --git a/flang/unittests/Runtime/CUDA/AllocatorCUF.cpp b/flang/unittests/Runtime/CUDA/AllocatorCUF.cpp
index f372ae18c202f..2355c47778cca 100644
--- a/flang/unittests/Runtime/CUDA/AllocatorCUF.cpp
+++ b/flang/unittests/Runtime/CUDA/AllocatorCUF.cpp
@@ -10,12 +10,14 @@
 #include "../../../runtime/terminator.h"
 #include "flang/Common/Fortran.h"
 #include "flang/Runtime/CUDA/allocator.h"
+#include "flang/Runtime/CUDA/descriptor.h"
 #include "flang/Runtime/allocatable.h"
 #include "flang/Runtime/allocator-registry.h"
 
 #include "cuda.h"
 
 using namespace Fortran::runtime;
+using namespace Fortran::runtime::cuf;
 
 static OwningPtr<Descriptor> createAllocatable(
     Fortran::common::TypeCategory tc, int kind, int rank = 1) {
@@ -87,3 +89,15 @@ TEST(AllocatableCUFTest, SimplePinnedAllocate) {
   (*a, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__);
   EXPECT_FALSE(a->IsAllocated());
 }
+
+TEST(AllocatableCUFTest, DescriptorAllocationTest) {
+  using Fortran::common::TypeCategory;
+  Fortran::runtime::cuf::CUFRegisterAllocator();
+  ScopedContext ctx;
+  // REAL(4), DEVICE, ALLOCATABLE :: a(:)
+  auto a{createAllocatable(TypeCategory::Real, 4)};
+  Descriptor *desc = nullptr;
+  desc = RTNAME(CUFAllocDesciptor)(a->SizeInBytes());
+  EXPECT_TRUE(desc != nullptr);
+  RTNAME(CUFFreeDesciptor)(desc);
+}

Copy link
Contributor

@vzakhari vzakhari left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you, Valentin!

@clementval clementval merged commit a3ccaed into llvm:main Aug 6, 2024
5 of 6 checks passed
@clementval clementval deleted the cuf_alloc_descriptor branch August 6, 2024 18:28
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
flang:fir-hlfir flang:runtime flang Flang issues not falling into any other category
Projects
None yet
Development

Successfully merging this pull request may close these issues.

4 participants