Skip to content

[flang][cuda] Allocate local descriptor in managed memory #102060

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Aug 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions flang/include/flang/Runtime/CUDA/descriptor.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
//===-- include/flang/Runtime/CUDA/descriptor.h -----------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef FORTRAN_RUNTIME_CUDA_DESCRIPTOR_H_
#define FORTRAN_RUNTIME_CUDA_DESCRIPTOR_H_

#include "flang/Runtime/descriptor.h"
#include "flang/Runtime/entry-names.h"
#include <cstddef>

namespace Fortran::runtime::cuf {

extern "C" {

// Allocate a descriptor in managed.
Descriptor *RTDECL(CUFAllocDesciptor)(
std::size_t, const char *sourceFile = nullptr, int sourceLine = 0);

// Deallocate a descriptor allocated in managed or unified memory.
void RTDECL(CUFFreeDesciptor)(
Descriptor *, const char *sourceFile = nullptr, int sourceLine = 0);

} // extern "C"
} // namespace Fortran::runtime::cuf
#endif // FORTRAN_RUNTIME_CUDA_DESCRIPTOR_H_
111 changes: 103 additions & 8 deletions flang/lib/Optimizer/Transforms/CufOpConversion.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,13 @@

#include "flang/Common/Fortran.h"
#include "flang/Optimizer/Builder/Runtime/RTBuilder.h"
#include "flang/Optimizer/CodeGen/TypeConverter.h"
#include "flang/Optimizer/Dialect/CUF/CUFOps.h"
#include "flang/Optimizer/Dialect/FIRDialect.h"
#include "flang/Optimizer/Dialect/FIROps.h"
#include "flang/Optimizer/HLFIR/HLFIROps.h"
#include "flang/Optimizer/Support/DataLayout.h"
#include "flang/Runtime/CUDA/descriptor.h"
#include "flang/Runtime/allocatable.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/DialectConversion.h"
Expand All @@ -25,6 +28,7 @@ namespace fir {
using namespace fir;
using namespace mlir;
using namespace Fortran::runtime;
using namespace Fortran::runtime::cuf;

namespace {

Expand Down Expand Up @@ -75,11 +79,11 @@ static mlir::LogicalResult convertOpToCall(OpTy op,
}

struct CufAllocateOpConversion
: public mlir::OpRewritePattern<cuf::AllocateOp> {
: public mlir::OpRewritePattern<::cuf::AllocateOp> {
using OpRewritePattern::OpRewritePattern;

mlir::LogicalResult
matchAndRewrite(cuf::AllocateOp op,
matchAndRewrite(::cuf::AllocateOp op,
mlir::PatternRewriter &rewriter) const override {
// TODO: Allocation with source will need a new entry point in the runtime.
if (op.getSource())
Expand Down Expand Up @@ -108,16 +112,16 @@ struct CufAllocateOpConversion
mlir::func::FuncOp func =
fir::runtime::getRuntimeFunc<mkRTKey(AllocatableAllocate)>(loc,
builder);
return convertOpToCall<cuf::AllocateOp>(op, rewriter, func);
return convertOpToCall<::cuf::AllocateOp>(op, rewriter, func);
}
};

struct CufDeallocateOpConversion
: public mlir::OpRewritePattern<cuf::DeallocateOp> {
: public mlir::OpRewritePattern<::cuf::DeallocateOp> {
using OpRewritePattern::OpRewritePattern;

mlir::LogicalResult
matchAndRewrite(cuf::DeallocateOp op,
matchAndRewrite(::cuf::DeallocateOp op,
mlir::PatternRewriter &rewriter) const override {
// TODO: Allocation of module variable will need more work as the descriptor
// will be duplicated and needs to be synced after allocation.
Expand All @@ -133,7 +137,84 @@ struct CufDeallocateOpConversion
mlir::func::FuncOp func =
fir::runtime::getRuntimeFunc<mkRTKey(AllocatableDeallocate)>(loc,
builder);
return convertOpToCall<cuf::DeallocateOp>(op, rewriter, func);
return convertOpToCall<::cuf::DeallocateOp>(op, rewriter, func);
}
};

struct CufAllocOpConversion : public mlir::OpRewritePattern<::cuf::AllocOp> {
using OpRewritePattern::OpRewritePattern;

CufAllocOpConversion(mlir::MLIRContext *context, mlir::DataLayout *dl,
fir::LLVMTypeConverter *typeConverter)
: OpRewritePattern(context), dl{dl}, typeConverter{typeConverter} {}

mlir::LogicalResult
matchAndRewrite(::cuf::AllocOp op,
mlir::PatternRewriter &rewriter) const override {
auto boxTy = mlir::dyn_cast_or_null<fir::BaseBoxType>(op.getInType());

// Only convert cuf.alloc that allocates a descriptor.
if (!boxTy)
return failure();

auto mod = op->getParentOfType<mlir::ModuleOp>();
fir::FirOpBuilder builder(rewriter, mod);
mlir::Location loc = op.getLoc();
mlir::func::FuncOp func =
fir::runtime::getRuntimeFunc<mkRTKey(CUFAllocDesciptor)>(loc, builder);

auto fTy = func.getFunctionType();
mlir::Value sourceFile = fir::factory::locationToFilename(builder, loc);
mlir::Value sourceLine =
fir::factory::locationToLineNo(builder, loc, fTy.getInput(2));

mlir::Type structTy = typeConverter->convertBoxTypeAsStruct(boxTy);
std::size_t boxSize = dl->getTypeSizeInBits(structTy) / 8;
mlir::Value sizeInBytes =
builder.createIntegerConstant(loc, builder.getIndexType(), boxSize);

llvm::SmallVector<mlir::Value> args{fir::runtime::createArguments(
builder, loc, fTy, sizeInBytes, sourceFile, sourceLine)};
auto callOp = builder.create<fir::CallOp>(loc, func, args);
auto convOp = builder.createConvert(loc, op.getResult().getType(),
callOp.getResult(0));
rewriter.replaceOp(op, convOp);
return mlir::success();
}

private:
mlir::DataLayout *dl;
fir::LLVMTypeConverter *typeConverter;
};

struct CufFreeOpConversion : public mlir::OpRewritePattern<::cuf::FreeOp> {
using OpRewritePattern::OpRewritePattern;

mlir::LogicalResult
matchAndRewrite(::cuf::FreeOp op,
mlir::PatternRewriter &rewriter) const override {
// Only convert cuf.free on descriptor.
if (!mlir::isa<fir::ReferenceType>(op.getDevptr().getType()))
return failure();
auto refTy = mlir::dyn_cast<fir::ReferenceType>(op.getDevptr().getType());
if (!mlir::isa<fir::BaseBoxType>(refTy.getEleTy()))
return failure();

auto mod = op->getParentOfType<mlir::ModuleOp>();
fir::FirOpBuilder builder(rewriter, mod);
mlir::Location loc = op.getLoc();
mlir::func::FuncOp func =
fir::runtime::getRuntimeFunc<mkRTKey(CUFFreeDesciptor)>(loc, builder);

auto fTy = func.getFunctionType();
mlir::Value sourceFile = fir::factory::locationToFilename(builder, loc);
mlir::Value sourceLine =
fir::factory::locationToLineNo(builder, loc, fTy.getInput(2));
llvm::SmallVector<mlir::Value> args{fir::runtime::createArguments(
builder, loc, fTy, op.getDevptr(), sourceFile, sourceLine)};
builder.create<fir::CallOp>(loc, func, args);
rewriter.eraseOp(op);
return mlir::success();
}
};

Expand All @@ -143,8 +224,22 @@ class CufOpConversion : public fir::impl::CufOpConversionBase<CufOpConversion> {
auto *ctx = &getContext();
mlir::RewritePatternSet patterns(ctx);
mlir::ConversionTarget target(*ctx);
target.addIllegalOp<cuf::AllocateOp, cuf::DeallocateOp>();
patterns.insert<CufAllocateOpConversion, CufDeallocateOpConversion>(ctx);

mlir::Operation *op = getOperation();
mlir::ModuleOp module = mlir::dyn_cast<mlir::ModuleOp>(op);
if (!module)
return signalPassFailure();

std::optional<mlir::DataLayout> dl =
fir::support::getOrSetDataLayout(module, /*allowDefaultLayout=*/false);
fir::LLVMTypeConverter typeConverter(module, /*applyTBAA=*/false,
/*forceUnifiedTBAATree=*/false, *dl);

target.addIllegalOp<::cuf::AllocOp, ::cuf::AllocateOp, ::cuf::DeallocateOp,
::cuf::FreeOp>();
patterns.insert<CufAllocOpConversion>(ctx, &*dl, &typeConverter);
patterns.insert<CufAllocateOpConversion, CufDeallocateOpConversion,
CufFreeOpConversion>(ctx);
if (mlir::failed(mlir::applyPartialConversion(getOperation(), target,
std::move(patterns)))) {
mlir::emitError(mlir::UnknownLoc::get(ctx),
Expand Down
1 change: 1 addition & 0 deletions flang/runtime/CUDA/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ find_library(CUDA_RUNTIME_LIBRARY cuda HINTS ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTOR

add_flang_library(CufRuntime
allocator.cpp
descriptor.cpp
)
target_link_libraries(CufRuntime
PRIVATE
Expand Down
28 changes: 28 additions & 0 deletions flang/runtime/CUDA/descriptor.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
//===-- runtime/CUDA/descriptor.cpp ---------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "flang/Runtime/CUDA/descriptor.h"
#include "flang/Runtime/CUDA/allocator.h"

namespace Fortran::runtime::cuf {
extern "C" {
RT_EXT_API_GROUP_BEGIN

Descriptor *RTDEF(CUFAllocDesciptor)(
std::size_t sizeInBytes, const char *sourceFile, int sourceLine) {
return reinterpret_cast<Descriptor *>(CUFAllocManaged(sizeInBytes));
}

void RTDEF(CUFFreeDesciptor)(
Descriptor *desc, const char *sourceFile, int sourceLine) {
CUFFreeManaged(reinterpret_cast<void *>(desc));
}

RT_EXT_API_GROUP_END
}
} // namespace Fortran::runtime::cuf
11 changes: 10 additions & 1 deletion flang/test/Fir/CUDA/cuda-allocate.fir
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
// RUN: fir-opt --cuf-convert %s | FileCheck %s

module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>} {

func.func @_QPsub1() {
%0 = cuf.alloc !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "a", data_attr = #cuf.cuda<device>, uniq_name = "_QFsub1Ea"} -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
%4:2 = hlfir.declare %0 {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
Expand All @@ -8,14 +10,21 @@ func.func @_QPsub1() {
%c0_i32 = arith.constant 0 : i32
%9 = cuf.allocate %4#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {data_attr = #cuf.cuda<device>} -> i32
%10 = cuf.deallocate %4#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {data_attr = #cuf.cuda<device>} -> i32
cuf.free %4#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {data_attr = #cuf.cuda<device>}
return
}


// CHECK-LABEL: func.func @_QPsub1()
// CHECK: %[[DESC:.*]] = cuf.alloc !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "a", data_attr = #cuf.cuda<device>, uniq_name = "_QFsub1Ea"} -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
// CHECK: %[[DESC_RT_CALL:.*]] = fir.call @_FortranACUFAllocDesciptor(%{{.*}}, %{{.*}}, %{{.*}}) : (i64, !fir.ref<i8>, i32) -> !fir.ref<!fir.box<none>>
// CHECK: %[[DESC:.*]] = fir.convert %[[DESC_RT_CALL]] : (!fir.ref<!fir.box<none>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
// CHECK: %[[DECL_DESC:.*]]:2 = hlfir.declare %[[DESC]] {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
// CHECK: %[[BOX_NONE:.*]] = fir.convert %[[DECL_DESC]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
// CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32

// CHECK: %[[BOX_NONE:.*]] = fir.convert %[[DECL_DESC]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
// CHECK: %{{.*}} = fir.call @_FortranAAllocatableDeallocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
// CHECK: %[[BOX_NONE:.*]] = fir.convert %[[DECL_DESC]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
// CHECK: fir.call @_FortranACUFFreeDesciptor(%[[BOX_NONE]], %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.ref<i8>, i32) -> none

}
14 changes: 14 additions & 0 deletions flang/unittests/Runtime/CUDA/AllocatorCUF.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,14 @@
#include "../../../runtime/terminator.h"
#include "flang/Common/Fortran.h"
#include "flang/Runtime/CUDA/allocator.h"
#include "flang/Runtime/CUDA/descriptor.h"
#include "flang/Runtime/allocatable.h"
#include "flang/Runtime/allocator-registry.h"

#include "cuda.h"

using namespace Fortran::runtime;
using namespace Fortran::runtime::cuf;

static OwningPtr<Descriptor> createAllocatable(
Fortran::common::TypeCategory tc, int kind, int rank = 1) {
Expand Down Expand Up @@ -87,3 +89,15 @@ TEST(AllocatableCUFTest, SimplePinnedAllocate) {
(*a, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__);
EXPECT_FALSE(a->IsAllocated());
}

TEST(AllocatableCUFTest, DescriptorAllocationTest) {
using Fortran::common::TypeCategory;
Fortran::runtime::cuf::CUFRegisterAllocator();
ScopedContext ctx;
// REAL(4), DEVICE, ALLOCATABLE :: a(:)
auto a{createAllocatable(TypeCategory::Real, 4)};
Descriptor *desc = nullptr;
desc = RTNAME(CUFAllocDesciptor)(a->SizeInBytes());
EXPECT_TRUE(desc != nullptr);
RTNAME(CUFFreeDesciptor)(desc);
}
Loading