Skip to content

[flang][cuda] Distinguish constant fir.global from globals with a #cuf.cuda<constant> attribute #118912

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Dec 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions flang/include/flang/Optimizer/Transforms/CUFCommon.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#ifndef FORTRAN_OPTIMIZER_TRANSFORMS_CUFCOMMON_H_
#define FORTRAN_OPTIMIZER_TRANSFORMS_CUFCOMMON_H_

#include "flang/Optimizer/Dialect/FIROps.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/IR/BuiltinOps.h"

Expand All @@ -21,6 +22,7 @@ mlir::gpu::GPUModuleOp getOrCreateGPUModule(mlir::ModuleOp mod,
mlir::SymbolTable &symTab);

bool isInCUDADeviceContext(mlir::Operation *op);
bool isRegisteredDeviceGlobal(fir::GlobalOp op);

} // namespace cuf

Expand Down
3 changes: 2 additions & 1 deletion flang/lib/Optimizer/Transforms/CUFAddConstructor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,8 @@ struct CUFAddConstructor

mlir::func::FuncOp func;
switch (attr.getValue()) {
case cuf::DataAttribute::Device: {
case cuf::DataAttribute::Device:
case cuf::DataAttribute::Constant: {
func = fir::runtime::getRuntimeFunc<mkRTKey(CUFRegisterVariable)>(
loc, builder);
auto fTy = func.getFunctionType();
Expand Down
11 changes: 11 additions & 0 deletions flang/lib/Optimizer/Transforms/CUFCommon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,3 +43,14 @@ bool cuf::isInCUDADeviceContext(mlir::Operation *op) {
}
return false;
}

/// Returns true when \p op is a global that must be registered with the CUDA
/// runtime: it carries a device, managed, or constant CUF data attribute and
/// is not a compile-time constant global (constant globals have no host symbol
/// to register against).
bool cuf::isRegisteredDeviceGlobal(fir::GlobalOp op) {
  // Compile-time constant fir.global ops are never registered.
  if (op.getConstant())
    return false;
  if (auto dataAttr = op.getDataAttr())
    return *dataAttr == cuf::DataAttribute::Device ||
           *dataAttr == cuf::DataAttribute::Managed ||
           *dataAttr == cuf::DataAttribute::Constant;
  return false;
}
63 changes: 26 additions & 37 deletions flang/lib/Optimizer/Transforms/CUFDeviceGlobal.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include "mlir/IR/SymbolTable.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/DialectConversion.h"
#include "llvm/ADT/DenseSet.h"

namespace fir {
#define GEN_PASS_DEF_CUFDEVICEGLOBAL
Expand All @@ -27,36 +28,30 @@ namespace fir {
namespace {

static void processAddrOfOp(fir::AddrOfOp addrOfOp,
mlir::SymbolTable &symbolTable, bool onlyConstant) {
mlir::SymbolTable &symbolTable,
llvm::DenseSet<fir::GlobalOp> &candidates) {
if (auto globalOp = symbolTable.lookup<fir::GlobalOp>(
addrOfOp.getSymbol().getRootReference().getValue())) {
bool isCandidate{(onlyConstant ? globalOp.getConstant() : true) &&
!globalOp.getDataAttr()};
if (isCandidate)
globalOp.setDataAttrAttr(cuf::DataAttributeAttr::get(
addrOfOp.getContext(), globalOp.getConstant()
? cuf::DataAttribute::Constant
: cuf::DataAttribute::Device));
    // TODO: limit candidates to non-scalars. Scalars appear to have been
    // folded in already.
if (globalOp.getConstant()) {
candidates.insert(globalOp);
}
}
}

static void prepareImplicitDeviceGlobals(mlir::func::FuncOp funcOp,
mlir::SymbolTable &symbolTable,
bool onlyConstant = true) {
static void
prepareImplicitDeviceGlobals(mlir::func::FuncOp funcOp,
mlir::SymbolTable &symbolTable,
llvm::DenseSet<fir::GlobalOp> &candidates) {

auto cudaProcAttr{
funcOp->getAttrOfType<cuf::ProcAttributeAttr>(cuf::getProcAttrName())};
if (!cudaProcAttr || cudaProcAttr.getValue() == cuf::ProcAttribute::Host) {
    // Look for globals in CUF KERNEL DO operations.
for (auto cufKernelOp : funcOp.getBody().getOps<cuf::KernelOp>()) {
cufKernelOp.walk([&](fir::AddrOfOp addrOfOp) {
processAddrOfOp(addrOfOp, symbolTable, onlyConstant);
});
}
return;
if (cudaProcAttr && cudaProcAttr.getValue() != cuf::ProcAttribute::Host) {
funcOp.walk([&](fir::AddrOfOp addrOfOp) {
processAddrOfOp(addrOfOp, symbolTable, candidates);
});
}
funcOp.walk([&](fir::AddrOfOp addrOfOp) {
processAddrOfOp(addrOfOp, symbolTable, onlyConstant);
});
}

class CUFDeviceGlobal : public fir::impl::CUFDeviceGlobalBase<CUFDeviceGlobal> {
Expand All @@ -67,9 +62,10 @@ class CUFDeviceGlobal : public fir::impl::CUFDeviceGlobalBase<CUFDeviceGlobal> {
if (!mod)
return signalPassFailure();

llvm::DenseSet<fir::GlobalOp> candidates;
mlir::SymbolTable symTable(mod);
mod.walk([&](mlir::func::FuncOp funcOp) {
prepareImplicitDeviceGlobals(funcOp, symTable);
prepareImplicitDeviceGlobals(funcOp, symTable, candidates);
return mlir::WalkResult::advance();
});

Expand All @@ -80,22 +76,15 @@ class CUFDeviceGlobal : public fir::impl::CUFDeviceGlobalBase<CUFDeviceGlobal> {
return signalPassFailure();
mlir::SymbolTable gpuSymTable(gpuMod);
for (auto globalOp : mod.getOps<fir::GlobalOp>()) {
auto attr = globalOp.getDataAttrAttr();
if (!attr)
continue;
switch (attr.getValue()) {
case cuf::DataAttribute::Device:
case cuf::DataAttribute::Constant:
case cuf::DataAttribute::Managed: {
auto globalName{globalOp.getSymbol().getValue()};
if (gpuSymTable.lookup<fir::GlobalOp>(globalName)) {
break;
}
gpuSymTable.insert(globalOp->clone());
} break;
default:
if (cuf::isRegisteredDeviceGlobal(globalOp))
candidates.insert(globalOp);
}
for (auto globalOp : candidates) {
auto globalName{globalOp.getSymbol().getValue()};
if (gpuSymTable.lookup<fir::GlobalOp>(globalName)) {
break;
}
gpuSymTable.insert(globalOp->clone());
}
}
};
Expand Down
13 changes: 2 additions & 11 deletions flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -81,15 +81,6 @@ static bool hasDoubleDescriptors(OpTy op) {
return false;
}

bool isDeviceGlobal(fir::GlobalOp op) {
auto attr = op.getDataAttr();
if (attr && (*attr == cuf::DataAttribute::Device ||
*attr == cuf::DataAttribute::Managed ||
*attr == cuf::DataAttribute::Constant))
return true;
return false;
}

static mlir::Value createConvertOp(mlir::PatternRewriter &rewriter,
mlir::Location loc, mlir::Type toTy,
mlir::Value val) {
Expand Down Expand Up @@ -388,7 +379,7 @@ struct DeclareOpConversion : public mlir::OpRewritePattern<fir::DeclareOp> {
if (auto addrOfOp = op.getMemref().getDefiningOp<fir::AddrOfOp>()) {
if (auto global = symTab.lookup<fir::GlobalOp>(
addrOfOp.getSymbol().getRootReference().getValue())) {
if (isDeviceGlobal(global)) {
if (cuf::isRegisteredDeviceGlobal(global)) {
rewriter.setInsertionPointAfter(addrOfOp);
auto mod = op->getParentOfType<mlir::ModuleOp>();
fir::FirOpBuilder builder(rewriter, mod);
Expand Down Expand Up @@ -833,7 +824,7 @@ class CUFOpConversion : public fir::impl::CUFOpConversionBase<CUFOpConversion> {
addrOfOp.getSymbol().getRootReference().getValue())) {
if (mlir::isa<fir::BaseBoxType>(fir::unwrapRefType(global.getType())))
return true;
if (isDeviceGlobal(global))
if (cuf::isRegisteredDeviceGlobal(global))
return false;
}
}
Expand Down
2 changes: 1 addition & 1 deletion flang/test/Fir/CUDA/cuda-constructor-2.f90
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr, dense<
// CHECK-NOT: fir.call @_FortranACUFRegisterVariable

module attributes {dlti.dl_spec = #dlti.dl_spec<i8 = dense<8> : vector<2xi64>, i16 = dense<16> : vector<2xi64>, i1 = dense<8> : vector<2xi64>, !llvm.ptr = dense<64> : vector<4xi64>, f80 = dense<128> : vector<2xi64>, i128 = dense<128> : vector<2xi64>, i64 = dense<64> : vector<2xi64>, !llvm.ptr<271> = dense<32> : vector<4xi64>, !llvm.ptr<272> = dense<64> : vector<4xi64>, f128 = dense<128> : vector<2xi64>, !llvm.ptr<270> = dense<32> : vector<4xi64>, f16 = dense<16> : vector<2xi64>, f64 = dense<64> : vector<2xi64>, i32 = dense<32> : vector<2xi64>, "dlti.stack_alignment" = 128 : i64, "dlti.endianness" = "little">, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.ident = "flang version 20.0.0 (https://github.com/llvm/llvm-project.git 3372303188df0f7f8ac26e7ab610cf8b0f716d42)", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
fir.global @_QMiso_c_bindingECc_int {data_attr = #cuf.cuda<constant>} constant : i32
fir.global @_QMiso_c_bindingECc_int constant : i32


fir.type_info @_QM__fortran_builtinsT__builtin_c_ptr noinit nodestroy nofinal : !fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>
Expand Down
32 changes: 31 additions & 1 deletion flang/test/Fir/CUDA/cuda-global-addr.mlir
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// RUN: fir-opt --cuf-convert %s | FileCheck %s
// RUN: fir-opt --split-input-file --cuf-convert %s | FileCheck %s

module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>} {
fir.global @_QMmod1Eadev {data_attr = #cuf.cuda<device>} : !fir.array<10xi32> {
Expand Down Expand Up @@ -34,3 +34,33 @@ func.func @_QQmain() attributes {fir.bindc_name = "test"} {
// CHECK: %[[ARRAY_COOR:.*]] = fir.array_coor %[[DECL]](%{{.*}}) %c4{{.*}} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, index) -> !fir.ref<i32>
// CHECK: %[[ARRAY_COOR_PTR:.*]] = fir.convert %[[ARRAY_COOR]] : (!fir.ref<i32>) -> !fir.llvm_ptr<i8>
// CHECK: fir.call @_FortranACUFDataTransferPtrPtr(%[[ARRAY_COOR_PTR]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.llvm_ptr<i8>, !fir.llvm_ptr<i8>, i64, i32, !fir.ref<i8>, i32) -> none

// -----

module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>} {

fir.global @_QMdevmodEdarray {data_attr = #cuf.cuda<device>} : !fir.box<!fir.heap<!fir.array<?xf32>>> {
%c0 = arith.constant 0 : index
%0 = fir.zero_bits !fir.heap<!fir.array<?xf32>>
%1 = fir.shape %c0 : (index) -> !fir.shape<1>
%2 = fir.embox %0(%1) {allocator_idx = 2 : i32} : (!fir.heap<!fir.array<?xf32>>, !fir.shape<1>) -> !fir.box<!fir.heap<!fir.array<?xf32>>>
fir.has_value %2 : !fir.box<!fir.heap<!fir.array<?xf32>>>
}
func.func @_QQmain() attributes {fir.bindc_name = "arraysize"} {
%0 = fir.address_of(@_QMiso_c_bindingECc_int) : !fir.ref<i32>
%1 = fir.declare %0 {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QMiso_c_bindingECc_int"} : (!fir.ref<i32>) -> !fir.ref<i32>
%2 = fir.address_of(@_QMdevmodEdarray) : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
%3 = fir.declare %2 {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMdevmodEdarray"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
%4 = fir.alloca i32 {bindc_name = "exp", uniq_name = "_QFEexp"}
%5 = fir.declare %4 {uniq_name = "_QFEexp"} : (!fir.ref<i32>) -> !fir.ref<i32>
%6 = fir.alloca i32 {bindc_name = "hsize", uniq_name = "_QFEhsize"}
%7 = fir.declare %6 {uniq_name = "_QFEhsize"} : (!fir.ref<i32>) -> !fir.ref<i32>
return
}
fir.global @_QMiso_c_bindingECc_int constant : i32
}

// We cannot call _FortranACUFGetDeviceAddress on a constant global.
// There is no symbol for it and the call would result into an unresolved reference.
// CHECK-NOT: fir.call {{.*}}GetDeviceAddress

12 changes: 7 additions & 5 deletions flang/test/Fir/CUDA/cuda-implicit-device-global.f90
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ // Test that global used in device function are flagged with the correct
// CHECK: %[[GLOBAL:.*]] = fir.address_of(@_QQcl[[SYMBOL:.*]]) : !fir.ref<!fir.char<1,32>>
// CHECK: %[[CONV:.*]] = fir.convert %[[GLOBAL]] : (!fir.ref<!fir.char<1,32>>) -> !fir.ref<i8>
// CHECK: fir.call @_FortranAioBeginExternalListOutput(%{{.*}}, %[[CONV]], %{{.*}}) fastmath<contract> : (i32, !fir.ref<i8>, i32) -> !fir.ref<i8>
// CHECK: fir.global linkonce @_QQcl[[SYMBOL]] {data_attr = #cuf.cuda<constant>} constant : !fir.char<1,32>
// CHECK: fir.global linkonce @_QQcl[[SYMBOL]] constant : !fir.char<1,32>

// CHECK-LABEL: gpu.module @cuda_device_mod
// CHECK: fir.global linkonce @_QQclX6995815537abaf90e86ce166af128f3a
Expand Down Expand Up @@ -99,10 +99,11 @@ // Test that global used in device function are flagged with the correct
fir.has_value %0 : !fir.char<1,11>
}

// CHECK: fir.global linkonce @_QQclX5465737420504153534544 {data_attr = #cuf.cuda<constant>} constant : !fir.char<1,11>
// Checking that a constant fir.global that is only used in host code is not copied over to the device
// CHECK: fir.global linkonce @_QQclX5465737420504153534544 constant : !fir.char<1,11>

// CHECK-LABEL: gpu.module @cuda_device_mod
// CHECK: fir.global linkonce @_QQclX5465737420504153534544 {data_attr = #cuf.cuda<constant>} constant
// CHECK-NOT: fir.global linkonce @_QQclX5465737420504153534544

// -----

Expand Down Expand Up @@ -140,7 +141,8 @@ // Test that global used in device function are flagged with the correct
}
func.func private @_FortranAioEndIoStatement(!fir.ref<i8>) -> i32 attributes {fir.io, fir.runtime}

// CHECK: fir.global linkonce @_QQclX5465737420504153534544 {data_attr = #cuf.cuda<constant>} constant : !fir.char<1,11>
// Checking that a constant fir.global that is used in device code is copied over to the device
// CHECK: fir.global linkonce @_QQclX5465737420504153534544 constant : !fir.char<1,11>

// CHECK-LABEL: gpu.module @cuda_device_mod
// CHECK: fir.global linkonce @_QQclX5465737420504153534544 {data_attr = #cuf.cuda<constant>} constant
// CHECK: fir.global linkonce @_QQclX5465737420504153534544 constant
Loading