Skip to content

Commit 27e458c

Browse files
authored
[flang][cuda] Distinguish constant fir.global from globals with a #cuf.cuda<constant> attribute (llvm#118912)
1. In `CufOpConversion` `isDeviceGlobal` was renamed `isRegisteredGlobal` and moved to the common file. `isRegisteredGlobal` excludes constant `fir.global` operation from registration. This is to avoid calls to `_FortranACUFGetDeviceAddress` on globals which do not have any symbols in the runtime. This was done for `_FortranACUFRegisterVariable` in llvm#118582, but also needs to be done here after llvm#118591 2. `CufDeviceGlobal` no longer adds the `#cuf.cuda<constant>` attribute to the constant global. As discussed in llvm#118582 a module variable with the #cuf.cuda<constant> attribute is not a compile time constant. Yet, the compile time constant also needs to be copied into the GPU module. The candidates for copy to the GPU modules are - the globals needing regsitrations regardless of their uses in device code (they can be referred to in host code as well) - the compile time constant when used in device code 3. The registration of "constant" module device variables ( #cuf.cuda<constant>) can be restored in `CufAddConstructor`
1 parent d20731c commit 27e458c

File tree

8 files changed

+82
-56
lines changed

8 files changed

+82
-56
lines changed

flang/include/flang/Optimizer/Transforms/CUFCommon.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
#ifndef FORTRAN_OPTIMIZER_TRANSFORMS_CUFCOMMON_H_
1010
#define FORTRAN_OPTIMIZER_TRANSFORMS_CUFCOMMON_H_
1111

12+
#include "flang/Optimizer/Dialect/FIROps.h"
1213
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
1314
#include "mlir/IR/BuiltinOps.h"
1415

@@ -21,6 +22,7 @@ mlir::gpu::GPUModuleOp getOrCreateGPUModule(mlir::ModuleOp mod,
2122
mlir::SymbolTable &symTab);
2223

2324
bool isInCUDADeviceContext(mlir::Operation *op);
25+
bool isRegisteredDeviceGlobal(fir::GlobalOp op);
2426

2527
} // namespace cuf
2628

flang/lib/Optimizer/Transforms/CUFAddConstructor.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,8 @@ struct CUFAddConstructor
106106

107107
mlir::func::FuncOp func;
108108
switch (attr.getValue()) {
109-
case cuf::DataAttribute::Device: {
109+
case cuf::DataAttribute::Device:
110+
case cuf::DataAttribute::Constant: {
110111
func = fir::runtime::getRuntimeFunc<mkRTKey(CUFRegisterVariable)>(
111112
loc, builder);
112113
auto fTy = func.getFunctionType();

flang/lib/Optimizer/Transforms/CUFCommon.cpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,3 +43,14 @@ bool cuf::isInCUDADeviceContext(mlir::Operation *op) {
4343
}
4444
return false;
4545
}
46+
47+
bool cuf::isRegisteredDeviceGlobal(fir::GlobalOp op) {
48+
if (op.getConstant())
49+
return false;
50+
auto attr = op.getDataAttr();
51+
if (attr && (*attr == cuf::DataAttribute::Device ||
52+
*attr == cuf::DataAttribute::Managed ||
53+
*attr == cuf::DataAttribute::Constant))
54+
return true;
55+
return false;
56+
}

flang/lib/Optimizer/Transforms/CUFDeviceGlobal.cpp

Lines changed: 26 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
#include "mlir/IR/SymbolTable.h"
1919
#include "mlir/Pass/Pass.h"
2020
#include "mlir/Transforms/DialectConversion.h"
21+
#include "llvm/ADT/DenseSet.h"
2122

2223
namespace fir {
2324
#define GEN_PASS_DEF_CUFDEVICEGLOBAL
@@ -27,36 +28,30 @@ namespace fir {
2728
namespace {
2829

2930
static void processAddrOfOp(fir::AddrOfOp addrOfOp,
30-
mlir::SymbolTable &symbolTable, bool onlyConstant) {
31+
mlir::SymbolTable &symbolTable,
32+
llvm::DenseSet<fir::GlobalOp> &candidates) {
3133
if (auto globalOp = symbolTable.lookup<fir::GlobalOp>(
3234
addrOfOp.getSymbol().getRootReference().getValue())) {
33-
bool isCandidate{(onlyConstant ? globalOp.getConstant() : true) &&
34-
!globalOp.getDataAttr()};
35-
if (isCandidate)
36-
globalOp.setDataAttrAttr(cuf::DataAttributeAttr::get(
37-
addrOfOp.getContext(), globalOp.getConstant()
38-
? cuf::DataAttribute::Constant
39-
: cuf::DataAttribute::Device));
35+
// TO DO: limit candidates to non-scalars. Scalars appear to have been
36+
// folded in already.
37+
if (globalOp.getConstant()) {
38+
candidates.insert(globalOp);
39+
}
4040
}
4141
}
4242

43-
static void prepareImplicitDeviceGlobals(mlir::func::FuncOp funcOp,
44-
mlir::SymbolTable &symbolTable,
45-
bool onlyConstant = true) {
43+
static void
44+
prepareImplicitDeviceGlobals(mlir::func::FuncOp funcOp,
45+
mlir::SymbolTable &symbolTable,
46+
llvm::DenseSet<fir::GlobalOp> &candidates) {
47+
4648
auto cudaProcAttr{
4749
funcOp->getAttrOfType<cuf::ProcAttributeAttr>(cuf::getProcAttrName())};
48-
if (!cudaProcAttr || cudaProcAttr.getValue() == cuf::ProcAttribute::Host) {
49-
// Look for globlas in CUF KERNEL DO operations.
50-
for (auto cufKernelOp : funcOp.getBody().getOps<cuf::KernelOp>()) {
51-
cufKernelOp.walk([&](fir::AddrOfOp addrOfOp) {
52-
processAddrOfOp(addrOfOp, symbolTable, onlyConstant);
53-
});
54-
}
55-
return;
50+
if (cudaProcAttr && cudaProcAttr.getValue() != cuf::ProcAttribute::Host) {
51+
funcOp.walk([&](fir::AddrOfOp addrOfOp) {
52+
processAddrOfOp(addrOfOp, symbolTable, candidates);
53+
});
5654
}
57-
funcOp.walk([&](fir::AddrOfOp addrOfOp) {
58-
processAddrOfOp(addrOfOp, symbolTable, onlyConstant);
59-
});
6055
}
6156

6257
class CUFDeviceGlobal : public fir::impl::CUFDeviceGlobalBase<CUFDeviceGlobal> {
@@ -67,9 +62,10 @@ class CUFDeviceGlobal : public fir::impl::CUFDeviceGlobalBase<CUFDeviceGlobal> {
6762
if (!mod)
6863
return signalPassFailure();
6964

65+
llvm::DenseSet<fir::GlobalOp> candidates;
7066
mlir::SymbolTable symTable(mod);
7167
mod.walk([&](mlir::func::FuncOp funcOp) {
72-
prepareImplicitDeviceGlobals(funcOp, symTable);
68+
prepareImplicitDeviceGlobals(funcOp, symTable, candidates);
7369
return mlir::WalkResult::advance();
7470
});
7571

@@ -80,22 +76,15 @@ class CUFDeviceGlobal : public fir::impl::CUFDeviceGlobalBase<CUFDeviceGlobal> {
8076
return signalPassFailure();
8177
mlir::SymbolTable gpuSymTable(gpuMod);
8278
for (auto globalOp : mod.getOps<fir::GlobalOp>()) {
83-
auto attr = globalOp.getDataAttrAttr();
84-
if (!attr)
85-
continue;
86-
switch (attr.getValue()) {
87-
case cuf::DataAttribute::Device:
88-
case cuf::DataAttribute::Constant:
89-
case cuf::DataAttribute::Managed: {
90-
auto globalName{globalOp.getSymbol().getValue()};
91-
if (gpuSymTable.lookup<fir::GlobalOp>(globalName)) {
92-
break;
93-
}
94-
gpuSymTable.insert(globalOp->clone());
95-
} break;
96-
default:
79+
if (cuf::isRegisteredDeviceGlobal(globalOp))
80+
candidates.insert(globalOp);
81+
}
82+
for (auto globalOp : candidates) {
83+
auto globalName{globalOp.getSymbol().getValue()};
84+
if (gpuSymTable.lookup<fir::GlobalOp>(globalName)) {
9785
break;
9886
}
87+
gpuSymTable.insert(globalOp->clone());
9988
}
10089
}
10190
};

flang/lib/Optimizer/Transforms/CUFOpConversion.cpp

Lines changed: 2 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -81,15 +81,6 @@ static bool hasDoubleDescriptors(OpTy op) {
8181
return false;
8282
}
8383

84-
bool isDeviceGlobal(fir::GlobalOp op) {
85-
auto attr = op.getDataAttr();
86-
if (attr && (*attr == cuf::DataAttribute::Device ||
87-
*attr == cuf::DataAttribute::Managed ||
88-
*attr == cuf::DataAttribute::Constant))
89-
return true;
90-
return false;
91-
}
92-
9384
static mlir::Value createConvertOp(mlir::PatternRewriter &rewriter,
9485
mlir::Location loc, mlir::Type toTy,
9586
mlir::Value val) {
@@ -388,7 +379,7 @@ struct DeclareOpConversion : public mlir::OpRewritePattern<fir::DeclareOp> {
388379
if (auto addrOfOp = op.getMemref().getDefiningOp<fir::AddrOfOp>()) {
389380
if (auto global = symTab.lookup<fir::GlobalOp>(
390381
addrOfOp.getSymbol().getRootReference().getValue())) {
391-
if (isDeviceGlobal(global)) {
382+
if (cuf::isRegisteredDeviceGlobal(global)) {
392383
rewriter.setInsertionPointAfter(addrOfOp);
393384
auto mod = op->getParentOfType<mlir::ModuleOp>();
394385
fir::FirOpBuilder builder(rewriter, mod);
@@ -833,7 +824,7 @@ class CUFOpConversion : public fir::impl::CUFOpConversionBase<CUFOpConversion> {
833824
addrOfOp.getSymbol().getRootReference().getValue())) {
834825
if (mlir::isa<fir::BaseBoxType>(fir::unwrapRefType(global.getType())))
835826
return true;
836-
if (isDeviceGlobal(global))
827+
if (cuf::isRegisteredDeviceGlobal(global))
837828
return false;
838829
}
839830
}

flang/test/Fir/CUDA/cuda-constructor-2.f90

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr, dense<
3939
// CHECK-NOT: fir.call @_FortranACUFRegisterVariable
4040

4141
module attributes {dlti.dl_spec = #dlti.dl_spec<i8 = dense<8> : vector<2xi64>, i16 = dense<16> : vector<2xi64>, i1 = dense<8> : vector<2xi64>, !llvm.ptr = dense<64> : vector<4xi64>, f80 = dense<128> : vector<2xi64>, i128 = dense<128> : vector<2xi64>, i64 = dense<64> : vector<2xi64>, !llvm.ptr<271> = dense<32> : vector<4xi64>, !llvm.ptr<272> = dense<64> : vector<4xi64>, f128 = dense<128> : vector<2xi64>, !llvm.ptr<270> = dense<32> : vector<4xi64>, f16 = dense<16> : vector<2xi64>, f64 = dense<64> : vector<2xi64>, i32 = dense<32> : vector<2xi64>, "dlti.stack_alignment" = 128 : i64, "dlti.endianness" = "little">, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.ident = "flang version 20.0.0 (https://github.com/llvm/llvm-project.git 3372303188df0f7f8ac26e7ab610cf8b0f716d42)", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
42-
fir.global @_QMiso_c_bindingECc_int {data_attr = #cuf.cuda<constant>} constant : i32
42+
fir.global @_QMiso_c_bindingECc_int constant : i32
4343

4444

4545
fir.type_info @_QM__fortran_builtinsT__builtin_c_ptr noinit nodestroy nofinal : !fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>

flang/test/Fir/CUDA/cuda-global-addr.mlir

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// RUN: fir-opt --cuf-convert %s | FileCheck %s
1+
// RUN: fir-opt --split-input-file --cuf-convert %s | FileCheck %s
22

33
module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>} {
44
fir.global @_QMmod1Eadev {data_attr = #cuf.cuda<device>} : !fir.array<10xi32> {
@@ -34,3 +34,33 @@ func.func @_QQmain() attributes {fir.bindc_name = "test"} {
3434
// CHECK: %[[ARRAY_COOR:.*]] = fir.array_coor %[[DECL]](%{{.*}}) %c4{{.*}} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, index) -> !fir.ref<i32>
3535
// CHECK: %[[ARRAY_COOR_PTR:.*]] = fir.convert %[[ARRAY_COOR]] : (!fir.ref<i32>) -> !fir.llvm_ptr<i8>
3636
// CHECK: fir.call @_FortranACUFDataTransferPtrPtr(%[[ARRAY_COOR_PTR]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.llvm_ptr<i8>, !fir.llvm_ptr<i8>, i64, i32, !fir.ref<i8>, i32) -> none
37+
38+
// -----
39+
40+
module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>} {
41+
42+
fir.global @_QMdevmodEdarray {data_attr = #cuf.cuda<device>} : !fir.box<!fir.heap<!fir.array<?xf32>>> {
43+
%c0 = arith.constant 0 : index
44+
%0 = fir.zero_bits !fir.heap<!fir.array<?xf32>>
45+
%1 = fir.shape %c0 : (index) -> !fir.shape<1>
46+
%2 = fir.embox %0(%1) {allocator_idx = 2 : i32} : (!fir.heap<!fir.array<?xf32>>, !fir.shape<1>) -> !fir.box<!fir.heap<!fir.array<?xf32>>>
47+
fir.has_value %2 : !fir.box<!fir.heap<!fir.array<?xf32>>>
48+
}
49+
func.func @_QQmain() attributes {fir.bindc_name = "arraysize"} {
50+
%0 = fir.address_of(@_QMiso_c_bindingECc_int) : !fir.ref<i32>
51+
%1 = fir.declare %0 {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QMiso_c_bindingECc_int"} : (!fir.ref<i32>) -> !fir.ref<i32>
52+
%2 = fir.address_of(@_QMdevmodEdarray) : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
53+
%3 = fir.declare %2 {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMdevmodEdarray"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
54+
%4 = fir.alloca i32 {bindc_name = "exp", uniq_name = "_QFEexp"}
55+
%5 = fir.declare %4 {uniq_name = "_QFEexp"} : (!fir.ref<i32>) -> !fir.ref<i32>
56+
%6 = fir.alloca i32 {bindc_name = "hsize", uniq_name = "_QFEhsize"}
57+
%7 = fir.declare %6 {uniq_name = "_QFEhsize"} : (!fir.ref<i32>) -> !fir.ref<i32>
58+
return
59+
}
60+
fir.global @_QMiso_c_bindingECc_int constant : i32
61+
}
62+
63+
// We cannot call _FortranACUFGetDeviceAddress on a constant global.
65+
// There is no symbol for it and the call would result in an unresolved reference.
65+
// CHECK-NOT: fir.call {{.*}}GetDeviceAddress
66+

flang/test/Fir/CUDA/cuda-implicit-device-global.f90

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ // Test that global used in device function are flagged with the correct
2323
// CHECK: %[[GLOBAL:.*]] = fir.address_of(@_QQcl[[SYMBOL:.*]]) : !fir.ref<!fir.char<1,32>>
2424
// CHECK: %[[CONV:.*]] = fir.convert %[[GLOBAL]] : (!fir.ref<!fir.char<1,32>>) -> !fir.ref<i8>
2525
// CHECK: fir.call @_FortranAioBeginExternalListOutput(%{{.*}}, %[[CONV]], %{{.*}}) fastmath<contract> : (i32, !fir.ref<i8>, i32) -> !fir.ref<i8>
26-
// CHECK: fir.global linkonce @_QQcl[[SYMBOL]] {data_attr = #cuf.cuda<constant>} constant : !fir.char<1,32>
26+
// CHECK: fir.global linkonce @_QQcl[[SYMBOL]] constant : !fir.char<1,32>
2727

2828
// CHECK-LABEL: gpu.module @cuda_device_mod
2929
// CHECK: fir.global linkonce @_QQclX6995815537abaf90e86ce166af128f3a
@@ -99,10 +99,11 @@ // Test that global used in device function are flagged with the correct
9999
fir.has_value %0 : !fir.char<1,11>
100100
}
101101

102-
// CHECK: fir.global linkonce @_QQclX5465737420504153534544 {data_attr = #cuf.cuda<constant>} constant : !fir.char<1,11>
102+
// Checking that a constant fir.global that is only used in host code is not copied over to the device
103+
// CHECK: fir.global linkonce @_QQclX5465737420504153534544 constant : !fir.char<1,11>
103104

104105
// CHECK-LABEL: gpu.module @cuda_device_mod
105-
// CHECK: fir.global linkonce @_QQclX5465737420504153534544 {data_attr = #cuf.cuda<constant>} constant
106+
// CHECK-NOT: fir.global linkonce @_QQclX5465737420504153534544
106107

107108
// -----
108109

@@ -140,7 +141,8 @@ // Test that global used in device function are flagged with the correct
140141
}
141142
func.func private @_FortranAioEndIoStatement(!fir.ref<i8>) -> i32 attributes {fir.io, fir.runtime}
142143

143-
// CHECK: fir.global linkonce @_QQclX5465737420504153534544 {data_attr = #cuf.cuda<constant>} constant : !fir.char<1,11>
144+
// Checking that a constant fir.global that is used in device code is copied over to the device
145+
// CHECK: fir.global linkonce @_QQclX5465737420504153534544 constant : !fir.char<1,11>
144146

145147
// CHECK-LABEL: gpu.module @cuda_device_mod
146-
// CHECK: fir.global linkonce @_QQclX5465737420504153534544 {data_attr = #cuf.cuda<constant>} constant
148+
// CHECK: fir.global linkonce @_QQclX5465737420504153534544 constant

0 commit comments

Comments
 (0)